Fix unwanted split at :innen + lc letter

Change-Id: I7ad7e410b391146cf6bbe076a153309738ef0841

commit: a5804ff22cb9c1fe17992c317222cf6b20b8acbb [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Feb 09 18:54:11 2026 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Mon Feb 09 18:54:11 2026 +0100
tree: 75e856d07ae0fbf7397aadce7dad5711b04147bc
parent: a2db4c19e82e2a2f241b7522106a62b526bd7b47 [diff]
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 0b9cbb8..d9b5c53 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex

@@ -834,6 +834,13 @@
 // These rules use lookahead to ensure the ending is NOT followed by a letter
 // (e.g., "Nutzer/in " is valid, but "Innenminister/Innenministerinnen" is two words)
 
+// Compound forms: Lehrer:innenfortbildung, Nutzer/innenfreundlichkeit
+// Each rule requires at least one further alphanumeric character ({ALPHANUM}+) directly after the gender ending and returns currentToken() for the complete match
+// RESTRICTION: only the lowercase endings ({GENDER_ENDING_LC}, {GENDER_ENDING_FRAU}) are used, to avoid merging capitalized words like "Innenminister/Innenministerinnen". NOTE(review): a second word that merely starts with lowercase "in" (e.g. "national/international") looks like it would be merged as well - confirm this is intended
+({WORD}({DASH}{WORD})*):{GENDER_ENDING_LC}{ALPHANUM}+         { return currentToken(); }
+({WORD}({DASH}{WORD})*){SLASH}-?{GENDER_ENDING_LC}{ALPHANUM}+ { return currentToken(); }
+({WORD}({DASH}{WORD})*{DASH})?{MANN_WORD}{SLASH}-?{GENDER_ENDING_FRAU}{ALPHANUM}+ { return currentToken(); }
+
 // Colon forms: Nutzer:in, Nutzer:innen, Kosovo-Albaner:innen
 // Match pattern + one extra char, check if it's a letter in semantic action
 ({WORD}({DASH}{WORD})*):{GENDER_ENDING}.                   { return genderNounColonToken(); }

diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro
index 6d62414..07a6812 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro

@@ -30,8 +30,13 @@
 // Note: This is now only used with MANN_WORD, not in general GENDER_ENDING
 GENDER_ENDING_FRAU = (frau(en)?)
 
+
+// Lowercase-only gender endings for the compound rules: a capitalized "Innen..." (as in "Innenminister") must not be taken for a gender ending
+GENDER_ENDING_IN_LC = (in|innen)
+
 // General gender endings (only -in/-innen forms for colon, slash, parenthetical)
 GENDER_ENDING = ({GENDER_ENDING_IN})
+GENDER_ENDING_LC = ({GENDER_ENDING_IN_LC})
 
 // Words ending in "mann" (with non-empty prefix) for Kaufmann/frau pattern
 // Matches: Kaufmann, Geschäftsmann, etc. but NOT just "mann"

diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de_old.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de_old.jflex-macro
index 771b6d9..51503b3 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de_old.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de_old.jflex-macro

@@ -20,6 +20,8 @@
 // Gender-sensitive endings DISABLED for traditional German orthography
 // Use Unicode non-characters (U+FDD0-FDD2) so the rules never match
 GENDER_ENDING = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_LC = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_IN_LC = ("\uFDD0\uFDD1\uFDD2")
 GENDER_ENDING_FRAU = ("\uFDD0\uFDD1\uFDD2")
 GENDER_SHORT_SUFFIX = ("\uFDD0\uFDD1\uFDD2")
 MANN_WORD = ("\uFDD0\uFDD1\uFDD2")

diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro
index ccdf46e..e41f5ab 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro

@@ -10,6 +10,8 @@
 // Gender-sensitive endings not used for English (German-only feature)
 // Use Unicode non-characters (U+FDD0-FDD2) so the rules never match
 GENDER_ENDING = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_LC = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_IN_LC = ("\uFDD0\uFDD1\uFDD2")
 GENDER_ENDING_FRAU = ("\uFDD0\uFDD1\uFDD2")
 GENDER_SHORT_SUFFIX = ("\uFDD0\uFDD1\uFDD2")
 MANN_WORD = ("\uFDD0\uFDD1\uFDD2")

diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro
index fdca84a..ecb6b7c 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro

@@ -12,6 +12,8 @@
 // Gender-sensitive endings not used for French (German-only feature)
 // Use Unicode non-characters (U+FDD0-FDD2) so the rules never match
 GENDER_ENDING = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_LC = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_IN_LC = ("\uFDD0\uFDD1\uFDD2")
 GENDER_ENDING_FRAU = ("\uFDD0\uFDD1\uFDD2")
 GENDER_SHORT_SUFFIX = ("\uFDD0\uFDD1\uFDD2")
 MANN_WORD = ("\uFDD0\uFDD1\uFDD2")

diff --git a/src/test/resources/tokenizer/dontsplit.txt b/src/test/resources/tokenizer/dontsplit.txt
index 1997f6e..dc1f514 100644
--- a/src/test/resources/tokenizer/dontsplit.txt
+++ b/src/test/resources/tokenizer/dontsplit.txt

@@ -39,3 +39,4 @@
 jede_r
 jede/r
 eine(n)
+Lehrer:innenfortbildung
commit	a5804ff22cb9c1fe17992c317222cf6b20b8acbb	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Feb 09 18:54:11 2026 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Feb 09 18:54:11 2026 +0100
tree	75e856d07ae0fbf7397aadce7dad5711b04147bc
parent	a2db4c19e82e2a2f241b7522106a62b526bd7b47 [diff]