Add lookahead to noun gender endings to prevent false matches Forms like Innenminister/Innenministerinnen were incorrectly being parsed as a single gender-marked token (Innenminister/Innen + ministerinnen). Now gender noun endings (:in, :innen, /in, /innen, (in), /frau, etc.) use a lookahead character to verify the ending is NOT followed by a letter - ensuring they are at a word boundary. This correctly tokenizes: - Innenminister/Innenministerinnen → Innenminister / Innenministerinnen - Nutzer/in → Nutzer/in (valid gender form, unchanged) - Kaufmann/frau → Kaufmann/frau (valid gender form, unchanged) Adds test cases for both split and don't-split scenarios. Change-Id: I509a6f12ec1bb5678b1d8e8a063d0164498de5de

commit: 2173013a1ef865a231026ca4fd81f041732c769f [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Sat Feb 07 12:29:13 2026 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sat Feb 07 12:32:10 2026 +0100
tree: 63c9029b5d691871a7141ac73fd843103df0c45e
parent: 9ef5dec549c076b6d4ef028f862ed6954133a2be [diff]
diff --git a/Readme.md b/Readme.md
index ada404c..90535cd 100644
--- a/Readme.md
+++ b/Readme.md

@@ -30,7 +30,7 @@
 ```
 #### Note
 Because of the large table of abbreviations, the conversion from the jflex source to java,
-i.e. the calculation of the DFA, takes about 5 to 30 minutes, depending on your hardware,
+i.e. the calculation of the DFA, takes about 20 to 40 minutes, depending on your hardware,
 and requires a lot of heap space.
 
 For development, you can disable the large abbreviation lists to speed up the build:

diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 6219a01..342c668 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex

@@ -432,6 +432,90 @@
     final Span genderShortSuffixAtEOF() {
         return currentToken();
     }
+
+    /**
+     * Handle noun gender ending with colon separator.
+     * Pattern: {WORD}:{GENDER_ENDING}{lookahead}
+     * If lookahead is a letter, return just WORD, pushing back the rest.
+     * If lookahead is not a letter, return WORD:ending.
+     * The lookahead is read and pushed back as a whole code point.
+     */
+    final Span genderNounColonToken() {
+        String matched = yytext();
+        // The lookahead may be a surrogate pair (e.g. an emoji), so decode backwards
+        // from the end instead of reading the last Java char on its own.
+        int lookahead = matched.codePointBefore(matched.length());
+
+        if (isLetter(lookahead)) {
+            // Followed by a letter - not a valid gender form.
+            // Keep only the WORD part: push back everything from the last colon on.
+            yypushback(matched.length() - matched.lastIndexOf(':'));
+        } else {
+            // Followed by non-letter - valid gender form.
+            // yypushback counts Java chars, so return both halves of a
+            // supplementary lookahead instead of splitting the pair.
+            yypushback(Character.charCount(lookahead));
+        }
+        return currentToken();
+    }
+
+    /**
+     * Handle noun gender ending with slash separator.
+     * Pattern: {WORD}/{-}{GENDER_ENDING}{lookahead}
+     * If lookahead is a letter, return just WORD, pushing back the rest.
+     * If lookahead is not a letter, return WORD/ending or WORD/-ending.
+     * The lookahead is read and pushed back as a whole code point.
+     */
+    final Span genderNounSlashToken() {
+        String matched = yytext();
+        // The lookahead may be a surrogate pair (e.g. an emoji), so decode backwards
+        int lookahead = matched.codePointBefore(matched.length());
+
+        if (isLetter(lookahead)) {
+            // Followed by a letter - not a valid gender form.
+            // Keep only the WORD part: push back everything from the last slash on.
+            yypushback(matched.length() - matched.lastIndexOf('/'));
+        } else {
+            // Followed by non-letter - valid gender form.
+            // yypushback counts Java chars, so return both halves of a
+            // supplementary lookahead instead of splitting the pair.
+            yypushback(Character.charCount(lookahead));
+        }
+        return currentToken();
+    }
+
+    /**
+     * Handle noun gender ending with parentheses.
+     * Pattern: {WORD}({-}{GENDER_ENDING}){lookahead}
+     * If lookahead is a letter, return just WORD, pushing back the rest.
+     * If lookahead is not a letter, return WORD(ending).
+     * The lookahead is read and pushed back as a whole code point.
+     */
+    final Span genderNounParenToken() {
+        String matched = yytext();
+        // The lookahead may be a surrogate pair (e.g. an emoji), so decode backwards
+        int lookahead = matched.codePointBefore(matched.length());
+
+        if (isLetter(lookahead)) {
+            // Followed by a letter - not a valid gender form.
+            // Keep only the WORD part: push back everything from the last open paren on.
+            yypushback(matched.length() - matched.lastIndexOf('('));
+        } else {
+            // Followed by non-letter - valid gender form.
+            // yypushback counts Java chars, so return both halves of a
+            // supplementary lookahead instead of splitting the pair.
+            yypushback(Character.charCount(lookahead));
+        }
+        return currentToken();
+    }
+
+    /**
+     * Handle noun gender ending matched by one of the {@code $}-anchored rules; returns the
+     * current token unchanged. NOTE(review): confirm {@code $} also fires at true end of input.
+     */
+    final Span genderNounAtEOF() {
+        return currentToken();
+    }
 %}
 
 THAI       = [\u0E00-\u0E59]
@@ -740,22 +824,31 @@
 \]\]+                                                          { return currentToken();}
 
 // Gender-sensitive forms (German-specific, via GENDER_ENDING macro in language-specific_de.jflex-macro)
+// These rules use lookahead to ensure the ending is NOT followed by a letter
+// (e.g., "Nutzer/in " is valid, but "Innenminister/Innenministerinnen" is two words)
+
 // Colon forms: Nutzer:in, Nutzer:innen, Kosovo-Albaner:innen
-({WORD}({DASH}{WORD})*):{GENDER_ENDING}                    { return currentToken(); }
+// Match pattern + one extra char, check if it's a letter in semantic action
+({WORD}({DASH}{WORD})*):{GENDER_ENDING}.                   { return genderNounColonToken(); }
+({WORD}({DASH}{WORD})*):{GENDER_ENDING}$                   { return genderNounAtEOF(); }
 
 // Slash forms for -in/-innen: Nutzer/in, Nutzer/innen, Nutzer/-in, Kosovo-Albaner/innen
-({WORD}({DASH}{WORD})*){SLASH}-?{GENDER_ENDING}            { return currentToken(); }
+({WORD}({DASH}{WORD})*){SLASH}-?{GENDER_ENDING}.           { return genderNounSlashToken(); }
+({WORD}({DASH}{WORD})*){SLASH}-?{GENDER_ENDING}$           { return genderNounAtEOF(); }
 
 // Slash forms for -frau: Kaufmann/frau, Kaufmann/-frau, Geschäftsmann/frau
 // Only applies when word ends in "mann" (with non-empty prefix before it)
-({WORD}({DASH}{WORD})*{DASH})?{MANN_WORD}{SLASH}-?{GENDER_ENDING_FRAU}  { return currentToken(); }
+({WORD}({DASH}{WORD})*{DASH})?{MANN_WORD}{SLASH}-?{GENDER_ENDING_FRAU}.  { return genderNounSlashToken(); }
+({WORD}({DASH}{WORD})*{DASH})?{MANN_WORD}{SLASH}-?{GENDER_ENDING_FRAU}$  { return genderNounAtEOF(); }
 
 // Parenthetical forms for -in/-innen: Nutzer(in), Nutzer(innen), Nutzer(-in)
-({WORD}({DASH}{WORD})*)"("-?{GENDER_ENDING}")"             { return currentToken(); }
+({WORD}({DASH}{WORD})*)"("-?{GENDER_ENDING}")".            { return genderNounParenToken(); }
+({WORD}({DASH}{WORD})*)"("-?{GENDER_ENDING}")"$            { return genderNounAtEOF(); }
 
 // Parenthetical forms for -frau: Kaufmann(frau), Kaufmann(-frau)
 // Only applies when word ends in "mann" (with non-empty prefix before it)
-({WORD}({DASH}{WORD})*{DASH})?{MANN_WORD}"("-?{GENDER_ENDING_FRAU}")"  { return currentToken(); }
+({WORD}({DASH}{WORD})*{DASH})?{MANN_WORD}"("-?{GENDER_ENDING_FRAU}")".  { return genderNounParenToken(); }
+({WORD}({DASH}{WORD})*{DASH})?{MANN_WORD}"("-?{GENDER_ENDING_FRAU}")"$  { return genderNounAtEOF(); }
 
 // Short gender endings (determiners, adjectives, pronouns)
 // e.g. eine(n), gute:r, ihm/r, ein(e)

diff --git a/src/test/resources/tokenizer/dontsplit.txt b/src/test/resources/tokenizer/dontsplit.txt
index 4b71731..1997f6e 100644
--- a/src/test/resources/tokenizer/dontsplit.txt
+++ b/src/test/resources/tokenizer/dontsplit.txt

@@ -13,6 +13,7 @@
 Verkäufer*innen
 Verkäufer_innen
 Verkäufer:innen
+Innenminster/innen
 ein(e)
 ein/e
 ein*e

diff --git a/src/test/resources/tokenizer/split.txt b/src/test/resources/tokenizer/split.txt
index 99650b4..14a0e37 100644
--- a/src/test/resources/tokenizer/split.txt
+++ b/src/test/resources/tokenizer/split.txt

@@ -8,3 +8,4 @@
 das/ist/falsch
 mir:geht
 Vor/Nachteile
+Innenminister/Innenministerinnen
commit	2173013a1ef865a231026ca4fd81f041732c769f	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Feb 07 12:29:13 2026 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Feb 07 12:32:10 2026 +0100
tree	63c9029b5d691871a7141ac73fd843103df0c45e
parent	9ef5dec549c076b6d4ef028f862ed6954133a2be [diff]