Support German gender-sensitive DET, ADJ, PRON endings Change-Id: I22050da6cfc6c6f9abca9de0c9a7dd0df5574148

commit: 9ef5dec549c076b6d4ef028f862ed6954133a2be [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Feb 05 17:30:09 2026 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Sat Feb 07 10:12:25 2026 +0100
tree: ed75b8c4b0d9f9a9eedfadaba9d7d4562e90a06d
parent: fc7c04a8b78fd43ef15409ad9cf67ee7c75eb18c [diff]
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d63d5c..3881a21 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md

@@ -9,6 +9,7 @@
   - Parenthetical forms: `Nutzer(in)`, `Nutzer(innen)`, `Nutzer(-in)`
   - Kaufmann/frau pattern: `Kaufmann/frau`, `Kaufmann/-frau`, `Geschäftsmann/frau`
     (only applies when word ends in "mann" with non-empty prefix)
+  - Short forms for determiners, adjectives, pronouns: `eine(n)`, `gute:r`, `ihm/r`, `diese(r)`, `ein(e)`
 
 ## 2.3.1 [2026-01-28]
 

diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 8609827..6219a01 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex

@@ -199,7 +199,10 @@
         int i = 0;
         List<Span> list = new ArrayList<Span>();
         tokenId = 0;
-        yyreset(new StringReader(s));
+        // Ensure input ends with newline so that $ (end-of-line) rules work correctly
+        // at end of string. This is needed for gender suffix detection at EOF.
+        String input = s.endsWith("\n") || s.endsWith("\r") ? s : s + "\n";
+        yyreset(new StringReader(input));
         try {
             while (!this.zzAtEOF) {
                 token = this.getNextToken();
@@ -342,6 +345,93 @@
         name = matched.substring(start, matched.length() - 3);
         outputStream.println("<file name=\"" + name + "\"/>");
     }
+
+    /**
+     * Return true if ch is non-negative and Character.isLetter(ch) classifies it as a letter.
+     */
+    private boolean isLetter(int ch) {
+        return ch >= 0 && Character.isLetter(ch);
+    }
+
+    /**
+     * Handle gender short suffix with colon separator.
+     * Pattern: {WORD}:{suffix}{lookahead}, where lookahead is the last code point of yytext().
+     * If lookahead is a letter, return just WORD, pushing back the colon and everything after it.
+     * If lookahead is not a letter, return WORD:suffix, pushing back only the lookahead.
+     */
+    final Span genderColonSuffixToken() {
+        String matched = yytext();
+        // Read the whole trailing code point; codePointAt(length - 1) would see only a low surrogate
+        int lastChar = matched.codePointBefore(matched.length());
+        // Find the colon position
+        int colonPos = matched.lastIndexOf(':');
+
+        if (isLetter(lastChar)) {
+            // Followed by a letter - not a valid gender form
+            // Return just the WORD part (before colon)
+            yypushback(matched.length() - colonPos);
+            return currentToken();
+        } else {
+            // Followed by non-letter - valid gender form
+            // Push back just the lookahead code point (two chars if it is a surrogate pair)
+            yypushback(Character.charCount(lastChar));
+            return currentToken();
+        }
+    }
+
+    /**
+     * Handle gender short suffix with slash separator (optionally followed by a dash).
+     * Pattern: {WORD}/{-suffix}{lookahead}; a letter lookahead yields WORD only, otherwise WORD/suffix.
+     */
+    final Span genderSlashSuffixToken() {
+        String matched = yytext();
+        // Read the whole trailing code point; codePointAt(length - 1) would see only a low surrogate
+        int lastChar = matched.codePointBefore(matched.length());
+        // Find the slash position (NOTE(review): assumes {SLASH} matches only '/' - confirm against the macro)
+        int slashPos = matched.lastIndexOf('/');
+
+        if (isLetter(lastChar)) {
+            // Followed by a letter - not a valid gender form
+            // Return just the WORD part (before slash)
+            yypushback(matched.length() - slashPos);
+            return currentToken();
+        } else {
+            // Followed by non-letter - valid gender form; push back the lookahead code point (1 or 2 chars)
+            yypushback(Character.charCount(lastChar));
+            return currentToken();
+        }
+    }
+
+    /**
+     * Handle gender short suffix with star separator.
+     * Pattern: {WORD}*{suffix}{lookahead}; a letter lookahead yields WORD only, otherwise WORD*suffix.
+     */
+    final Span genderStarSuffixToken() {
+        String matched = yytext();
+        // Read the whole trailing code point; codePointAt(length - 1) would see only a low surrogate
+        int lastChar = matched.codePointBefore(matched.length());
+        // Find the star position
+        int starPos = matched.lastIndexOf('*');
+
+        if (isLetter(lastChar)) {
+            // Followed by a letter - not a valid gender form
+            // Return just the WORD part (before star)
+            yypushback(matched.length() - starPos);
+            return currentToken();
+        } else {
+            // Followed by non-letter - valid gender form; push back the lookahead code point (1 or 2 chars)
+            yypushback(Character.charCount(lastChar));
+            return currentToken();
+        }
+    }
+
+    /**
+     * Handle gender short suffix matched by a {@code $} (end-of-line) rule, so yytext() has no lookahead char.
+     * Nothing is pushed back; the result of currentToken() is returned as is.
+     */
+    final Span genderShortSuffixAtEOF() {
+        return currentToken();
+    }
 %}
 
 THAI       = [\u0E00-\u0E59]
@@ -667,6 +757,31 @@
 // Only applies when word ends in "mann" (with non-empty prefix before it)
 ({WORD}({DASH}{WORD})*{DASH})?{MANN_WORD}"("-?{GENDER_ENDING_FRAU}")"  { return currentToken(); }
 
+// Short gender endings (determiners, adjectives, pronouns)
+// e.g. eine(n), gute:r, ihm/r, ein(e)
+// Separators: colon, slash (optional dash), parens, star
+
+// Colon: gute:r, ein:e
+// Match pattern + one extra char (`.` excludes line terminators, hence the `$` twin rule); check if it's a letter in semantic action
+({WORD}):{GENDER_SHORT_SUFFIX}.  { return genderColonSuffixToken(); }
+({WORD}):{GENDER_SHORT_SUFFIX}$  { return genderShortSuffixAtEOF(); }
+
+// Slash: ihm/r, eine/-n, ein/e
+({WORD}){SLASH}-?{GENDER_SHORT_SUFFIX}.  { return genderSlashSuffixToken(); }
+({WORD}){SLASH}-?{GENDER_SHORT_SUFFIX}$  { return genderShortSuffixAtEOF(); }
+
+// Parens: eine(n), ein(e) - parentheses already provide word boundary
+({WORD})"("-?{GENDER_SHORT_SUFFIX}")"       { return currentToken(); }
+
+// Star: gute*r
+({WORD})\*{GENDER_SHORT_SUFFIX}.  { return genderStarSuffixToken(); }
+({WORD})\*{GENDER_SHORT_SUFFIX}$  { return genderShortSuffixAtEOF(); }
+
+
+
+
+
+
 
 // normal stuff
 

diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro
index 421c637..6d62414 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro

@@ -21,6 +21,11 @@
 // Matches patterns like: in, innen, In, Innen, IN, INNEN (case-insensitive)
 GENDER_ENDING_IN = ([iI][nN]|[iI][nN][nN][eE][nN])
 
+// Short endings for determiners, adjectives, pronouns
+// e, n, r, s, m, es, er, em, en (lowercase only - gender suffixes are never uppercase)
+GENDER_SHORT_SUFFIX = ([enrsm]|e[srmn])
+
+
 // Gender-sensitive endings with frau/frauen (lowercase only - capitalized Frau is a standalone word)
 // Note: This is now only used with MANN_WORD, not in general GENDER_ENDING
 GENDER_ENDING_FRAU = (frau(en)?)
@@ -32,3 +37,6 @@
 // Matches: Kaufmann, Geschäftsmann, etc. but NOT just "mann"
 MANN_WORD = ({LETTER}+[Mm][Aa][Nn][Nn])
 
+// Non-letter character for lookahead (word boundary equivalent)
+// Matches whitespace, newlines, common punctuation, and NUL (for EOF detection)
+NON_LETTER = [\x00\n\r\t ,.;:!?'\)\]\}\>]

diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro
index 411c7ad..ccdf46e 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro

@@ -8,7 +8,13 @@
 CLITIC = ({ENGLISH_CLITIC}|{FRENCH_CLITIC})
 
 // Gender-sensitive endings not used for English (German-only feature)
-// Use NUL character so the rules never match
-GENDER_ENDING = (\u0000)
-GENDER_ENDING_FRAU = (\u0000)
-MANN_WORD = (\u0000)
+// Use Unicode non-characters (U+FDD0-FDD2) so the rules never match
+GENDER_ENDING = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_FRAU = ("\uFDD0\uFDD1\uFDD2")
+GENDER_SHORT_SUFFIX = ("\uFDD0\uFDD1\uFDD2")
+MANN_WORD = ("\uFDD0\uFDD1\uFDD2")
+
+// Non-letter character for lookahead (word boundary equivalent)
+NON_LETTER = [\x00\n\r\t ,.;:!?'\)\]\}\>]
+
+

diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro
index 4cb9166..fdca84a 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro

@@ -10,7 +10,13 @@
 CLITIC = ({ENGLISH_CLITIC}|{FRENCH_CLITIC})
 
 // Gender-sensitive endings not used for French (German-only feature)
-// Use NUL character so the rules never match
-GENDER_ENDING = (\u0000)
-GENDER_ENDING_FRAU = (\u0000)
-MANN_WORD = (\u0000)
+// Use Unicode non-characters (U+FDD0-FDD2) so the rules never match
+GENDER_ENDING = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_FRAU = ("\uFDD0\uFDD1\uFDD2")
+GENDER_SHORT_SUFFIX = ("\uFDD0\uFDD1\uFDD2")
+MANN_WORD = ("\uFDD0\uFDD1\uFDD2")
+
+// Non-letter character for lookahead (word boundary equivalent)
+NON_LETTER = [\x00\n\r\t ,.;:!?'\)\]\}\>]
+
+

diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index 1d8961a..7d92a03 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java

@@ -13,6 +13,11 @@
 import org.junit.runners.JUnit4;
 
 import java.io.PrintStream;
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
 import java.lang.reflect.InvocationTargetException;
 
 @RunWith(JUnit4.class)
@@ -1056,4 +1061,35 @@
         assertEquals("frau", tokens[5]);
         assertEquals(6, tokens.length);
     }
+
+    @Test // every data line of /tokenizer/dontsplit.txt must come back as exactly one token equal to the line
+    public void testGenderSensitiveFromFile() throws IOException {
+        DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+        try (InputStream is = getClass().getResourceAsStream("/tokenizer/dontsplit.txt"); // NOTE(review): null if the resource is missing
+             BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                line = line.trim();
+                if (line.isEmpty() || line.startsWith("#")) continue; // skip blank lines and '#' comment lines
+                String[] tokens = tok.tokenize(line);
+                assertEquals("Should not split: " + line, 1, tokens.length);
+                assertEquals("Should match exact string: " + line, line, tokens[0]);
+            }
+        }
+    }
+
+    @Test // every data line of /tokenizer/split.txt must be tokenized into more than one token
+    public void testSplitFromFile() throws IOException {
+        DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+        try (InputStream is = getClass().getResourceAsStream("/tokenizer/split.txt"); // NOTE(review): null if the resource is missing
+             BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
+            String line;
+            while ((line = reader.readLine()) != null) {
+                line = line.trim();
+                if (line.isEmpty() || line.startsWith("#")) continue; // skip blank lines and '#' comment lines
+                String[] tokens = tok.tokenize(line);
+                assertTrue("Should split: " + line, tokens.length > 1); // only the count is checked, not the token texts
+            }
+        }
+    }
 }

diff --git a/src/test/resources/tokenizer/dontsplit.txt b/src/test/resources/tokenizer/dontsplit.txt
new file mode 100644
index 0000000..4b71731
--- /dev/null
+++ b/src/test/resources/tokenizer/dontsplit.txt

@@ -0,0 +1,40 @@
+gute:r
+diese(r)
+ihm/r
+ein:e
+jede*r
+große_r
+eines/r
+Kaufmann/frau
+Nutzer:in
+Kaufmann(-frau)
+Verkäufer/in
+Verkäufer/-in
+Verkäufer*innen
+Verkäufer_innen
+Verkäufer:innen
+ein(e)
+ein/e
+ein*e
+ein_e
+eines/r
+einer/s
+einem/r
+einer/m
+eine/n
+diese(n)
+diese/r
+diese:r
+diesem/r
+lehrer:innen
+schüler*innen
+student_innen
+mitarbeiter:in
+kolleg/in
+eine:r
+ein:e
+jede:r
+jede*r
+jede_r
+jede/r
+eine(n)

diff --git a/src/test/resources/tokenizer/split.txt b/src/test/resources/tokenizer/split.txt
new file mode 100644
index 0000000..99650b4
--- /dev/null
+++ b/src/test/resources/tokenizer/split.txt

@@ -0,0 +1,10 @@
+der/die
+er/sie
+und/oder
+Modell/Versuch
+Quelle:rbb
+Foto:emm
+Dies(ist)falsch
+das/ist/falsch
+mir:geht
+Vor/Nachteile
commit	9ef5dec549c076b6d4ef028f862ed6954133a2be	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Feb 05 17:30:09 2026 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Sat Feb 07 10:12:25 2026 +0100
tree	ed75b8c4b0d9f9a9eedfadaba9d7d4562e90a06d
parent	fc7c04a8b78fd43ef15409ad9cf67ee7c75eb18c [diff]