Support German gender-sensitive DET, ADJ, PRON endings
Change-Id: I22050da6cfc6c6f9abca9de0c9a7dd0df5574148
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4d63d5c..3881a21 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -9,6 +9,7 @@
- Parenthetical forms: `Nutzer(in)`, `Nutzer(innen)`, `Nutzer(-in)`
- Kaufmann/frau pattern: `Kaufmann/frau`, `Kaufmann/-frau`, `Geschäftsmann/frau`
(only applies when word ends in "mann" with non-empty prefix)
+ - Short forms for determiners, adjectives, pronouns: `eine(n)`, `gute:r`, `ihm/r`, `diese(r)`, `ein(e)`
## 2.3.1 [2026-01-28]
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 8609827..6219a01 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -199,7 +199,10 @@
int i = 0;
List<Span> list = new ArrayList<Span>();
tokenId = 0;
- yyreset(new StringReader(s));
+ // Ensure input ends with newline so that $ (end-of-line) rules work correctly
+ // at end of string. This is needed for gender suffix detection at EOF.
+ String input = s.endsWith("\n") || s.endsWith("\r") ? s : s + "\n";
+ yyreset(new StringReader(input));
try {
while (!this.zzAtEOF) {
token = this.getNextToken();
@@ -342,6 +345,93 @@
name = matched.substring(start, matched.length() - 3);
outputStream.println("<file name=\"" + name + "\"/>");
}
+
+ /**
+ * Check if a character is a Unicode letter.
+ */
+ private boolean isLetter(int ch) {
+ return ch >= 0 && Character.isLetter(ch);
+ }
+
+ /**
+ * Handle gender short suffix with colon separator.
+ * Pattern: {WORD}:{suffix}{lookahead}
+ * If lookahead is a letter, return just WORD, pushing back the rest.
+ * If lookahead is not a letter, return WORD:suffix.
+ */
+ final Span genderColonSuffixToken() {
+ String matched = yytext();
+ int lastChar = matched.codePointAt(matched.length() - 1);
+
+ // Find the colon position
+ int colonPos = matched.lastIndexOf(':');
+
+ if (isLetter(lastChar)) {
+ // Followed by a letter - not a valid gender form
+ // Return just the WORD part (before colon)
+ yypushback(matched.length() - colonPos);
+ return currentToken();
+ } else {
+ // Followed by non-letter - valid gender form
+ // Push back just the lookahead character
+ yypushback(1);
+ return currentToken();
+ }
+ }
+
+ /**
+ * Handle gender short suffix with slash separator.
+ * Pattern: {WORD}/{-suffix}{lookahead}
+ */
+ final Span genderSlashSuffixToken() {
+ String matched = yytext();
+ int lastChar = matched.codePointAt(matched.length() - 1);
+
+ // Find the slash position
+ int slashPos = matched.lastIndexOf('/');
+
+ if (isLetter(lastChar)) {
+ // Followed by a letter - not a valid gender form
+ // Return just the WORD part (before slash)
+ yypushback(matched.length() - slashPos);
+ return currentToken();
+ } else {
+ // Followed by non-letter - valid gender form
+ yypushback(1);
+ return currentToken();
+ }
+ }
+
+ /**
+ * Handle gender short suffix with star separator.
+ * Pattern: {WORD}*{suffix}{lookahead}
+ */
+ final Span genderStarSuffixToken() {
+ String matched = yytext();
+ int lastChar = matched.codePointAt(matched.length() - 1);
+
+ // Find the star position
+ int starPos = matched.lastIndexOf('*');
+
+ if (isLetter(lastChar)) {
+ // Followed by a letter - not a valid gender form
+ // Return just the WORD part (before star)
+ yypushback(matched.length() - starPos);
+ return currentToken();
+ } else {
+ // Followed by non-letter - valid gender form
+ yypushback(1);
+ return currentToken();
+ }
+ }
+
+ /**
+ * Handle gender short suffix at end of input (no lookahead char).
+ * This is always a valid gender form since there's nothing following.
+ */
+ final Span genderShortSuffixAtEOF() {
+ return currentToken();
+ }
%}
THAI = [\u0E00-\u0E59]
@@ -667,6 +757,31 @@
// Only applies when word ends in "mann" (with non-empty prefix before it)
({WORD}({DASH}{WORD})*{DASH})?{MANN_WORD}"("-?{GENDER_ENDING_FRAU}")" { return currentToken(); }
+// Short gender endings (determiners, adjectives, pronouns)
+// e.g. eine(n), gute:r, ihm/r, ein(e)
+// Separators: colon, slash (optional dash), parens, star
+
+// Colon: gute:r, ein:e
+// Match pattern + one extra char, check if it's a letter in semantic action
+({WORD}):{GENDER_SHORT_SUFFIX}. { return genderColonSuffixToken(); }
+({WORD}):{GENDER_SHORT_SUFFIX}$ { return genderShortSuffixAtEOF(); }
+
+// Slash: ihm/r, eine/-n, ein/e
+({WORD}){SLASH}-?{GENDER_SHORT_SUFFIX}. { return genderSlashSuffixToken(); }
+({WORD}){SLASH}-?{GENDER_SHORT_SUFFIX}$ { return genderShortSuffixAtEOF(); }
+
+// Parens: eine(n), ein(e) - parentheses already provide word boundary
+({WORD})"("-?{GENDER_SHORT_SUFFIX}")" { return currentToken(); }
+
+// Star: gute*r
+({WORD})\*{GENDER_SHORT_SUFFIX}. { return genderStarSuffixToken(); }
+({WORD})\*{GENDER_SHORT_SUFFIX}$ { return genderShortSuffixAtEOF(); }
+
+
+
+
+
+
// normal stuff
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro
index 421c637..6d62414 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_de.jflex-macro
@@ -21,6 +21,11 @@
// Matches patterns like: in, innen, In, Innen, IN, INNEN (case-insensitive)
GENDER_ENDING_IN = ([iI][nN]|[iI][nN][nN][eE][nN])
+// Short endings for determiners, adjectives, pronouns
+// e, n, r, s, m, es, er, em, en (lowercase only - gender suffixes are never uppercase)
+GENDER_SHORT_SUFFIX = ([enrsm]|e[srmn])
+
+
// Gender-sensitive endings with frau/frauen (lowercase only - capitalized Frau is a standalone word)
// Note: This is now only used with MANN_WORD, not in general GENDER_ENDING
GENDER_ENDING_FRAU = (frau(en)?)
@@ -32,3 +37,6 @@
// Matches: Kaufmann, Geschäftsmann, etc. but NOT just "mann"
MANN_WORD = ({LETTER}+[Mm][Aa][Nn][Nn])
+// Non-letter character for lookahead (word boundary equivalent)
+// Matches whitespace, newlines, common punctuation, and NUL (for EOF detection)
+NON_LETTER = [\x00\n\r\t ,.;:!?'\)\]\}\>]
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro
index 411c7ad..ccdf46e 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_en.jflex-macro
@@ -8,7 +8,13 @@
CLITIC = ({ENGLISH_CLITIC}|{FRENCH_CLITIC})
// Gender-sensitive endings not used for English (German-only feature)
-// Use NUL character so the rules never match
-GENDER_ENDING = (\u0000)
-GENDER_ENDING_FRAU = (\u0000)
-MANN_WORD = (\u0000)
+// Use Unicode non-characters (U+FDD0-FDD2) so the rules never match
+GENDER_ENDING = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_FRAU = ("\uFDD0\uFDD1\uFDD2")
+GENDER_SHORT_SUFFIX = ("\uFDD0\uFDD1\uFDD2")
+MANN_WORD = ("\uFDD0\uFDD1\uFDD2")
+
+// Non-letter character for lookahead (word boundary equivalent)
+NON_LETTER = [\x00\n\r\t ,.;:!?'\)\]\}\>]
+
+
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro
index 4cb9166..fdca84a 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/language-specific_fr.jflex-macro
@@ -10,7 +10,13 @@
CLITIC = ({ENGLISH_CLITIC}|{FRENCH_CLITIC})
// Gender-sensitive endings not used for French (German-only feature)
-// Use NUL character so the rules never match
-GENDER_ENDING = (\u0000)
-GENDER_ENDING_FRAU = (\u0000)
-MANN_WORD = (\u0000)
+// Use Unicode non-characters (U+FDD0-FDD2) so the rules never match
+GENDER_ENDING = ("\uFDD0\uFDD1\uFDD2")
+GENDER_ENDING_FRAU = ("\uFDD0\uFDD1\uFDD2")
+GENDER_SHORT_SUFFIX = ("\uFDD0\uFDD1\uFDD2")
+MANN_WORD = ("\uFDD0\uFDD1\uFDD2")
+
+// Non-letter character for lookahead (word boundary equivalent)
+NON_LETTER = [\x00\n\r\t ,.;:!?'\)\]\}\>]
+
+
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index 1d8961a..7d92a03 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -13,6 +13,11 @@
import org.junit.runners.JUnit4;
import java.io.PrintStream;
+import java.io.BufferedReader;
+import java.io.InputStream;
+import java.io.InputStreamReader;
+import java.io.IOException;
+import java.nio.charset.StandardCharsets;
import java.lang.reflect.InvocationTargetException;
@RunWith(JUnit4.class)
@@ -1056,4 +1061,35 @@
assertEquals("frau", tokens[5]);
assertEquals(6, tokens.length);
}
+
+ @Test
+ public void testGenderSensitiveFromFile() throws IOException {
+ DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+ try (InputStream is = getClass().getResourceAsStream("/tokenizer/dontsplit.txt");
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ line = line.trim();
+ if (line.isEmpty() || line.startsWith("#")) continue;
+ String[] tokens = tok.tokenize(line);
+ assertEquals("Should not split: " + line, 1, tokens.length);
+ assertEquals("Should match exact string: " + line, line, tokens[0]);
+ }
+ }
+ }
+
+ @Test
+ public void testSplitFromFile() throws IOException {
+ DerekoDfaTokenizer_de tok = new DerekoDfaTokenizer_de();
+ try (InputStream is = getClass().getResourceAsStream("/tokenizer/split.txt");
+ BufferedReader reader = new BufferedReader(new InputStreamReader(is, StandardCharsets.UTF_8))) {
+ String line;
+ while ((line = reader.readLine()) != null) {
+ line = line.trim();
+ if (line.isEmpty() || line.startsWith("#")) continue;
+ String[] tokens = tok.tokenize(line);
+ assertTrue("Should split: " + line, tokens.length > 1);
+ }
+ }
+ }
}
diff --git a/src/test/resources/tokenizer/dontsplit.txt b/src/test/resources/tokenizer/dontsplit.txt
new file mode 100644
index 0000000..4b71731
--- /dev/null
+++ b/src/test/resources/tokenizer/dontsplit.txt
@@ -0,0 +1,40 @@
+gute:r
+diese(r)
+ihm/r
+ein:e
+jede*r
+große_r
+eines/r
+Kaufmann/frau
+Nutzer:in
+Kaufmann(-frau)
+Verkäufer/in
+Verkäufer/-in
+Verkäufer*innen
+Verkäufer_innen
+Verkäufer:innen
+ein(e)
+ein/e
+ein*e
+ein_e
+eines/r
+einer/s
+einem/r
+einer/m
+eine/n
+diese(n)
+diese/r
+diese:r
+diesem/r
+lehrer:innen
+schüler*innen
+student_innen
+mitarbeiter:in
+kolleg/in
+eine:r
+ein:e
+jede:r
+jede*r
+jede_r
+jede/r
+eine(n)
diff --git a/src/test/resources/tokenizer/split.txt b/src/test/resources/tokenizer/split.txt
new file mode 100644
index 0000000..99650b4
--- /dev/null
+++ b/src/test/resources/tokenizer/split.txt
@@ -0,0 +1,10 @@
+der/die
+er/sie
+und/oder
+Modell/Versuch
+Quelle:rbb
+Foto:emm
+Dies(ist)falsch
+das/ist/falsch
+mir:geht
+Vor/Nachteile