Introduced multiple gender forms for nouns in German
Change-Id: Ic98042ccc01903ec279f9d58a4c3a11504dc4025
diff --git a/src/de/gender.xfst b/src/de/gender.xfst
new file mode 100644
index 0000000..8adc35c
--- /dev/null
+++ b/src/de/gender.xfst
@@ -0,0 +1,34 @@
+! Gender-sensitive endings (German)
+! By M. Kupietz (KorAP-Tokenizer)
+
+! Case-insensitive "in" / "innen": in, In, IN, innen, Innen, INNEN (mixed case such as "iN" also matches)
+define genderEndingsIn [ [I | i] [N | n] ] ( [N | n] [E | e] [N | n] );
+
+! Lowercase-only "in" / "innen", used for the slash forms to avoid
+! false positives in compounds like "Nutzer/Innenarchitekt".
+define genderEndingsInLower {in} ( {nen} );
+
+! Feminine counterpart of "-mann": "frau" with an optional plural "en"
+! (lowercase only - capitalized "Frau" is a standalone word).
+! This network matches only the ending itself ("frau", "frauen").
+! The restriction to words ending in "mann" (Kaufmann/frau,
+! Geschäftsmann/frau, ...) is imposed by genderFrau and genderParenFrau.
+define genderEndingsFrau [ f r a u ( e n ) ];
+
+! -in/-innen endings attached with a colon or a slash (parenthetical forms: see genderParenIn)
+! Colon forms (any letter case): Nutzer:in, Nutzer:In, Nutzer:innen
+! Slash forms (lowercase, optional hyphen): Nutzer/in, Nutzer/innen, Nutzer/-in, Kosovo-Albaner/innen
+define genderIn [ Slash ( %- ) genderEndingsInLower | ":" genderEndingsIn ];
+
+! Slash forms for -frau: Kaufmann/frau, Kaufmann/-frau, Geschäftsmann/frau
+! The pattern itself starts with the literal "mann", so it only fits words ending in "mann"
+define genderFrau [ m a n n ] Slash ( %- ) genderEndingsFrau;
+
+! -in/-innen in parentheses, optionally with a hyphen (any letter case): Nutzer(in), Nutzer(innen), Nutzer(-in)
+define genderParenIn [ "(" ( %- ) genderEndingsIn ")" ];
+
+! -frau in parentheses, optionally with a hyphen: Kaufmann(frau), Kaufmann(-frau)
+! The pattern itself starts with the literal "mann", so it only fits words ending in "mann"
+define genderParenFrau [ m a n n "(" ( %- ) genderEndingsFrau ")" ];
+! Union of all gender-sensitive suffix patterns defined above
+define GenderEndings [ genderParenFrau | genderParenIn | genderFrau | genderIn ];
\ No newline at end of file
diff --git a/src/de/tokenizer.xfst b/src/de/tokenizer.xfst
index aa2092b..a6bff24 100644
--- a/src/de/tokenizer.xfst
+++ b/src/de/tokenizer.xfst
@@ -46,10 +46,17 @@
define NotSmallCaps [? - a - b - c - d - e - f - g - h - i - j - k - l - m - n - o - p - q - r - s - t - u - v - w - x - y - z - ü - ö - ä - è - é - ú - á - â - ê - î - ô - û];
-define Word Char+ ([Apos|Asterisk] Char+)* ([s|S] [%’|%`]);
+define BaseWord Char+ [ [Asterisk | Apos] Char+ ]* ( [S | s] [%` | %’] );
define Plusampersand @txt"de/plusampersand.txt";
-define Word [Plusampersand | Word] (Dash [Plusampersand | Word])*;
+! Load the gender-sensitive endings; defines GenderEndings used by Word below
+source de/gender.xfst
+! CoreWord: Dash-separated chain of BaseWord / Plusampersand items; Word may add one gender ending
+define CoreWord [BaseWord | Plusampersand] [Dash [BaseWord | Plusampersand]]*;
+define Word CoreWord ( GenderEndings );
+
+! TODO(review): leftover draft rule "[ Word GenderEndings || WS] @-> ... NLout," - Word already includes GenderEndings; confirm and remove
+
! Abbreviations and Initials
! The abbreviation list is part of the sentence splitter tool