Introduced multiple gender forms for nouns in German

Change-Id: Ic98042ccc01903ec279f9d58a4c3a11504dc4025
diff --git a/datok_test.go b/datok_test.go
index 6c64260..e18ba0a 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -793,167 +793,330 @@
 
 	// Regression test for hyphenated abbreviations from Wiktionary (2024-12)
 	tokens = ttokenize(dat, w, "Ich wohne in Ba.-Wü. und bin Dipl.-Ing. bei Reg.-Bez. Karlsruhe.")
-        assert.Equal("Ich", tokens[0])
-        assert.Equal("wohne", tokens[1])
-        assert.Equal("in", tokens[2])
-        assert.Equal("Ba.-Wü.", tokens[3])
-        assert.Equal("und", tokens[4])
-        assert.Equal("bin", tokens[5])
-        assert.Equal("Dipl.-Ing.", tokens[6])
-        assert.Equal("bei", tokens[7])
-        assert.Equal("Reg.-Bez.", tokens[8])
-        assert.Equal("Karlsruhe", tokens[9])
-        assert.Equal(".", tokens[10])
-	assert.Equal(11, len(tokens));
+	assert.Equal("Ich", tokens[0])
+	assert.Equal("wohne", tokens[1])
+	assert.Equal("in", tokens[2])
+	assert.Equal("Ba.-Wü.", tokens[3])
+	assert.Equal("und", tokens[4])
+	assert.Equal("bin", tokens[5])
+	assert.Equal("Dipl.-Ing.", tokens[6])
+	assert.Equal("bei", tokens[7])
+	assert.Equal("Reg.-Bez.", tokens[8])
+	assert.Equal("Karlsruhe", tokens[9])
+	assert.Equal(".", tokens[10])
+	assert.Equal(11, len(tokens))
 
 	// Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/131
 	tokens = ttokenize(dat, w, "Donau\u00ADdampf\u00ADschiff")
-        assert.Equal("Donau\u00ADdampf\u00ADschiff", tokens[0])
-	assert.Equal(1, len(tokens));
+	assert.Equal("Donau\u00ADdampf\u00ADschiff", tokens[0])
+	assert.Equal(1, len(tokens))
 
 	// Regression test for https://github.com/KorAP/KorAP-Tokenizer/issues/115
 	tokens = ttokenize(dat, w, "Die Serb*innen wie die Kosovo-Albaner*innen")
-        assert.Equal("Die", tokens[0]);
-        assert.Equal("Serb*innen", tokens[1]);
-        assert.Equal("wie", tokens[2]);
-        assert.Equal("die", tokens[3]);
-        assert.Equal("Kosovo-Albaner*innen", tokens[4]);
-        assert.Equal(5, len(tokens));
-       
-        // Test Wikipedia emoji template from the issue
+	assert.Equal("Die", tokens[0])
+	assert.Equal("Serb*innen", tokens[1])
+	assert.Equal("wie", tokens[2])
+	assert.Equal("die", tokens[3])
+	assert.Equal("Kosovo-Albaner*innen", tokens[4])
+	assert.Equal(5, len(tokens))
+
+	// Test Wikipedia emoji template from the issue
 	tokens = ttokenize(dat, w, "Ein Smiley [_EMOJI:{{S|;)}}_] hier")
-        assert.Equal("Ein", tokens[0]);
-        assert.Equal("Smiley", tokens[1]);
-        assert.Equal("[_EMOJI:{{S|;)}}_]", tokens[2]); // Should be one token
-        assert.Equal("hier", tokens[3]);
-        assert.Equal(4, len(tokens));
-        
-        // Test simple pragma still works
+	assert.Equal("Ein", tokens[0])
+	assert.Equal("Smiley", tokens[1])
+	assert.Equal("[_EMOJI:{{S|;)}}_]", tokens[2]) // Should be one token
+	assert.Equal("hier", tokens[3])
+	assert.Equal(4, len(tokens))
+
+	// Test simple pragma still works
 	tokens = ttokenize(dat, w, "Name: [_ANONYMIZED_] Ende")
-        assert.Equal("Name", tokens[0]);
-        assert.Equal(":", tokens[1]);
-        assert.Equal("[_ANONYMIZED_]", tokens[2]); // Should be one token
-        assert.Equal("Ende", tokens[3]);
-        assert.Equal(4, len(tokens));
+	assert.Equal("Name", tokens[0])
+	assert.Equal(":", tokens[1])
+	assert.Equal("[_ANONYMIZED_]", tokens[2]) // Should be one token
+	assert.Equal("Ende", tokens[3])
+	assert.Equal(4, len(tokens))
+
+	// Gender forms
+	// Basic colon forms with -in/-innen
+	tokens = ttokenize(dat, w, "Die Schüler:innen und Lehrer:in kamen.")
+	assert.Equal("Die", tokens[0])
+	assert.Equal("Schüler:innen", tokens[1])
+	assert.Equal("und", tokens[2])
+	assert.Equal("Lehrer:in", tokens[3])
+	assert.Equal("kamen", tokens[4])
+	assert.Equal(".", tokens[5])
+	assert.Equal(6, len(tokens))
+
+	// More colon examples
+	tokens = ttokenize(dat, w, "Künstler:innen Mitarbeiter:innen Bürger:innen")
+	assert.Equal("Künstler:innen", tokens[0])
+	assert.Equal("Mitarbeiter:innen", tokens[1])
+	assert.Equal("Bürger:innen", tokens[2])
+	assert.Equal(3, len(tokens))
+
+	// Basic slash forms
+	tokens = ttokenize(dat, w, "Autor/in Autor/innen Teilnehmer/innen")
+	assert.Equal("Autor/in", tokens[0])
+	assert.Equal("Autor/innen", tokens[1])
+	assert.Equal("Teilnehmer/innen", tokens[2])
+	assert.Equal(3, len(tokens))
+
+	// Slash forms with hyphen: /-in, /-innen, /-frau
+	tokens = ttokenize(dat, w, "Kaufmann/-frau und Fachmann/-frau")
+	assert.Equal("Kaufmann/-frau", tokens[0])
+	assert.Equal("und", tokens[1])
+	assert.Equal("Fachmann/-frau", tokens[2])
+	assert.Equal(3, len(tokens))
+
+	// Slash forms without hyphen for frau (lowercase only)
+	tokens = ttokenize(dat, w, "Kaufmann/frau ist auch korrekt.")
+	assert.Equal("Kaufmann/frau", tokens[0])
+	assert.Equal("ist", tokens[1])
+	assert.Equal("auch", tokens[2])
+	assert.Equal("korrekt", tokens[3])
+	assert.Equal(".", tokens[4])
+	assert.Equal(5, len(tokens))
+
+	// Basic parenthetical forms
+	tokens = ttokenize(dat, w, "Schüler(innen) und Lehrer(in) kamen.")
+	assert.Equal("Schüler(innen)", tokens[0])
+	assert.Equal("und", tokens[1])
+	assert.Equal("Lehrer(in)", tokens[2])
+	assert.Equal("kamen", tokens[3])
+	assert.Equal(".", tokens[4])
+	assert.Equal(5, len(tokens))
+
+	// Compound words with hyphen + gender ending
+	tokens = ttokenize(dat, w, "Die Kosovo-Albaner/innen und Kosovo-Albaner:innen trafen sich.")
+	assert.Equal("Die", tokens[0])
+	assert.Equal("Kosovo-Albaner/innen", tokens[1])
+	assert.Equal("und", tokens[2])
+	assert.Equal("Kosovo-Albaner:innen", tokens[3])
+	assert.Equal("trafen", tokens[4])
+	assert.Equal("sich", tokens[5])
+	assert.Equal(".", tokens[6])
+	assert.Equal(7, len(tokens))
+
+	// With hyphen: Kosovo-Albaner/-innen
+	tokens = ttokenize(dat, w, "Kosovo-Albaner/-innen kamen.")
+	assert.Equal("Kosovo-Albaner/-innen", tokens[0])
+	assert.Equal("kamen", tokens[1])
+	assert.Equal(".", tokens[2])
+	assert.Equal(3, len(tokens))
+
+	// Mann/Frau should be separated (capital F = standalone word, not suffix)
+	tokens = ttokenize(dat, w, "Ob Mann/Frau das will?")
+	assert.Equal("Ob", tokens[0])
+	assert.Equal("Mann", tokens[1])
+	assert.Equal("/", tokens[2])
+	assert.Equal("Frau", tokens[3])
+	assert.Equal("das", tokens[4])
+	assert.Equal("will", tokens[5])
+	assert.Equal("?", tokens[6])
+	assert.Equal(7, len(tokens))
+
+	// Also Männer/Frauen
+	tokens = ttokenize(dat, w, "Männer/Frauen sind willkommen.")
+	assert.Equal("Männer", tokens[0])
+	assert.Equal("/", tokens[1])
+	assert.Equal("Frauen", tokens[2])
+	assert.Equal("sind", tokens[3])
+	assert.Equal("willkommen", tokens[4])
+	assert.Equal(".", tokens[5])
+	assert.Equal(6, len(tokens))
+
+	// /frau should only be joined when word ends in "mann"
+	// "xxx/frau" where xxx doesn't end in "mann" should be SEPARATED
+	tokens = ttokenize(dat, w, "xxx/frau sollte getrennt sein.")
+	assert.Equal("xxx", tokens[0])
+	assert.Equal("/", tokens[1])
+	assert.Equal("frau", tokens[2])
+	assert.Equal("sollte", tokens[3])
+	assert.Equal("getrennt", tokens[4])
+	assert.Equal("sein", tokens[5])
+	assert.Equal(".", tokens[6])
+	assert.Equal(7, len(tokens))
+
+	// But Kaufmann/frau should be one token (word ends in "mann")
+	tokens = ttokenize(dat, w, "Kaufmann/frau ist ein Beruf.")
+	assert.Equal("Kaufmann/frau", tokens[0])
+	assert.Equal("ist", tokens[1])
+	assert.Equal("ein", tokens[2])
+	assert.Equal("Beruf", tokens[3])
+	assert.Equal(".", tokens[4])
+	assert.Equal(5, len(tokens))
+
+	// And Fachmann/-frau should be one token
+	tokens = ttokenize(dat, w, "Fachmann/-frau gesucht")
+	assert.Equal("Fachmann/-frau", tokens[0])
+	assert.Equal("gesucht", tokens[1])
+	assert.Equal(2, len(tokens))
+
+	// Geschäftsmann/frau should also be one token
+	tokens = ttokenize(dat, w, "Ein Geschäftsmann/frau wird gesucht.")
+	assert.Equal("Ein", tokens[0])
+	assert.Equal("Geschäftsmann/frau", tokens[1])
+	assert.Equal("wird", tokens[2])
+	assert.Equal("gesucht", tokens[3])
+	assert.Equal(".", tokens[4])
+	assert.Equal(5, len(tokens))
+
+	// Genderstern forms (these should already work via existing rules)
+	tokens = ttokenize(dat, w, "Schüler*innen und Lehrer*innen")
+	assert.Equal("Schüler*innen", tokens[0])
+	assert.Equal("und", tokens[1])
+	assert.Equal("Lehrer*innen", tokens[2])
+	assert.Equal(3, len(tokens))
+
+	// Mixed sentence with various gender forms
+	tokens = ttokenize(dat, w, "Die Schüler:innen, Lehrer/innen und Mitarbeiter(innen) sowie Kaufmann/-frau trafen sich.")
+	assert.Equal("Die", tokens[0])
+	assert.Equal("Schüler:innen", tokens[1])
+	assert.Equal(",", tokens[2])
+	assert.Equal("Lehrer/innen", tokens[3])
+	assert.Equal("und", tokens[4])
+	assert.Equal("Mitarbeiter(innen)", tokens[5])
+	assert.Equal("sowie", tokens[6])
+	assert.Equal("Kaufmann/-frau", tokens[7])
+	assert.Equal("trafen", tokens[8])
+	assert.Equal("sich", tokens[9])
+	assert.Equal(".", tokens[10])
+	assert.Equal(11, len(tokens))
+
+	tokens = ttokenize(dat, w, "Nutzer/Innenarchitekt")
+	assert.Equal("Nutzer", tokens[0])
+	assert.Equal("/", tokens[1])
+	assert.Equal("Innenarchitekt", tokens[2])
+	assert.Equal(3, len(tokens))
+
+	tokens = ttokenize(dat, w, "Innenminister/in")
+	assert.Equal("Innenminister/in", tokens[0])
+	assert.Equal(1, len(tokens))
+
+	tokens = ttokenize(dat, w, "Innenminister/Innenministerinnen")
+	assert.Equal("Innenminister", tokens[0])
+	assert.Equal("/", tokens[1])
+	assert.Equal("Innenministerinnen", tokens[2])
+	assert.Equal(3, len(tokens))
 
 	/*
-        DeReKo-Behaviour
-	tokens = ttokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
-	assert.Equal("'ve", tokens[1]);
-	assert.Equal("'ll", tokens[3]);
-	assert.Equal("'d", tokens[5]);
-	assert.Equal("'m", tokens[7]);
-	assert.Equal("'re", tokens[9]);
-	assert.Equal("'s", tokens[11]);
-	assert.Equal("is", tokens[12]);
-	assert.Equal("n't", tokens[13]);
-	assert.Equal(14, len(tokens));
+		        DeReKo-Behaviour
+			tokens = ttokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
+			assert.Equal("'ve", tokens[1]);
+			assert.Equal("'ll", tokens[3]);
+			assert.Equal("'d", tokens[5]);
+			assert.Equal("'m", tokens[7]);
+			assert.Equal("'re", tokens[9]);
+			assert.Equal("'s", tokens[11]);
+			assert.Equal("is", tokens[12]);
+			assert.Equal("n't", tokens[13]);
+			assert.Equal(14, len(tokens));
 
-	
-	assert.Equal(tokens[0], "Der")
-	assert.Equal(tokens[1], "alte")
-	assert.Equal(tokens[2], "Mann")
-	assert.Equal(len(tokens), 3)
 
-	/*
-		@Test
-		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
-				DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
-		}
+			assert.Equal(tokens[0], "Der")
+			assert.Equal(tokens[1], "alte")
+			assert.Equal(tokens[2], "Mann")
+			assert.Equal(len(tokens), 3)
 
-		@Test
-		public void frenchTokenizerKnowsFrenchAbbreviations () {
-				DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
-				tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
-				assert.Equal("Approx.", tokens[0]);
-				assert.Equal("juill.", tokens[2]);
-				assert.Equal("prof.", tokens[5]);
-				assert.Equal("exerc.", tokens[15]);
-				assert.Equal("no.", tokens[16]);
-				assert.Equal("pp.", tokens[21]);
-		}
+			/*
+				@Test
+				public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
+						DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
+				}
 
-		@Test
-		public void frenchTokenizerKnowsFrenchContractions () {
-				DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
-				tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
-				assert.Equal("J'", tokens[0]);
-				assert.Equal("j'", tokens[2]);
-				assert.Equal("qu'", tokens[4]);
-				assert.Equal("d'", tokens[6]);
-				assert.Equal("jusqu'", tokens[8]);
-				assert.Equal("Aujourd'hui", tokens[10]);
-				assert.Equal("D'", tokens[11]); // ’
-				assert.Equal("Quelqu'un", tokens[13]); // ’
-				assert.Equal("Presqu'île", tokens[14]); // ’
-		}
+				@Test
+				public void frenchTokenizerKnowsFrenchAbbreviations () {
+						DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
+						tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
+						assert.Equal("Approx.", tokens[0]);
+						assert.Equal("juill.", tokens[2]);
+						assert.Equal("prof.", tokens[5]);
+						assert.Equal("exerc.", tokens[15]);
+						assert.Equal("no.", tokens[16]);
+						assert.Equal("pp.", tokens[21]);
+				}
 
-		@Test
-		public void frenchTokenizerKnowsFrenchClitics () {
-				DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
-				tokens = tokenize(dat, w, "suis-je sont-elles ")
-				assert.Equal("suis", tokens[0]);
-				assert.Equal("-je", tokens[1]);
-				assert.Equal("sont", tokens[2]);
-				assert.Equal("-elles", tokens[3]);
-		}
+				@Test
+				public void frenchTokenizerKnowsFrenchContractions () {
+						DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
+						tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
+						assert.Equal("J'", tokens[0]);
+						assert.Equal("j'", tokens[2]);
+						assert.Equal("qu'", tokens[4]);
+						assert.Equal("d'", tokens[6]);
+						assert.Equal("jusqu'", tokens[8]);
+						assert.Equal("Aujourd'hui", tokens[10]);
+						assert.Equal("D'", tokens[11]); // ’
+						assert.Equal("Quelqu'un", tokens[13]); // ’
+						assert.Equal("Presqu'île", tokens[14]); // ’
+				}
 
-		@Test
-		public void testEnglishTokenizerScienceAbbreviations () {
-				DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
-				tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
-				assert.Equal("Approx.", tokens[0]);
-				assert.Equal("in", tokens[1]);
-				assert.Equal("Sept.", tokens[2]);
-				assert.Equal("1954", tokens[3]);
-				assert.Equal(",", tokens[4]);
-				assert.Equal("Assoc.", tokens[5]);
-				assert.Equal("Prof.", tokens[6]);
-				assert.Equal("Dr.", tokens[7]);
-				assert.Equal("R.", tokens[8]);
-				assert.Equal("J.", tokens[9]);
-				assert.Equal("Ewing", tokens[10]);
-				assert.Equal("reviewed", tokens[11]);
-				assert.Equal("articles", tokens[12]);
-				assert.Equal("on", tokens[13]);
-				assert.Equal("Enzymol.", tokens[14]);
-				assert.Equal("Bacteriol.", tokens[15]);
-				assert.Equal("effects", tokens[16]);
-				assert.Equal("later", tokens[17]);
-				assert.Equal("published", tokens[18]);
-				assert.Equal("in", tokens[19]);
-				assert.Equal("Nutr.", tokens[20]);
-				assert.Equal("Rheumatol.", tokens[21]);
-				assert.Equal("No.", tokens[22]);
-				assert.Equal("12", tokens[23]);
-				assert.Equal("and", tokens[24]);
-				assert.Equal("Nº.", tokens[25]);
-				assert.Equal("13.", tokens[26]);
-				assert.Equal(",", tokens[27]);
-				assert.Equal("pp.", tokens[28]);
-				assert.Equal("17-18", tokens[29]);
-				assert.Equal(".", tokens[30]);
-		}
+				@Test
+				public void frenchTokenizerKnowsFrenchClitics () {
+						DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
+						tokens = tokenize(dat, w, "suis-je sont-elles ")
+						assert.Equal("suis", tokens[0]);
+						assert.Equal("-je", tokens[1]);
+						assert.Equal("sont", tokens[2]);
+						assert.Equal("-elles", tokens[3]);
+				}
 
-		@Test
-		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
-				DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
-				tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
-				assert.Equal("I.", tokens[1]);
-				assert.Equal("I", tokens[8]);
-				assert.Equal(".", tokens[9]);
-				assert.Equal("I", tokens[12]);
-				assert.Equal(".", tokens[13]);
-		}
+				@Test
+				public void testEnglishTokenizerScienceAbbreviations () {
+						DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
+						tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
+						assert.Equal("Approx.", tokens[0]);
+						assert.Equal("in", tokens[1]);
+						assert.Equal("Sept.", tokens[2]);
+						assert.Equal("1954", tokens[3]);
+						assert.Equal(",", tokens[4]);
+						assert.Equal("Assoc.", tokens[5]);
+						assert.Equal("Prof.", tokens[6]);
+						assert.Equal("Dr.", tokens[7]);
+						assert.Equal("R.", tokens[8]);
+						assert.Equal("J.", tokens[9]);
+						assert.Equal("Ewing", tokens[10]);
+						assert.Equal("reviewed", tokens[11]);
+						assert.Equal("articles", tokens[12]);
+						assert.Equal("on", tokens[13]);
+						assert.Equal("Enzymol.", tokens[14]);
+						assert.Equal("Bacteriol.", tokens[15]);
+						assert.Equal("effects", tokens[16]);
+						assert.Equal("later", tokens[17]);
+						assert.Equal("published", tokens[18]);
+						assert.Equal("in", tokens[19]);
+						assert.Equal("Nutr.", tokens[20]);
+						assert.Equal("Rheumatol.", tokens[21]);
+						assert.Equal("No.", tokens[22]);
+						assert.Equal("12", tokens[23]);
+						assert.Equal("and", tokens[24]);
+						assert.Equal("Nº.", tokens[25]);
+						assert.Equal("13.", tokens[26]);
+						assert.Equal(",", tokens[27]);
+						assert.Equal("pp.", tokens[28]);
+						assert.Equal("17-18", tokens[29]);
+						assert.Equal(".", tokens[30]);
+				}
 
-		@Test
-		public void testZipOuputArchive () {
+				@Test
+				public void englishTokenizerCanGuessWhetherIIsAbbrev () {
+						DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
+						tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
+						assert.Equal("I.", tokens[1]);
+						assert.Equal("I", tokens[8]);
+						assert.Equal(".", tokens[9]);
+						assert.Equal("I", tokens[12]);
+						assert.Equal(".", tokens[13]);
+				}
 
-				final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
-				System.setOut(new PrintStream(clearOut));
-				tokens = tokenize(dat, w, "Archive:  ich/bin/ein.zip\n")
-				assert.Equal(0, len(tokens));
-		}
+				@Test
+				public void testZipOuputArchive () {
+
+						final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
+						System.setOut(new PrintStream(clearOut));
+						tokens = tokenize(dat, w, "Archive:  ich/bin/ein.zip\n")
+						assert.Equal(0, len(tokens));
+				}
 	*/
 	/*