Support German gender-sensitive DET, ADJ, PRON endings (from KorAP-Tokenizer)
Change-Id: I8f20ecb913c0fe514b5936ab43287ca616695f16
diff --git a/Changes b/Changes
index 02f5d40..235124b 100644
--- a/Changes
+++ b/Changes
@@ -1,38 +1,41 @@
-0.3.1 2026-02-04
- - Introduce hyphenated abbreviations in german tokenizer.
- - Support Wikipedia templates.
- - Introduced multiple gender forms for nouns
- in german tokenizer.
+0.3.1 2026-02-11
+ - Introduce hyphenated abbreviations in German tokenizer.
+ - Support Wikipedia templates.
+ - Introduced multiple gender forms for nouns
+ in German tokenizer.
+ (from KorAP-Tokenizer)
+ - Added short forms for determiners, adjectives and pronouns:
+ "eine(n)", "gute:r", "ihm/r", "diese(r)", "ein(e)"
0.2.2 2023-09-06
- - Fix behaviour for end of text character positions
- when no end of sentence occured before.
+ - Fix behaviour for end of text character positions
+ when no end of sentence occurred before.
0.2.1 2023-09-05
- - Add english tokenizer.
- - Fix buffer bug.
- - Improve Readme.
- - Minor performance improvements.
+ - Add english tokenizer.
+ - Fix buffer bug.
+ - Improve Readme.
+ - Minor performance improvements.
0.1.7 2023-02-28
- - Add dependabot checks.
- - Add update command.
+ - Add dependabot checks.
+ - Add update command.
0.1.6 2022-04-14
- - Rename TOKEN_SYMBOL to TOKEN_BOUND.
+ - Rename TOKEN_SYMBOL to TOKEN_BOUND.
0.1.5 2022-03-28
- - Improve Emoticon-List.
+ - Improve Emoticon-List.
0.1.4 2022-03-27
- - Improved handling of ellipsis.
- - Make algorithm more robust to nevere fail.
- - Remove match option.
+ - Improved handling of ellipsis.
+ - Make algorithm more robust to never fail.
+ - Remove match option.
0.1.3 2022-03-08
- - Introduced refined handling of sentences including speech.
+ - Introduced refined handling of sentences including speech.
0.1.2 2021-12-07
- - Improve performance of rune to symbol conversion in transduction
- method.
- - Support Plusampersand word list in compounds.
\ No newline at end of file
+ - Improve performance of rune to symbol conversion in transduction
+ method.
+ - Support Plusampersand word list in compounds.
diff --git a/datok_test.go b/datok_test.go
index e18ba0a..aac4b86 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -1,6 +1,7 @@
package datok
import (
+ "bufio"
"bytes"
"fmt"
"os"
@@ -31,6 +32,28 @@
return tokens[:len(tokens)-1]
}
+func ttokenLines(t *testing.T, path string) []string {
+ f, err := os.Open(path)
+ if err != nil {
+ t.Fatalf("failed to open %s: %v", path, err)
+ }
+ defer f.Close()
+
+ lines := []string{}
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line == "" || strings.HasPrefix(line, "#") {
+ continue
+ }
+ lines = append(lines, line)
+ }
+ if err := scanner.Err(); err != nil {
+ t.Fatalf("failed to read %s: %v", path, err)
+ }
+ return lines
+}
+
func TestDoubleArraySimpleString(t *testing.T) {
assert := assert.New(t)
// bau | bauamt
@@ -1157,6 +1180,43 @@
assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
}
+func TestDoubleArrayFullTokenizerGenderDontSplitFromFile(t *testing.T) {
+ assert := assert.New(t)
+
+ if dat == nil {
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
+ }
+ assert.NotNil(dat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+
+ for _, token := range ttokenLines(t, "testdata/de/dontsplit.txt") {
+ tokens := ttokenize(dat, w, token)
+ assert.Equalf(1, len(tokens), "should not split %q", token)
+ if len(tokens) == 1 {
+ assert.Equalf(token, tokens[0], "token surface should match for %q", token)
+ }
+ }
+}
+
+func TestDoubleArrayFullTokenizerGenderSplitFromFile(t *testing.T) {
+ assert := assert.New(t)
+
+ if dat == nil {
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
+ }
+ assert.NotNil(dat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+
+ for _, token := range ttokenLines(t, "testdata/de/split.txt") {
+ tokens := ttokenize(dat, w, token)
+ assert.Greaterf(len(tokens), 1, "should split %q", token)
+ }
+}
+
func TestDoubleArrayLoadFactor1(t *testing.T) {
assert := assert.New(t)
tok := LoadFomaFile("testdata/abbr_bench.fst")
diff --git a/src/de/gender.xfst b/src/de/gender.xfst
index 8adc35c..dd8c7de 100644
--- a/src/de/gender.xfst
+++ b/src/de/gender.xfst
@@ -8,6 +8,12 @@
! false positives in compounds like "Nutzer/Innenarchitekt".
define genderEndingsInLower [ i n ( n e n ) ];
+! Short endings for determiners, adjectives and pronouns:
+! e, n, r, s, m, es, er, em, en
+define genderShortSuffix [ [e|n|r|s|m] | [ e [s|r|m|n] ] ];
+
+define genderSeparator [":" | Slash ( %- ) | Asterisk | "_" ];
+
! Gender-sensitive endings with frau/frauen
! (lowercase only - capitalized Frau is a standalone word)
! Note: This is now only used for words ending in "mann"
@@ -15,10 +21,21 @@
! Matches: Kaufmann, Geschäftsmann, etc. but NOT just "mann"
define genderEndingsFrau [ {frau} ( {en} ) ];
-! General gender endings (only -in/-innen forms for colon, slash, parenthetical)
-! Colon forms: Nutzer:in, Nutzer:In, Nutzer:innen
-! Slash forms for -in/-innen: Nutzer/in, Nutzer/innen, Nutzer/-in, Kosovo-Albaner/innen
-define genderIn [ ":" genderEndingsIn | Slash ( %- ) genderEndingsInLower ];
+! General gender endings for -in/-innen without continuation:
+! allow all I/i variants for :, *, _; keep slash lowercase-only to
+! avoid false positives like Nutzer/Innenarchitekt.
+define genderIn [
+ [ ":" | Asterisk | "_" ] genderEndingsIn |
+ Slash ( %- ) genderEndingsInLower
+];
+
+! Short forms for DET/ADJ/PRON with colon/slash/star/underscore:
+! gute:r, ihm/r, ein*e, ein_e, diese:r
+define genderShort [ genderSeparator genderShortSuffix ];
+
+! Parenthetical short forms:
+! eine(n), ein(e)
+define genderParenShort %( ( %- ) genderShortSuffix %);
! Slash forms for -frau: Kaufmann/frau, Kaufmann/-frau, Geschäftsmann/frau
! Only applies when word ends in "mann" (with non-empty prefix before it)
@@ -31,4 +48,12 @@
! Only applies when word ends in "mann" (with non-empty prefix before it)
define genderParenFrau {mann} %( ( %- ) genderEndingsFrau %);
-define GenderEndings [ genderIn | genderFrau | genderParenIn | genderParenFrau ];
\ No newline at end of file
+! Compound continuation for forms like Lehrer:innenfortbildung.
+! For slash, restrict to lowercase in/innen so /Innen... keeps splitting
+! (e.g. Nutzer/Innenarchitekt, Innenminister/Innenministerinnen).
+define genderInComp [
+ [ ":" | Asterisk | "_" ] genderEndingsIn Char+ |
+ Slash ( %- ) genderEndingsInLower Char+
+];
+
+define GenderEndings [ genderIn | genderInComp | genderShort | genderFrau | genderParenIn | genderParenShort | genderParenFrau ];
\ No newline at end of file
diff --git a/testdata/de/dontsplit.txt b/testdata/de/dontsplit.txt
new file mode 100644
index 0000000..786c6b4
--- /dev/null
+++ b/testdata/de/dontsplit.txt
@@ -0,0 +1,46 @@
+gute:r
+diese(r)
+ihm/r
+ein:e
+jede*r
+große_r
+eines/r
+Kaufmann/frau
+Nutzer:in
+Kaufmann(-frau)
+Verkäufer/in
+Verkäufer/-in
+Verkäufer*innen
+Verkäufer_innen
+Verkäufer:innen
+Innenminister/innen
+ein(e)
+ein/e
+ein*e
+ein_e
+eines/r
+einer/s
+einem/r
+einer/m
+eine/n
+diese(n)
+diese/r
+diese:r
+diesem/r
+lehrer:innen
+schüler*innen
+Lehrer:Innen
+student_innen
+mitarbeiter:in
+kolleg/in
+eine:r
+ein:e
+jede:r
+jede*r
+jede_r
+jede/r
+eine(n)
+Lehrer:innenfortbildung
+Lehrer:Innenfortbildung
+Lehrer*innenfortbildung
+Lehrer_innenfortbildung
diff --git a/testdata/de/split.txt b/testdata/de/split.txt
new file mode 100644
index 0000000..14a0e37
--- /dev/null
+++ b/testdata/de/split.txt
@@ -0,0 +1,11 @@
+der/die
+er/sie
+und/oder
+Modell/Versuch
+Quelle:rbb
+Foto:emm
+Dies(ist)falsch
+das/ist/falsch
+mir:geht
+Vor/Nachteile
+Innenminister/Innenministerinnen