Support German gender-sensitive DET, ADJ, PRON endings (from KorAP-Tokenizer)
Change-Id: I8f20ecb913c0fe514b5936ab43287ca616695f16
diff --git a/datok_test.go b/datok_test.go
index e18ba0a..aac4b86 100644
--- a/datok_test.go
+++ b/datok_test.go
@@ -1,6 +1,7 @@
package datok
import (
+ "bufio"
"bytes"
"fmt"
"os"
@@ -31,6 +32,28 @@
return tokens[:len(tokens)-1]
}
+func ttokenLines(t *testing.T, path string) []string {
+ f, err := os.Open(path)
+ if err != nil {
+ t.Fatalf("failed to open %s: %v", path, err)
+ }
+ defer f.Close()
+
+ lines := []string{}
+ scanner := bufio.NewScanner(f)
+ for scanner.Scan() {
+ line := strings.TrimSpace(scanner.Text())
+ if line == "" || strings.HasPrefix(line, "#") {
+ continue
+ }
+ lines = append(lines, line)
+ }
+ if err := scanner.Err(); err != nil {
+ t.Fatalf("failed to read %s: %v", path, err)
+ }
+ return lines
+}
+
func TestDoubleArraySimpleString(t *testing.T) {
assert := assert.New(t)
// bau | bauamt
@@ -1157,6 +1180,43 @@
assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
}
+func TestDoubleArrayFullTokenizerGenderDontSplitFromFile(t *testing.T) {
+ assert := assert.New(t)
+
+ if dat == nil {
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
+ }
+ assert.NotNil(dat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+
+ for _, token := range ttokenLines(t, "testdata/de/dontsplit.txt") {
+ tokens := ttokenize(dat, w, token)
+ assert.Equalf(1, len(tokens), "should not split %q", token)
+ if len(tokens) == 1 {
+ assert.Equalf(token, tokens[0], "token surface should match for %q", token)
+ }
+ }
+}
+
+func TestDoubleArrayFullTokenizerGenderSplitFromFile(t *testing.T) {
+ assert := assert.New(t)
+
+ if dat == nil {
+ dat = LoadDatokFile("testdata/tokenizer_de.datok")
+ }
+ assert.NotNil(dat)
+
+ b := make([]byte, 0, 2048)
+ w := bytes.NewBuffer(b)
+
+ for _, token := range ttokenLines(t, "testdata/de/split.txt") {
+ tokens := ttokenize(dat, w, token)
+ assert.Greaterf(len(tokens), 1, "should split %q", token)
+ }
+}
+
func TestDoubleArrayLoadFactor1(t *testing.T) {
assert := assert.New(t)
tok := LoadFomaFile("testdata/abbr_bench.fst")