blob: 49a15239bbe89127549f286ffb3ee050465df45e [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bytes"
5 "strings"
6 "testing"
7
8 "github.com/stretchr/testify/assert"
9)
10
11func TestFullTokenizerMatrix(t *testing.T) {
12 assert := assert.New(t)
13 foma := LoadFomaFile("testdata/simpletok.fst")
14 assert.NotNil(foma)
15
16 mat := foma.ToMatrix()
17
18 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
19 b := make([]byte, 0, 2048)
20 w := bytes.NewBuffer(b)
21 var tokens []string
22 mat.Transduce(r, w)
23 tokens = strings.Split(w.String(), "\n")
Akron5c82a922021-09-24 19:11:29 +020024 assert.Equal(len(tokens), 10)
Akron1c34ce62021-09-23 23:27:39 +020025 assert.Equal("wald", tokens[0])
26 assert.Equal("gehen", tokens[1])
27 assert.Equal("Da", tokens[2])
28 assert.Equal("kann", tokens[3])
29 assert.Equal("man", tokens[4])
30 assert.Equal("was", tokens[5])
31 assert.Equal("\"erleben\"", tokens[6])
32 assert.Equal("!", tokens[7])
Akron5c82a922021-09-24 19:11:29 +020033
34 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
35 w.Reset()
36 mat.Transduce(r, w)
37 tokens = strings.Split(w.String(), "\n")
38 assert.Equal("In", tokens[0])
39 assert.Equal("den", tokens[1])
40 assert.Equal("Wald", tokens[2])
41 assert.Equal("gehen", tokens[3])
42 assert.Equal("?", tokens[4])
43 assert.Equal("--", tokens[5])
44
45 r = strings.NewReader(" g? -- D")
46 w.Reset()
47 mat.Transduce(r, w)
48 tokens = strings.Split(w.String(), "\n")
49 assert.Equal("g", tokens[0])
50 assert.Equal("?", tokens[1])
51 assert.Equal("--", tokens[2])
52 assert.Equal("D", tokens[3])
53 assert.Equal("", tokens[4])
54 assert.Equal("", tokens[5])
55 assert.Equal(6, len(tokens))
56}
57
58func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
59 assert := assert.New(t)
60 foma := LoadFomaFile("testdata/tokenizer.fst")
61 assert.NotNil(foma)
62
63 mat := foma.ToMatrix()
64
65 b := make([]byte, 0, 2048)
66 w := bytes.NewBuffer(b)
67 var sentences []string
68
69 // testSentSplitterSimple
70 assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
71 sentences = strings.Split(w.String(), "\n\n")
72
73 assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
74 assert.Equal("Der\nalte\nMann\n.", sentences[0])
75 assert.Equal("", sentences[1])
76 assert.Equal(len(sentences), 2)
77
78 w.Reset()
79 assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
80 sentences = strings.Split(w.String(), "\n\n")
81 assert.Equal(len(sentences), 2)
82 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
83 assert.Equal("", sentences[1])
84
85 /*
86
87 w.Reset()
88 assert.True(mat.Transduce(strings.NewReader(""), w))
89 sentences = strings.Split(w.String(), "\n\n")
90 assert.Equal(len(sentences), 1)
91 assert.Equal("\n", sentences[0])
92
93 w.Reset()
94 assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
95 sentences = strings.Split(w.String(), "\n\n")
96 assert.Equal(len(sentences), 2)
97
98 w.Reset()
99 assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
100 sentences = strings.Split(w.String(), "\n\n")
101 assert.Equal(len(sentences), 2)
102
103 w.Reset()
104 assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
105 sentences = strings.Split(w.String(), "\n\n")
106 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
107 assert.Equal("", sentences[1])
108 assert.Equal(len(sentences), 2)
109
110 w.Reset()
111 assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
112 sentences = strings.Split(w.String(), "\n\n")
113 assert.Equal("", sentences[1])
114 assert.Equal(len(sentences), 2)
115
116 w.Reset()
117 assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
118 sentences = strings.Split(w.String(), "\n\n")
119 assert.Equal(len(sentences), 2)
120
121 w.Reset()
122 assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
123 sentences = strings.Split(w.String(), "\n\n")
124 assert.Equal(len(sentences), 2)
125
126 w.Reset()
127 assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
128 sentences = strings.Split(w.String(), "\n\n")
129 assert.Equal(len(sentences), 2)
130 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
131 assert.Equal("", sentences[1])
132
133 w.Reset()
134 assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
135 sentences = strings.Split(w.String(), "\n\n")
136 assert.Equal(len(sentences), 3)
137 assert.Equal("Ausschalten\n!!!", sentences[0])
138 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
139 assert.Equal("", sentences[2])
140
141 w.Reset()
142 assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
143 sentences = strings.Split(w.String(), "\n\n")
144 assert.Equal(len(sentences), 2)
145 */
146 /*
147 Test:
148 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
149 */
Akron1c34ce62021-09-23 23:27:39 +0200150}