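// Tests for the matrix representation of the datok tokenizer:
// construction from foma files, (de)serialization, transduction,
// token and sentence splitting, and a transduction benchmark.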
package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

var mat_de, mat_en *MatrixTokenizer

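// The assertions below rely on the test helpers ttokenize and ttokenizeStr,
// which are assumed to be defined in a sibling test file of this package.
// A minimal sketch of the expected behaviour (names and signatures inferred
// from their use in this file):
//
//	// ttokenize transduces str through tok and returns the tokens,
//	// skipping empty lines (i.e. sentence boundaries).
//	func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
//		w.Reset()
//		if !tok.Transduce(strings.NewReader(str), w) {
//			return []string{}
//		}
//		tokens := regexp.MustCompile("\n+").Split(w.String(), -1)
//		return tokens[:len(tokens)-1]
//	}
//
//	// ttokenizeStr joins the tokens of str with single newlines.
//	func ttokenizeStr(tok Tokenizer, str string) string {
//		w := bytes.NewBuffer(make([]byte, 0, 2048))
//		return strings.Join(ttokenize(tok, w, str), "\n")
//	}
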
func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}

func TestMatrixSimpleString(t *testing.T) {
	assert := assert.New(t)
	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	mat := tok.ToMatrix()

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "ibauamt")
	assert.Equal("i", tokens[0])
	assert.Equal("bauamt", tokens[1])

	tokens = ttokenize(mat, w, "ibbauamt")
	assert.Equal("i", tokens[0])

	assert.Equal("b", tokens[1])
	assert.Equal("bauamt", tokens[2])

	tokens = ttokenize(mat, w, "bau")
	assert.Equal("bau", tokens[0])

	tokens = ttokenize(mat, w, "baum")
	assert.Equal("bau", tokens[0])
	assert.Equal("m", tokens[1])

	tokens = ttokenize(mat, w, "baudibauamt")
	assert.Equal("bau", tokens[0])
	assert.Equal("d", tokens[1])
	assert.Equal("i", tokens[2])
	assert.Equal("bauamt", tokens[3])
}

func TestMatrixCliticRule(t *testing.T) {
	assert := assert.New(t)
	mat := LoadMatrixFile("testdata/clitic_test.matok")

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "ibauamt")
	assert.Equal("ibauamt", tokens[0])

	exstring := "dead. "

	tokens = ttokenize(mat, w, exstring)
	assert.Equal("dead", tokens[0])
	assert.Equal(".", tokens[1])

	w.Reset()
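	// TOKENS|SENTENCES presumably instructs the TokenWriter to emit both
	// token and sentence boundaries: one token per line, with an empty
	// line marking the end of a sentence (cf. the assertion below).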
	tws := NewTokenWriter(w, TOKENS|SENTENCES)

	assert.True(mat.TransduceTokenWriter(
		strings.NewReader(exstring), tws),
	)
	tws.Flush()

	matStr := w.String()
	assert.Equal("dead\n.\n\n\n\n\n\n\n", matStr)
}

func TestMatrixReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.Equal(ttokenizeStr(mat, "bau"), "bau")
	assert.Equal(ttokenizeStr(mat, "bad"), "bad")
	assert.Equal(ttokenizeStr(mat, "wald gehen"), "wald\ngehen")
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.Equal(ttokenizeStr(mat2, "bau"), "bau")
	assert.Equal(ttokenizeStr(mat2, "bad"), "bad")
	assert.Equal(ttokenizeStr(mat2, "wald gehen"), "wald\ngehen")
}

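// writeMatrixToFile is a hypothetical helper (not part of the package API)
// sketching how the WriteTo method exercised above might be used to persist
// a matrix tokenizer to disk, e.g. as a file that LoadMatrixFile can read
// back in.
func writeMatrixToFile(mat *MatrixTokenizer, path string) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()

	// WriteTo follows the io.WriterTo convention and reports the
	// number of bytes written.
	_, err = mat.WriteTo(f)
	return err
}
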
func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states, which is why it
	// no longer works correctly.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}

func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}

func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat_de.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}

func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string
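	// A sentence boundary is encoded as an empty output line, so
	// splitting the output on "\n\n" yields the sentences plus a
	// trailing element for the end of the text.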

	// testSentSplitterSimple
	assert.True(mat_de.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("\"Alter!\", sagte er: \"Komm nicht wieder!\" Geh!!! \"Lass!\" Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 5)
	assert.Equal("\"\nAlter\n!\n\"\n,\nsagte\ner\n:\n\"\nKomm\nnicht\nwieder\n!\n\"", sentences[0])
	assert.Equal("Geh\n!!!", sentences[1])
	assert.Equal("\"\nLass\n!\n\"", sentences[2])
	assert.Equal("Dann\nging\ner\n.", sentences[3])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("\"\nAusschalten\n!!!\n\"\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("\"\nHast\nDu\nnicht\ngehört\n???\n\"", sentences[1])

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("“Ausschalten!!!”, sagte er. «Hast Du nicht gehört???»"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("“\nAusschalten\n!!!\n”\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("«\nHast\nDu\nnicht\ngehört\n???\n»", sentences[1])

	text := `»Meinetwegen. Denkst du, daß ich darauf warte? Das fehlte noch.
Übrigens, ich kriege schon einen und vielleicht bald. Da ist mir nicht
bange. Neulich erst hat mir der kleine Ventivegni von drüben gesagt:
'Fräulein Effi, was gilt die Wette, wir sind hier noch in diesem Jahre
zu Polterabend und Hochzeit.'«

»Und was sagtest du da?«`

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 8)
	assert.Equal("Neulich\nerst\nhat\nmir\nder\nkleine\nVentivegni\nvon\ndrüben\ngesagt\n:\n'\nFräulein\nEffi\n,\nwas\ngilt\ndie\nWette\n,\nwir\nsind\nhier\nnoch\nin\ndiesem\nJahre\nzu\nPolterabend\nund\nHochzeit\n.\n'\n«", sentences[5])
	assert.Equal("»\nUnd\nwas\nsagtest\ndu\nda\n?\n«", sentences[6])

	text = `»Nun, gib dich zufrieden, ich fange schon an ... Also Baron
Innstetten!`

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
	assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])

	// Check parentheses at the end of the sentence
	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("(\nEr\nging\n.\n)", sentences[0])
	assert.Equal("Und\nkam\n(\nspäter\n)\n.", sentences[1])

	// Check parentheses and quotes at the end of the sentence
	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader("(Er sagte: \"Hallo!\") Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("(\nEr\nsagte\n:\n\"\nHallo\n!\n\"\n)", sentences[0])
	assert.Equal("Dann\nging\ner\n.", sentences[1])
}

func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`

	w.Reset()
	assert.True(mat_de.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 6)
	assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
	assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
	assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
	assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
	assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
}

func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat_de, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat_de, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat_de, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat_de, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat_de, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat_de, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat_de, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat_de, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat_de, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat_de, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat_de, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat_de, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat_de, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat_de, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat_de, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat_de, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat_de, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat_de, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat_de, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat_de, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat_de, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat_de, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat_de, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat_de, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat_de, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat_de, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat_de, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat_de, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1

	tokens = ttokenize(mat_de, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat_de, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat_de, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat_de, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat_de, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat_de, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat_de, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat_de, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat_de, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat_de, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat_de, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat_de, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat_de, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat_de, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat_de, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat_de, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// WASTE example
	tokens = ttokenize(mat_de, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IP test
	tokens = ttokenize(mat_de, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat_de, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&GT;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	// Plusampersand compounds (1)
	tokens = ttokenize(mat_de, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "2G+-Regel")
	assert.Equal(tokens[2], "soll")
	assert.Equal(tokens[3], "weitere")
	assert.Equal(tokens[4], "Covid-19-Erkrankungen")
	assert.Equal(tokens[5], "reduzieren")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	// Plusampersand compounds (2)
	tokens = ttokenize(mat_de, w, "Der Neu-C++-Programmierer.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Neu-C++-Programmierer")
	assert.Equal(tokens[2], ".")
	assert.Equal(3, len(tokens))

	// z.B.
	tokens = ttokenize(mat_de, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
	assert.Equal(tokens[0], "Dies")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "z.")
	assert.Equal(tokens[3], "B.")
	assert.Equal(tokens[4], "zwei")
	assert.Equal(tokens[5], "Wörter")
	assert.Equal(tokens[6], "-")
	assert.Equal(tokens[7], "z.")
	assert.Equal(tokens[8], "B.")
	assert.Equal(tokens[9], "auch")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// Single quote handling
	tokens = ttokenize(mat_de, w, "Es heißt 'Leitungssportteams' und nicht anders.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "heißt")
	assert.Equal(tokens[2], "'")
	assert.Equal(tokens[3], "Leitungssportteams")
	assert.Equal(tokens[4], "'")
	assert.Equal(tokens[5], "und")
	assert.Equal(tokens[6], "nicht")
	assert.Equal(tokens[7], "anders")
	assert.Equal(tokens[8], ".")
	assert.Equal(9, len(tokens))

	// Apostrophe handling
	tokens = ttokenize(mat_de, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "Nils’")
	assert.Equal(tokens[3], "Einkaufskorb")
	assert.Equal(tokens[4], "bei")
	assert.Equal(tokens[5], "McDonald's")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))
}

func TestMatrixFullTokenizerTokenSplitterEN(t *testing.T) {
	assert := assert.New(t)

	if mat_en == nil {
		mat_en = LoadMatrixFile("testdata/tokenizer_en.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testEnglishTokenizerScienceAbbreviations
	tokens = ttokenize(mat_en, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
	assert.Equal("Approx.", tokens[0])
	assert.Equal("in", tokens[1])
	assert.Equal("Sept.", tokens[2])
	assert.Equal("1954", tokens[3])
	assert.Equal(",", tokens[4])
	assert.Equal("Assoc.", tokens[5])
	assert.Equal("Prof.", tokens[6])
	assert.Equal("Dr.", tokens[7])
	assert.Equal("R.", tokens[8])
	assert.Equal("J.", tokens[9])
	assert.Equal("Ewing", tokens[10])
	assert.Equal("reviewed", tokens[11])
	assert.Equal("articles", tokens[12])
	assert.Equal("on", tokens[13])
	assert.Equal("Enzymol.", tokens[14])
	assert.Equal("Bacteriol.", tokens[15])
	assert.Equal("effects", tokens[16])
	assert.Equal("later", tokens[17])
	assert.Equal("published", tokens[18])
	assert.Equal("in", tokens[19])
	assert.Equal("Nutr.", tokens[20])
	assert.Equal("Rheumatol.", tokens[21])
	assert.Equal("No.", tokens[22])
	assert.Equal("12", tokens[23])
	assert.Equal("and", tokens[24])
	assert.Equal("Nº.", tokens[25])
	assert.Equal("13.", tokens[26])
	assert.Equal(",", tokens[27])
	assert.Equal("pp.", tokens[28])
	assert.Equal("17-18", tokens[29])
	assert.Equal(".", tokens[30])

	// englishTokenizerCanGuessWhetherIIsAbbrev
	tokens = ttokenize(mat_en, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
	assert.Equal("I.", tokens[1])
	assert.Equal("I", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal("I", tokens[12])
	assert.Equal(".", tokens[13])

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

func TestMatrixEmoticons(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat_de, w, ":-* ;) :)) :*( ^___^ T__T ^^; -_-;;; -_-^")
	assert.Equal(tokens[0], ":-*")
	assert.Equal(tokens[1], ";)")
	assert.Equal(tokens[2], ":))")
	assert.Equal(tokens[3], ":*(")
	assert.Equal(tokens[4], "^___^")
	assert.Equal(tokens[5], "T__T")
	assert.Equal(tokens[6], "^^;")
	assert.Equal(tokens[7], "-_-;;;")
	assert.Equal(tokens[8], "-_-^")
	assert.Equal(len(tokens), 9)

	tokens = ttokenize(mat_de, w, "das -> Lustig<-!")
	assert.Equal("das", tokens[0])
	assert.Equal("->", tokens[1])
	assert.Equal("Lustig", tokens[2])
	assert.Equal("<-", tokens[3])
	assert.Equal("!", tokens[4])
	assert.Equal(5, len(tokens))
}

func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat_de, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat_de, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat_de, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat_de, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))

	tokens = ttokenize(mat_de, w, "das <?robot xgh ?> <!-- hm hm --> <![CDATA[ cdata ]]> <br />")
	assert.Equal("das", tokens[0])
	assert.Equal("<?robot", tokens[1])
	assert.Equal("xgh", tokens[2])
	assert.Equal("?>", tokens[3])
	assert.Equal("<!--", tokens[4])
	assert.Equal("hm", tokens[5])
	assert.Equal("hm", tokens[6])
	assert.Equal("-->", tokens[7])
	assert.Equal("<![CDATA[", tokens[8])
	assert.Equal("cdata", tokens[9])
	assert.Equal("]]>", tokens[10])
	assert.Equal("<br />", tokens[11])
	assert.Equal(12, len(tokens))
}

func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat_de.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat_de.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}

func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

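	// \x04 (EOT) apparently acts as a text delimiter: redundant empty
	// lines around it are collapsed in the output, and each text ends
	// with the usual final newlines.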
	assert.True(mat_de.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}

func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat_de.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}

func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat_de == nil {
		mat_de = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat_de)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat_de.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}

func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}
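
// The benchmark can be run in isolation with the standard Go tooling:
//
//	go test -bench=BenchmarkMatrixTransduce -benchmem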