package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

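// Shared sample text used by TestMatokDatokEquivalence and the benchmark
// below: it mixes abbreviations, hosts, e-mail addresses, URLs, IP
// addresses, percentages, dates, file names, censored words, quoted
// speech, ordinals, omission words, and clitic contractions.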
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

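// mat caches the full tokenizer matrix; the tests below load it lazily so
// that "testdata/tokenizer.matok" is read at most once per test run.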
var mat *MatrixTokenizer

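// TestMatrixFullTokenizer converts the simple test transducer to a matrix
// and checks basic tokenization of whitespace- and punctuation-separated
// input. Tokens are emitted one per line on the writer.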
func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}

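// TestMatrixReadWriteTokenizer serializes a matrix with WriteTo, parses it
// back with ParseMatrix, and checks that the round trip preserves both the
// internals and the matching behavior.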
func TestMatrixReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.True(tmatch(mat, "bau"))
	assert.True(tmatch(mat, "bad"))
	assert.True(tmatch(mat, "wald gehen"))
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.True(tmatch(mat2, "bau"))
	assert.True(tmatch(mat2, "bad"))
	assert.True(tmatch(mat2, "wald gehen"))
}

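// TestMatrixIgnorableMCS loads a transducer whose multi-character symbol
// appears in sigma but not in the net, and checks that transduction still
// segments the input as expected.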
func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states. That's why it
	// no longer works correctly.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}

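// xTestMatrixReadWriteFullTokenizer is deliberately disabled: the leading
// "x" keeps it from matching the "Test" prefix the Go test runner looks
// for.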
func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}

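// TestMatrixFullTokenizerTransduce runs the full German tokenizer over
// short inputs, checking token and sentence boundaries (empty lines) in
// the raw output.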
func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}

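// TestMatrixFullTokenizerMatrixSentenceSplitter checks the sentence
// boundaries the tokenizer emits (an empty line separates sentences), in
// particular that abbreviations, hosts, e-mail addresses, URLs, IPs,
// percentages, dates, and file names do not end a sentence.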
func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Alter!\", sagte er: \"Komm nicht wieder!\" Geh!!! \"Lass!\" Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 5)
	assert.Equal("\"\nAlter\n!\n\"\n,\nsagte\ner\n:\n\"\nKomm\nnicht\nwieder\n!\n\"", sentences[0])
	assert.Equal("Geh\n!!!", sentences[1])
	assert.Equal("\"\nLass\n!\n\"", sentences[2])
	assert.Equal("Dann\nging\ner\n.", sentences[3])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("\"\nAusschalten\n!!!\n\"\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("\"\nHast\nDu\nnicht\ngehört\n???\n\"", sentences[1])
}

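// TestMatrixFullTokenizerTokenSplitter ports the KorAP-Tokenizer token
// splitting suite: abbreviations, hosts, e-mail addresses, URLs, dashes,
// dates, emoticons, references, censored words, file names, punctuation,
// clitics, and omission words. Deviations from KorAP-Tokenizer are marked
// inline.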
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1

	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// waste example
	tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IP test
	tokens = ttokenize(mat, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&GT;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	// Plusampersand compounds (1)
	tokens = ttokenize(mat, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "2G+-Regel")
	assert.Equal(tokens[2], "soll")
	assert.Equal(tokens[3], "weitere")
	assert.Equal(tokens[4], "Covid-19-Erkrankungen")
	assert.Equal(tokens[5], "reduzieren")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	// Plusampersand compounds (2)
	tokens = ttokenize(mat, w, "Der Neu-C++-Programmierer.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Neu-C++-Programmierer")
	assert.Equal(tokens[2], ".")
	assert.Equal(3, len(tokens))

	// z.B.
	tokens = ttokenize(mat, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
	assert.Equal(tokens[0], "Dies")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "z.")
	assert.Equal(tokens[3], "B.")
	assert.Equal(tokens[4], "zwei")
	assert.Equal(tokens[5], "Wörter")
	assert.Equal(tokens[6], "-")
	assert.Equal(tokens[7], "z.")
	assert.Equal(tokens[8], "B.")
	assert.Equal(tokens[9], "auch")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// Single quote handling
	tokens = ttokenize(mat, w, "Es heißt 'Leitungssportteams' und nicht anders.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "heißt")
	assert.Equal(tokens[2], "'")
	assert.Equal(tokens[3], "Leitungssportteams")
	assert.Equal(tokens[4], "'")
	assert.Equal(tokens[5], "und")
	assert.Equal(tokens[6], "nicht")
	assert.Equal(tokens[7], "anders")
	assert.Equal(tokens[8], ".")
	assert.Equal(9, len(tokens))

	// Apostrophe handling
	tokens = ttokenize(mat, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "Nils’")
	assert.Equal(tokens[3], "Einkaufskorb")
	assert.Equal(tokens[4], "bei")
	assert.Equal(tokens[5], "McDonald's")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

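// TestMatrixFullTokenizerXML checks that markup, including tags with
// attributes and self-closing tags, is kept together as single tokens.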
func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))
}

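// TestMatokDatokEquivalence transduces the shared sample text with both the
// double-array (datok) and the matrix (matok) representation and expects
// byte-identical output.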
func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

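// TestMatrixFullTokenizerCallbackTransduce transduces a two-sentence input
// and compares the complete newline-separated output.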
func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}

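// TestMatrixFullTokenizerTextTreatment checks text-delimiter handling:
// an EOT character (0x04) ends a text, and redundant empty lines are
// collapsed in the output.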
func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}

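// TestMatrixFullTokenizerLongText streams a longer multi-paragraph text
// (the Project Gutenberg header of Effi Briest) through the tokenizer to
// make sure longer inputs transduce without failure.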
func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}

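// TestMatrixTrimming checks that leading whitespace is trimmed and does
// not produce empty tokens.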
func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}

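// BenchmarkMatrixTransduce measures matrix transduction throughput over
// the shared sample text, reusing the reader and writer between
// iterations.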
func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}
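
// ExampleMatrixTokenizer_Transduce is a minimal usage sketch added for
// documentation; it is not part of the original suite and deliberately
// omits an "// Output:" comment, so it is compiled but not executed.
// It mirrors the pattern of the tests above: load a compiled matrix
// tokenizer and stream text through it. Tokens are emitted one per line;
// an empty line marks a sentence boundary.
func ExampleMatrixTokenizer_Transduce() {
	mat := LoadMatrixFile("testdata/tokenizer.matok")
	w := bytes.NewBuffer(make([]byte, 0, 2048))
	mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w)
	fmt.Print(w.String())
}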