package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

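// s is a test corpus shared by TestMatokDatokEquivalence and BenchmarkTransduceMatrix below.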
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

func TestFullTokenizerMatrix(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 10)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

func TestReadWriteMatrixTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.True(tmatch(mat, "bau"))
	assert.True(tmatch(mat, "bad"))
	assert.True(tmatch(mat, "wald gehen"))
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.True(tmatch(mat2, "bau"))
	assert.True(tmatch(mat2, "bad"))
	assert.True(tmatch(mat2, "wald gehen"))
}

func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states, which is why it
	// no longer works correctly.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(5, len(tokens))
}

func TestReadWriteMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n", w.String())
}

func TestFullTokenizerMatrixTransduce(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

func TestFullTokenizerMatrixTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1

	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

func TestFullTokenizerMatrixXML(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))
}

func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

func TestFullTokenizerMatrixCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	// var tokens []string

	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))
}

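// BenchmarkTransduceMatrix transduces the test corpus s with the matrix representation.
// Run it in isolation with: go test -bench=BenchmarkTransduceMatrix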
func BenchmarkTransduceMatrix(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}
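
// exampleMatrixUsage is a minimal usage sketch, not invoked by any test above:
// it loads the matrix tokenizer and transduces a short string, yielding one
// token per line with sentence boundaries marked by an empty line. It only
// relies on LoadMatrixFile and Transduce as already exercised by the tests
// in this file; the exact token output is not asserted here.
func exampleMatrixUsage() string {
	mat := LoadMatrixFile("testdata/tokenizer.matok")
	w := bytes.NewBuffer(make([]byte, 0, 256))
	mat.Transduce(strings.NewReader("Der alte Baum."), w)
	return w.String()
}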