package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

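// The tests in this file exercise the matrix representation of the
// tokenizer transducer. Output convention, as asserted throughout: one
// token per line, with an empty line marking a sentence boundary. The
// helpers ttokenize and tmatch are assumed to be shared test utilities
// defined elsewhere in this package.

// TestFullTokenizerMatrix checks basic tokenization with the simple
// transducer from testdata/simpletok.fst, including whitespace and
// punctuation splitting.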
func TestFullTokenizerMatrix(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 10)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

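// TestReadWriteMatrixTokenizer serializes the matrix of the simple
// tokenizer with WriteTo, reparses it with ParseMatrix, and verifies
// that all fields and the matching behavior survive the round trip.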
func TestReadWriteMatrixTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.True(tmatch(mat, "bau"))
	assert.True(tmatch(mat, "bad"))
	assert.True(tmatch(mat, "wald gehen"))
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.True(tmatch(mat2, "bau"))
	assert.True(tmatch(mat2, "bad"))
	assert.True(tmatch(mat2, "wald gehen"))
}

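// TestReadWriteMatrixFullTokenizer does the same round trip for the
// full tokenizer from testdata/tokenizer.fst and checks that the
// reloaded matrix transduces identically.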
func TestReadWriteMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n", w.String())
}

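// TestFullTokenizerMatrixTransduce verifies the raw transducer output,
// i.e. the newline-separated token stream including empty lines for
// sentence boundaries.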
func TestFullTokenizerMatrixTransduce(t *testing.T) {
	assert := assert.New(t)

	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

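// TestFullTokenizerMatrixSentenceSplitter checks sentence boundary
// detection: sentences are separated by an empty line ("\n\n"), and
// abbreviations, URLs, email addresses, IP addresses, percentages,
// dates, and file names must not trigger spurious boundaries.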
func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

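// TestFullTokenizerMatrixTokenSplitter runs a battery of token-level
// cases largely mirroring the KorAP-Tokenizer test suite; comments mark
// where the behavior deliberately differs or where cases are ignored.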
func TestFullTokenizerMatrixTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOutputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOutputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*
		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}

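// TestFullTokenizerMatrixXML checks that XML tags, including tags with
// attributes, are kept as single tokens.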
func TestFullTokenizerMatrixXML(t *testing.T) {
	assert := assert.New(t)

	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))
}

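// BenchmarkTransduceMatrix measures matrix-based transduction over a
// mixed sample text covering abbreviations, URLs, emails, dates, and
// contractions.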
func BenchmarkTransduceMatrix(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
	Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
	Der Termin ist am 5.9.2018.
	Ich habe die readme.txt heruntergeladen.
	Ausschalten!!! Hast Du nicht gehört???
	Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
	Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
	Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
	Mach's macht's was'n ist's haste willste kannste biste kriegste.`
	r := strings.NewReader(s)

	foma := LoadFomaFile("testdata/tokenizer.fst")
	mat := foma.ToMatrix()

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}