package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)
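
// ttokenize and tmatch are shared test helpers assumed to be defined in the
// package's other test files; they are not part of this file. As a rough,
// hypothetical sketch of the behavior these tests rely on (assuming the
// matrix tokenizer type produced by ToMatrix/LoadMatrixFile is
// *MatrixTokenizer), ttokenizeSketch transduces a string through the
// tokenizer and splits the newline-separated output into tokens, treating
// runs of newlines (sentence boundaries) as a single separator. It is
// illustrative only and not used by the tests below.
func ttokenizeSketch(mat *MatrixTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	if !mat.Transduce(strings.NewReader(str), w) {
		return []string{}
	}
	// FieldsFunc collapses newline runs and drops empty fields, mirroring
	// a split on /\n+/ with trailing empty tokens removed.
	return strings.FieldsFunc(w.String(), func(r rune) bool { return r == '\n' })
}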

func TestFullTokenizerMatrix(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 10)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

func TestReadWriteMatrixTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.True(tmatch(mat, "bau"))
	assert.True(tmatch(mat, "bad"))
	assert.True(tmatch(mat, "wald gehen"))
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.True(tmatch(mat2, "bau"))
	assert.True(tmatch(mat2, "bad"))
	assert.True(tmatch(mat2, "wald gehen"))
}
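
// The WriteTo/ParseMatrix pair above is a binary serialization round trip.
// A minimal sketch of how this might be used outside of tests to persist a
// matrix to disk; the function name, path handling, and error handling are
// illustrative assumptions, not a documented API:
func saveMatrixSketch(mat *MatrixTokenizer, path string) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()
	// WriteTo streams the matrix representation to any io.Writer.
	_, err = mat.WriteTo(f)
	return err
}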

func TestReadWriteMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n", w.String())
}
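
// LoadMatrixFile, used by the remaining tests, is assumed to load a
// precompiled matrix tokenizer (the .matok format) from disk, skipping the
// foma-to-matrix conversion performed above.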
func TestFullTokenizerMatrixTransduce(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}
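
// Output protocol exercised throughout this file: the transducer writes one
// token per line, an empty line (a second consecutive "\n") marks a sentence
// boundary, and each stream ends in such a boundary. Splitting the output on
// "\n\n" therefore yields one element per sentence plus a trailing empty
// element, which is what the sentence splitter test below relies on.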
func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

func TestFullTokenizerMatrixTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))
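
	// The following cases appear to be carried over from the Java
	// KorAP-Tokenizer test suite (note the DerekoDfaTokenizer_* classes and
	// @Test annotations); they are kept commented out as reference material
	// and have not been ported to Go.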
	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*
		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

func TestFullTokenizerMatrixXML(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))
}
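
// BenchmarkTransduceMatrix measures raw transduction throughput on a mixed
// test string. The reader and buffer are reset inside the loop so every
// iteration re-tokenizes the same input without reallocating, and
// b.ResetTimer keeps the one-time matrix loading out of the measurement.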
func BenchmarkTransduceMatrix(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`
	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}