package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

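// s is a sample text covering abbreviations, hosts, e-mail addresses,
// URLs, IPs, percentages, dates, omission words, and contractions.
// It is shared by TestMatokDatokEquivalence and BenchmarkTransduceMatrix.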
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

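// TestFullTokenizerMatrix builds a matrix tokenizer from a simple foma
// FST and checks basic tokenization of whitespace-separated input,
// including incomplete final tokens.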
func TestFullTokenizerMatrix(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 10)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

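// TestReadWriteMatrixTokenizer serializes a matrix tokenizer with WriteTo,
// reads it back with ParseMatrix, and checks that all fields and matches
// survive the round trip.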
func TestReadWriteMatrixTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.True(tmatch(mat, "bau"))
	assert.True(tmatch(mat, "bad"))
	assert.True(tmatch(mat, "wald gehen"))
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.True(tmatch(mat2, "bau"))
	assert.True(tmatch(mat2, "bad"))
	assert.True(tmatch(mat2, "wald gehen"))
}

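// TestReadWriteMatrixFullTokenizer performs the same serialization round
// trip with the full tokenizer FST and compares transduction output
// before and after.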
func TestReadWriteMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n", w.String())
}

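// TestFullTokenizerMatrixTransduce checks the newline-separated output
// format of the precompiled matrix tokenizer, where empty lines mark
// sentence boundaries.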
func TestFullTokenizerMatrixTransduce(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

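// TestFullTokenizerMatrixSentenceSplitter checks that sentence boundaries
// are detected after regular sentences and end-of-sentence punctuation,
// but not after abbreviations, hosts, e-mail addresses, URLs, IPs,
// percentages, dates, or file names.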
func TestFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

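// TestFullTokenizerMatrixTokenSplitter runs the token boundary cases
// adapted from the KorAP-Tokenizer test suite; deviations from
// KorAP-Tokenizer are noted inline.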
func TestFullTokenizerMatrixTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

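// TestFullTokenizerMatrixXML checks that XML tags, including tags with
// attributes, are kept as single tokens.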
func TestFullTokenizerMatrixXML(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))
}

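// TestMatokDatokEquivalence checks that the matrix (.matok) and the
// double array (.datok) representation of the tokenizer produce
// identical output for the sample text s.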
func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	mat := LoadMatrixFile("testdata/tokenizer.matok")
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

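// BenchmarkTransduceMatrix measures transduction of the sample text s
// using the matrix representation.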
func BenchmarkTransduceMatrix(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}