package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

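// s is a shared sample text that deliberately mixes the phenomena exercised
// below: abbreviations, hostnames, e-mail addresses, URLs, IP addresses,
// percentages, dates, file names, omission words, and German contractions.
// It is reused by the equivalence test and the benchmark at the end of this file.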
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

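// mat is lazily initialized by the first test that needs the full tokenizer,
// so the matrix file is parsed only once per test run rather than once per test.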
var mat *MatrixTokenizer

func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}

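// tmatch and ttokenize (used below) are shared helpers defined in this
// package's other test files; presumably tmatch reports whether the
// tokenizer accepts a string, while ttokenize resets w, transduces the
// given string, and returns the newline-separated tokens.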
func TestMatrixReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.True(tmatch(mat, "bau"))
	assert.True(tmatch(mat, "bad"))
	assert.True(tmatch(mat, "wald gehen"))
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.True(tmatch(mat2, "bau"))
	assert.True(tmatch(mat2, "bad"))
	assert.True(tmatch(mat2, "wald gehen"))
}

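// saveMatrix is a minimal sketch (not used by the tests) of how the WriteTo
// round trip exercised above could persist a matrix tokenizer to disk; it
// assumes nothing beyond the APIs already used in this file, and the result
// can be read back with ParseMatrix.
func saveMatrix(mat *MatrixTokenizer, path string) error {
	f, err := os.Create(path)
	if err != nil {
		return err
	}
	defer f.Close()

	// WriteTo streams the matrix representation to any io.Writer.
	_, err = mat.WriteTo(f)
	return err
}
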
func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states, which is why it no longer
	// works correctly.

	// The file has an MCS (multi-character symbol) in sigma but not in the net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// This is only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}

func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}

func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}

func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Alter!\", sagte er: \"Komm nicht wieder!\" Geh!!! \"Lass!\" Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 5)
	assert.Equal("\"\nAlter\n!\n\"\n,\nsagte\ner\n:\n\"\nKomm\nnicht\nwieder\n!\n\"", sentences[0])
	assert.Equal("Geh\n!!!", sentences[1])
	assert.Equal("\"\nLass\n!\n\"", sentences[2])
	assert.Equal("Dann\nging\ner\n.", sentences[3])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("\"\nAusschalten\n!!!\n\"\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("\"\nHast\nDu\nnicht\ngehört\n???\n\"", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("“Ausschalten!!!”, sagte er. «Hast Du nicht gehört???»"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("“\nAusschalten\n!!!\n”\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("«\nHast\nDu\nnicht\ngehört\n???\n»", sentences[1])

	text := `»Meinetwegen. Denkst du, daß ich darauf warte? Das fehlte noch.
Übrigens, ich kriege schon einen und vielleicht bald. Da ist mir nicht
bange. Neulich erst hat mir der kleine Ventivegni von drüben gesagt:
'Fräulein Effi, was gilt die Wette, wir sind hier noch in diesem Jahre
zu Polterabend und Hochzeit.'«

»Und was sagtest du da?«`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 8)
	assert.Equal("Neulich\nerst\nhat\nmir\nder\nkleine\nVentivegni\nvon\ndrüben\ngesagt\n:\n'\nFräulein\nEffi\n,\nwas\ngilt\ndie\nWette\n,\nwir\nsind\nhier\nnoch\nin\ndiesem\nJahre\nzu\nPolterabend\nund\nHochzeit\n.\n'\n«", sentences[5])
	assert.Equal("»\nUnd\nwas\nsagtest\ndu\nda\n?\n«", sentences[6])
}

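// sentencesOf is a hypothetical convenience wrapper (not used above) that
// bundles the output convention these tests rely on: Transduce emits one
// token per line, an empty line terminates a sentence, and the end of the
// text is marked by additional empty lines.
func sentencesOf(mat *MatrixTokenizer, text string) []string {
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	mat.Transduce(strings.NewReader(text), w)
	return strings.Split(w.String(), "\n\n")
}
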
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// waste example
	tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IP test
	tokens = ttokenize(mat, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&GT;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	// Plusampersand compounds (1)
	tokens = ttokenize(mat, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "2G+-Regel")
	assert.Equal(tokens[2], "soll")
	assert.Equal(tokens[3], "weitere")
	assert.Equal(tokens[4], "Covid-19-Erkrankungen")
	assert.Equal(tokens[5], "reduzieren")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	// Plusampersand compounds (2)
	tokens = ttokenize(mat, w, "Der Neu-C++-Programmierer.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Neu-C++-Programmierer")
	assert.Equal(tokens[2], ".")
	assert.Equal(3, len(tokens))

	// z.B.
	tokens = ttokenize(mat, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
	assert.Equal(tokens[0], "Dies")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "z.")
	assert.Equal(tokens[3], "B.")
	assert.Equal(tokens[4], "zwei")
	assert.Equal(tokens[5], "Wörter")
	assert.Equal(tokens[6], "-")
	assert.Equal(tokens[7], "z.")
	assert.Equal(tokens[8], "B.")
	assert.Equal(tokens[9], "auch")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// Single quote handling
	tokens = ttokenize(mat, w, "Es heißt 'Leitungssportteams' und nicht anders.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "heißt")
	assert.Equal(tokens[2], "'")
	assert.Equal(tokens[3], "Leitungssportteams")
	assert.Equal(tokens[4], "'")
	assert.Equal(tokens[5], "und")
	assert.Equal(tokens[6], "nicht")
	assert.Equal(tokens[7], "anders")
	assert.Equal(tokens[8], ".")
	assert.Equal(9, len(tokens))

	// Apostrophe handling
	tokens = ttokenize(mat, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "Nils’")
	assert.Equal(tokens[3], "Einkaufskorb")
	assert.Equal(tokens[4], "bei")
	assert.Equal(tokens[5], "McDonald's")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}

func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))
}

func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}

func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

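	// 0x04 (EOT) acts as a text delimiter here: the run of newlines before
	// it is collapsed, and each text ends with the usual empty-line marker.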
	assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}

func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}

func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}

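// BenchmarkMatrixTransduce measures the matrix transducer on the shared
// sample text s. Run it with, e.g.:
//
//	go test -bench=MatrixTransduce -benchmem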
func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}