package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

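// s is a shared sample text covering abbreviations, URLs, email
// addresses, dates, omission words, and contractions; it is reused by
// the equivalence test and the benchmark below.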
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

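// mat is loaded lazily from testdata on first use and shared across
// tests to avoid deserializing the tokenizer matrix repeatedly.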
var mat *MatrixTokenizer

func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}

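// Round trip: a matrix serialized with WriteTo and re-read with
// ParseMatrix must be structurally identical and match the same input.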
func TestMatrixReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.True(tmatch(mat, "bau"))
	assert.True(tmatch(mat, "bad"))
	assert.True(tmatch(mat, "wald gehen"))
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.True(tmatch(mat2, "bau"))
	assert.True(tmatch(mat2, "bad"))
	assert.True(tmatch(mat2, "wald gehen"))
}

func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states, which is why it no
	// longer works correctly.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}

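// The leading x keeps this function outside the Test* naming pattern,
// so the test runner skips it.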
func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(foma)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}

func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}

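// Sentence boundaries are marked by empty lines in the output, so
// splitting on "\n\n" yields one element per sentence plus a final "\n".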
func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Alter!\", sagte er: \"Komm nicht wieder!\" Geh!!! \"Lass!\" Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 5)
	assert.Equal("\"\nAlter\n!\n\"\n,\nsagte\ner\n:\n\"\nKomm\nnicht\nwieder\n!\n\"", sentences[0])
	assert.Equal("Geh\n!!!", sentences[1])
	assert.Equal("\"\nLass\n!\n\"", sentences[2])
	assert.Equal("Dann\nging\ner\n.", sentences[3])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("\"\nAusschalten\n!!!\n\"\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("\"\nHast\nDu\nnicht\ngehört\n???\n\"", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("“Ausschalten!!!”, sagte er. «Hast Du nicht gehört???»"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("“\nAusschalten\n!!!\n”\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("«\nHast\nDu\nnicht\ngehört\n???\n»", sentences[1])

	text := `»Meinetwegen. Denkst du, daß ich darauf warte? Das fehlte noch.
Übrigens, ich kriege schon einen und vielleicht bald. Da ist mir nicht
bange. Neulich erst hat mir der kleine Ventivegni von drüben gesagt:
'Fräulein Effi, was gilt die Wette, wir sind hier noch in diesem Jahre
zu Polterabend und Hochzeit.'«

»Und was sagtest du da?«`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 8)
	assert.Equal("Neulich\nerst\nhat\nmir\nder\nkleine\nVentivegni\nvon\ndrüben\ngesagt\n:\n'\nFräulein\nEffi\n,\nwas\ngilt\ndie\nWette\n,\nwir\nsind\nhier\nnoch\nin\ndiesem\nJahre\nzu\nPolterabend\nund\nHochzeit\n.\n'\n«", sentences[5])
	assert.Equal("»\nUnd\nwas\nsagtest\ndu\nda\n?\n«", sentences[6])

	text = `»Nun, gib dich zufrieden, ich fange schon an ... Also Baron
Innstetten!`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
	assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
}

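// ttokenize (a helper defined elsewhere in the test suite) transduces
// the input through mat into w and returns the resulting tokens.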
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2 () {
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1 () {
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3 () {
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1

	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// waste example
	tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IPtest
	tokens = ttokenize(mat, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&GT;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	// Plusampersand compounds (1)
	tokens = ttokenize(mat, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "2G+-Regel")
	assert.Equal(tokens[2], "soll")
	assert.Equal(tokens[3], "weitere")
	assert.Equal(tokens[4], "Covid-19-Erkrankungen")
	assert.Equal(tokens[5], "reduzieren")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	// Plusampersand compounds (2)
	tokens = ttokenize(mat, w, "Der Neu-C++-Programmierer.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Neu-C++-Programmierer")
	assert.Equal(tokens[2], ".")
	assert.Equal(3, len(tokens))

	// z.B.
	tokens = ttokenize(mat, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
	assert.Equal(tokens[0], "Dies")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "z.")
	assert.Equal(tokens[3], "B.")
	assert.Equal(tokens[4], "zwei")
	assert.Equal(tokens[5], "Wörter")
	assert.Equal(tokens[6], "-")
	assert.Equal(tokens[7], "z.")
	assert.Equal(tokens[8], "B.")
	assert.Equal(tokens[9], "auch")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// Single quote handling
	tokens = ttokenize(mat, w, "Es heißt 'Leitungssportteams' und nicht anders.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "heißt")
	assert.Equal(tokens[2], "'")
	assert.Equal(tokens[3], "Leitungssportteams")
	assert.Equal(tokens[4], "'")
	assert.Equal(tokens[5], "und")
	assert.Equal(tokens[6], "nicht")
	assert.Equal(tokens[7], "anders")
	assert.Equal(tokens[8], ".")
	assert.Equal(9, len(tokens))

	// Apostrophe handling
	tokens = ttokenize(mat, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "Nils’")
	assert.Equal(tokens[3], "Einkaufskorb")
	assert.Equal(tokens[4], "bei")
	assert.Equal(tokens[5], "McDonald's")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

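// XML tags, including attributes and self-closing tags, are kept as
// single tokens.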
func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))
}

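// The double-array (datok) and matrix (matok) representations of the
// same tokenizer must produce identical output for the sample text s.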
func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}

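// EOT (\x04) terminates a text; redundant newlines before it are
// collapsed in the output.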
func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}

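// Transducing a longer, multi-paragraph text should run through
// completely and preserve its tokens.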
func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}

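// Leading whitespace is trimmed and not emitted as a token.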
func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}

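// The benchmark reuses the shared sample text s; reader and buffer are
// reset on each iteration, and the timer starts only after loading, so
// only Transduce itself is measured.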
func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}