package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

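// s is a German test corpus exercising abbreviations, URLs, e-mail
// addresses, IP addresses, percentages, dates, file names, emphatic
// punctuation, omission words, and contractions in a single string.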
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

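// mat is a package-global tokenizer, lazily initialized from
// testdata/tokenizer.matok and shared across the tests below to avoid
// reloading the matrix for every test.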
var mat *MatrixTokenizer

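// ttokenize and ttokenizeStr are shared test helpers defined elsewhere
// in this package. As a minimal, hypothetical sketch of the assumed
// behavior (transduce a string, then split the newline-separated token
// stream):
//
//	func ttokenize(tok *MatrixTokenizer, w *bytes.Buffer, str string) []string {
//		w.Reset()
//		if !tok.Transduce(strings.NewReader(str), w) {
//			return []string{}
//		}
//		return strings.Split(strings.TrimRight(w.String(), "\n"), "\n")
//	}
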
func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}

func TestMatrixSimpleString(t *testing.T) {
	assert := assert.New(t)
	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	mat := tok.ToMatrix()

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "ibauamt")
	assert.Equal("i", tokens[0])
	assert.Equal("bauamt", tokens[1])

	tokens = ttokenize(mat, w, "ibbauamt")
	assert.Equal("i", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("bauamt", tokens[2])

	tokens = ttokenize(mat, w, "bau")
	assert.Equal("bau", tokens[0])

	tokens = ttokenize(mat, w, "baum")
	assert.Equal("bau", tokens[0])
	assert.Equal("m", tokens[1])

	tokens = ttokenize(mat, w, "baudibauamt")
	assert.Equal("bau", tokens[0])
	assert.Equal("d", tokens[1])
	assert.Equal("i", tokens[2])
	assert.Equal("bauamt", tokens[3])
}

func TestMatrixReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.Equal(ttokenizeStr(mat, "bau"), "bau")
	assert.Equal(ttokenizeStr(mat, "bad"), "bad")
	assert.Equal(ttokenizeStr(mat, "wald gehen"), "wald\ngehen")
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.Equal(ttokenizeStr(mat2, "bau"), "bau")
	assert.Equal(ttokenizeStr(mat2, "bad"), "bad")
	assert.Equal(ttokenizeStr(mat2, "wald gehen"), "wald\ngehen")
}
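
// A persistence round trip outside of tests might look like the
// following sketch (file name and error handling are illustrative;
// WriteTo and LoadMatrixFile are the calls exercised in this file):
//
//	f, err := os.Create("tokenizer.matok")
//	if err == nil {
//		mat.WriteTo(f)
//		f.Close()
//	}
//	mat2 := LoadMatrixFile("tokenizer.matok")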

func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states. That's why it is
	// not working correctly anymore.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// This is only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}

func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}

func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}
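
// Note on the output protocol exercised above: Transduce writes one token
// per line, an empty line marks a sentence boundary, and the stream ends
// with an additional empty line. This is why splitting on "\n" yields
// trailing empty strings and splitting on "\n\n" yields sentences.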

func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Alter!\", sagte er: \"Komm nicht wieder!\" Geh!!! \"Lass!\" Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 5)
	assert.Equal("\"\nAlter\n!\n\"\n,\nsagte\ner\n:\n\"\nKomm\nnicht\nwieder\n!\n\"", sentences[0])
	assert.Equal("Geh\n!!!", sentences[1])
	assert.Equal("\"\nLass\n!\n\"", sentences[2])
	assert.Equal("Dann\nging\ner\n.", sentences[3])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("\"\nAusschalten\n!!!\n\"\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("\"\nHast\nDu\nnicht\ngehört\n???\n\"", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("“Ausschalten!!!”, sagte er. «Hast Du nicht gehört???»"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("“\nAusschalten\n!!!\n”\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("«\nHast\nDu\nnicht\ngehört\n???\n»", sentences[1])

	text := `»Meinetwegen. Denkst du, daß ich darauf warte? Das fehlte noch.
Übrigens, ich kriege schon einen und vielleicht bald. Da ist mir nicht
bange. Neulich erst hat mir der kleine Ventivegni von drüben gesagt:
'Fräulein Effi, was gilt die Wette, wir sind hier noch in diesem Jahre
zu Polterabend und Hochzeit.'«

»Und was sagtest du da?«`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 8)
	assert.Equal("Neulich\nerst\nhat\nmir\nder\nkleine\nVentivegni\nvon\ndrüben\ngesagt\n:\n'\nFräulein\nEffi\n,\nwas\ngilt\ndie\nWette\n,\nwir\nsind\nhier\nnoch\nin\ndiesem\nJahre\nzu\nPolterabend\nund\nHochzeit\n.\n'\n«", sentences[5])
	assert.Equal("»\nUnd\nwas\nsagtest\ndu\nda\n?\n«", sentences[6])

	text = `»Nun, gib dich zufrieden, ich fange schon an ... Also Baron
Innstetten!`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
	assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])

	// Check parentheses at the end of sentences.
	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("(Er ging.) Und kam (später)."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("(\nEr\nging\n.\n)", sentences[0])
	assert.Equal("Und\nkam\n(\nspäter\n)\n.", sentences[1])
}

func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 6)
	assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
	assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
	assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
	assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
	assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
}

func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// waste example
	tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IP test
	tokens = ttokenize(mat, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&GT;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	// Plusampersand compounds (1)
	tokens = ttokenize(mat, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "2G+-Regel")
	assert.Equal(tokens[2], "soll")
	assert.Equal(tokens[3], "weitere")
	assert.Equal(tokens[4], "Covid-19-Erkrankungen")
	assert.Equal(tokens[5], "reduzieren")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	// Plusampersand compounds (2)
	tokens = ttokenize(mat, w, "Der Neu-C++-Programmierer.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Neu-C++-Programmierer")
	assert.Equal(tokens[2], ".")
	assert.Equal(3, len(tokens))

	// z.B.
	tokens = ttokenize(mat, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
	assert.Equal(tokens[0], "Dies")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "z.")
	assert.Equal(tokens[3], "B.")
	assert.Equal(tokens[4], "zwei")
	assert.Equal(tokens[5], "Wörter")
	assert.Equal(tokens[6], "-")
	assert.Equal(tokens[7], "z.")
	assert.Equal(tokens[8], "B.")
	assert.Equal(tokens[9], "auch")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// Single quote handling
	tokens = ttokenize(mat, w, "Es heißt 'Leitungssportteams' und nicht anders.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "heißt")
	assert.Equal(tokens[2], "'")
	assert.Equal(tokens[3], "Leitungssportteams")
	assert.Equal(tokens[4], "'")
	assert.Equal(tokens[5], "und")
	assert.Equal(tokens[6], "nicht")
	assert.Equal(tokens[7], "anders")
	assert.Equal(tokens[8], ".")
	assert.Equal(9, len(tokens))

	// Apostrophe handling
	tokens = ttokenize(mat, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "Nils’")
	assert.Equal(tokens[3], "Einkaufskorb")
	assert.Equal(tokens[4], "bei")
	assert.Equal(tokens[5], "McDonald's")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*
		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}

func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))
}

func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with the double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with the matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}

func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}

func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}

func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}

func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}
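
// The benchmark above can be run in isolation with the standard Go tooling:
//
//	go test -bench=BenchmarkMatrixTransduce -benchmem .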