package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

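// s is a shared sample text covering abbreviations, URLs, e-mail and IP
// addresses, dates, omission words, and contractions. It is used by
// TestMatokDatokEquivalence and BenchmarkMatrixTransduce below.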
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

var mat *MatrixTokenizer

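// TestMatrixFullTokenizer builds a matrix tokenizer from a simple foma
// transducer and checks basic token splitting, including the empty
// tokens that mark sentence and text boundaries.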
func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}

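// TestMatrixSimpleString checks segmentation with a minimal "bau|bauamt"
// transducer, e.g. "baum" is split into "bau" + "m". The helpers
// ttokenize and ttokenizeStr are shared test utilities defined elsewhere
// in this package's test files (presumably alongside the double-array
// tokenizer tests); they transduce a string and return the resulting
// tokens or the raw output string.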
func TestMatrixSimpleString(t *testing.T) {
	assert := assert.New(t)
	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	mat := tok.ToMatrix()

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "ibauamt")
	assert.Equal("i", tokens[0])
	assert.Equal("bauamt", tokens[1])

	tokens = ttokenize(mat, w, "ibbauamt")
	assert.Equal("i", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("bauamt", tokens[2])

	tokens = ttokenize(mat, w, "bau")
	assert.Equal("bau", tokens[0])

	tokens = ttokenize(mat, w, "baum")
	assert.Equal("bau", tokens[0])
	assert.Equal("m", tokens[1])

	tokens = ttokenize(mat, w, "baudibauamt")
	assert.Equal("bau", tokens[0])
	assert.Equal("d", tokens[1])
	assert.Equal("i", tokens[2])
	assert.Equal("bauamt", tokens[3])
}

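// TestMatrixReadWriteTokenizer serializes a matrix tokenizer with WriteTo,
// re-reads it with ParseMatrix, and verifies that all fields and the
// tokenization behavior survive the round trip.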
Akronc9c0eae2021-10-22 19:49:43 +0200107func TestMatrixReadWriteTokenizer(t *testing.T) {
Akron16c312e2021-09-26 13:11:12 +0200108 assert := assert.New(t)
109 foma := LoadFomaFile("testdata/simpletok.fst")
110 assert.NotNil(foma)
111
112 mat := foma.ToMatrix()
Akron28031b72021-10-02 13:07:25 +0200113 assert.NotNil(mat)
Akron16c312e2021-09-26 13:11:12 +0200114
Akrondf275812022-03-27 12:54:46 +0200115 assert.Equal(ttokenizeStr(mat, "bau"), "bau")
116 assert.Equal(ttokenizeStr(mat, "bad"), "bad")
117 assert.Equal(ttokenizeStr(mat, "wald gehen"), "wald\ngehen")
Akron16c312e2021-09-26 13:11:12 +0200118 b := make([]byte, 0, 1024)
119 buf := bytes.NewBuffer(b)
120 n, err := mat.WriteTo(buf)
121 assert.Nil(err)
Akron28031b72021-10-02 13:07:25 +0200122 assert.Equal(int64(230), n)
Akron16c312e2021-09-26 13:11:12 +0200123 mat2 := ParseMatrix(buf)
124 assert.NotNil(mat2)
125 assert.Equal(mat.sigma, mat2.sigma)
126 assert.Equal(mat.epsilon, mat2.epsilon)
127 assert.Equal(mat.unknown, mat2.unknown)
128 assert.Equal(mat.identity, mat2.identity)
129 assert.Equal(mat.stateCount, mat2.stateCount)
130 assert.Equal(len(mat.array), len(mat2.array))
131 assert.Equal(mat.array, mat2.array)
Akrondf275812022-03-27 12:54:46 +0200132 assert.Equal(ttokenizeStr(mat2, "bau"), "bau")
133 assert.Equal(ttokenizeStr(mat2, "bad"), "bad")
134 assert.Equal(ttokenizeStr(mat2, "wald gehen"), "wald\ngehen")
Akron16c312e2021-09-26 13:11:12 +0200135}
136
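// TestMatrixIgnorableMCS covers multi-character symbols that are listed
// in sigma but do not occur in the net; they must not break transduction.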
func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states. That's why it is
	// not working correctly anymore.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Is only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}

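// xTestMatrixReadWriteFullTokenizer round-trips the full tokenizer matrix
// through WriteTo/ParseMatrix. It is disabled via the x prefix, so
// `go test` does not pick it up as a test.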
func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}

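// TestMatrixFullTokenizerTransduce loads the precompiled matrix tokenizer
// into the package-level mat (shared between tests to avoid reloading)
// and checks the raw output format: one token per line, an empty line
// after each sentence, and a final empty line at the end of the text.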
func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}

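// TestMatrixFullTokenizerMatrixSentenceSplitter verifies sentence
// splitting: since sentences are separated by empty lines, splitting the
// output on "\n\n" yields one element per sentence plus a trailing "\n".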
func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Alter!\", sagte er: \"Komm nicht wieder!\" Geh!!! \"Lass!\" Dann ging er."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 5)
	assert.Equal("\"\nAlter\n!\n\"\n,\nsagte\ner\n:\n\"\nKomm\nnicht\nwieder\n!\n\"", sentences[0])
	assert.Equal("Geh\n!!!", sentences[1])
	assert.Equal("\"\nLass\n!\n\"", sentences[2])
	assert.Equal("Dann\nging\ner\n.", sentences[3])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("\"\nAusschalten\n!!!\n\"\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("\"\nHast\nDu\nnicht\ngehört\n???\n\"", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("“Ausschalten!!!”, sagte er. «Hast Du nicht gehört???»"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("“\nAusschalten\n!!!\n”\n,\nsagte\ner\n.", sentences[0])
	assert.Equal("«\nHast\nDu\nnicht\ngehört\n???\n»", sentences[1])

	text := `»Meinetwegen. Denkst du, daß ich darauf warte? Das fehlte noch.
Übrigens, ich kriege schon einen und vielleicht bald. Da ist mir nicht
bange. Neulich erst hat mir der kleine Ventivegni von drüben gesagt:
'Fräulein Effi, was gilt die Wette, wir sind hier noch in diesem Jahre
zu Polterabend und Hochzeit.'«

»Und was sagtest du da?«`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 8)
	assert.Equal("Neulich\nerst\nhat\nmir\nder\nkleine\nVentivegni\nvon\ndrüben\ngesagt\n:\n'\nFräulein\nEffi\n,\nwas\ngilt\ndie\nWette\n,\nwir\nsind\nhier\nnoch\nin\ndiesem\nJahre\nzu\nPolterabend\nund\nHochzeit\n.\n'\n«", sentences[5])
	assert.Equal("»\nUnd\nwas\nsagtest\ndu\nda\n?\n«", sentences[6])

	text = `»Nun, gib dich zufrieden, ich fange schon an ... Also Baron
Innstetten!`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("»\nNun\n,\ngib\ndich\nzufrieden\n,\nich\nfange\nschon\nan\n...", sentences[0])
	assert.Equal("Also\nBaron\nInnstetten\n!", sentences[1])
}

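// TestMatrixFullTokenizerMatrixSentenceSplitterBug1 is a regression test
// for sentence splitting around quotations that contain embedded
// punctuation like 'Muß es sein?'.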
func TestMatrixFullTokenizerMatrixSentenceSplitterBug1(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	text := `Wüllersdorf war aufgestanden. »Ich finde es furchtbar, daß Sie recht haben, aber Sie haben recht. Ich quäle Sie nicht länger mit meinem 'Muß es sein?'. Die Welt ist einmal, wie sie ist, und die Dinge verlaufen nicht, wie wir wollen, sondern wie die andern wollen. Das mit dem 'Gottesgericht', wie manche hochtrabend versichern, ist freilich ein Unsinn, nichts davon, umgekehrt, unser Ehrenkultus ist ein Götzendienst, aber wir müssen uns ihm unterwerfen, solange der Götze gilt.«`

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(text), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 6)
	assert.Equal("Wüllersdorf\nwar\naufgestanden\n.", sentences[0])
	assert.Equal("»\nIch\nfinde\nes\nfurchtbar\n,\ndaß\nSie\nrecht\nhaben\n,\naber\nSie\nhaben\nrecht\n.", sentences[1])
	assert.Equal("Ich\nquäle\nSie\nnicht\nlänger\nmit\nmeinem\n'\nMuß\nes\nsein\n?\n'\n.", sentences[2])
	assert.Equal("Die\nWelt\nist\neinmal\n,\nwie\nsie\nist\n,\nund\ndie\nDinge\nverlaufen\nnicht\n,\nwie\nwir\nwollen\n,\nsondern\nwie\ndie\nandern\nwollen\n.", sentences[3])
	assert.Equal("Das\nmit\ndem\n'\nGottesgericht\n'\n,\nwie\nmanche\nhochtrabend\nversichern\n,\nist\nfreilich\nein\nUnsinn\n,\nnichts\ndavon\n,\numgekehrt\n,\nunser\nEhrenkultus\nist\nein\nGötzendienst\n,\naber\nwir\nmüssen\nuns\nihm\nunterwerfen\n,\nsolange\nder\nGötze\ngilt\n.\n«", sentences[4])
}

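// TestMatrixFullTokenizerTokenSplitter ports the KorAP-Tokenizer test
// suite for token boundaries (abbreviations, hosts, e-mail addresses,
// URLs, dates, omission words, file names, XML entities, etc.).
// Deviations from the KorAP-Tokenizer are marked inline.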
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOutputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// waste example
	tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IP test
	tokens = ttokenize(mat, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&GT;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	// Plusampersand compounds (1)
	tokens = ttokenize(mat, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "2G+-Regel")
	assert.Equal(tokens[2], "soll")
	assert.Equal(tokens[3], "weitere")
	assert.Equal(tokens[4], "Covid-19-Erkrankungen")
	assert.Equal(tokens[5], "reduzieren")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	// Plusampersand compounds (2)
	tokens = ttokenize(mat, w, "Der Neu-C++-Programmierer.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Neu-C++-Programmierer")
	assert.Equal(tokens[2], ".")
	assert.Equal(3, len(tokens))

	// z.B.
	tokens = ttokenize(mat, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
	assert.Equal(tokens[0], "Dies")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "z.")
	assert.Equal(tokens[3], "B.")
	assert.Equal(tokens[4], "zwei")
	assert.Equal(tokens[5], "Wörter")
	assert.Equal(tokens[6], "-")
	assert.Equal(tokens[7], "z.")
	assert.Equal(tokens[8], "B.")
	assert.Equal(tokens[9], "auch")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// Single quote handling
	tokens = ttokenize(mat, w, "Es heißt 'Leitungssportteams' und nicht anders.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "heißt")
	assert.Equal(tokens[2], "'")
	assert.Equal(tokens[3], "Leitungssportteams")
	assert.Equal(tokens[4], "'")
	assert.Equal(tokens[5], "und")
	assert.Equal(tokens[6], "nicht")
	assert.Equal(tokens[7], "anders")
	assert.Equal(tokens[8], ".")
	assert.Equal(9, len(tokens))

	// Apostrophe handling
	tokens = ttokenize(mat, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "Nils’")
	assert.Equal(tokens[3], "Einkaufskorb")
	assert.Equal(tokens[4], "bei")
	assert.Equal(tokens[5], "McDonald's")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOutputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

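// TestMatrixFullTokenizerXML ensures that XML tags, including tags with
// attributes and self-closing tags, are kept as single tokens.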
func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))
}

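// TestMatokDatokEquivalence transduces the same sample text with the
// matrix (.matok) and the double-array (.datok) representation and
// requires byte-identical output.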
func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

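// TestMatrixFullTokenizerCallbackTransduce transduces a two-sentence text
// and checks the complete output stream, including the final end-of-text
// newlines.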
func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}

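// ExampleMatrixTokenizer_Transduce is a minimal usage sketch added for
// illustration, assuming testdata/tokenizer.matok exists (as in the tests
// above): transduce a reader into a writer and print the tokens, one per
// line. The trailing empty lines that mark sentence and text endings are
// trimmed by the example framework.
func ExampleMatrixTokenizer_Transduce() {
	mat := LoadMatrixFile("testdata/tokenizer.matok")
	w := bytes.NewBuffer(make([]byte, 0, 2048))
	mat.Transduce(strings.NewReader("Der alte Baum."), w)
	fmt.Print(w.String())
	// Output:
	// Der
	// alte
	// Baum
	// .
}

// TestMatrixFullTokenizerTextTreatment checks that runs of empty lines
// and EOT characters (\x04) in the input collapse into clean sentence
// and text boundaries in the output.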
func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}

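// TestMatrixFullTokenizerLongText transduces a longer Project Gutenberg
// header to guard against failures that only show up on larger inputs.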
func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}

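// TestMatrixTrimming ensures that leading whitespace is trimmed and does
// not produce an empty token.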
func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}

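// BenchmarkMatrixTransduce measures transduction throughput of the matrix
// tokenizer over the sample text s; run it with
// `go test -bench=MatrixTransduce`.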
func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}