package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

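// s is a shared sample text exercising abbreviations, hosts, emails,
// URLs, IP addresses, percentages, dates, file names, omission words,
// and contractions. It is reused by TestMatokDatokEquivalence and
// BenchmarkMatrixTransduce.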
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

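// mat is the full matrix tokenizer shared across tests; it is loaded
// lazily from testdata/tokenizer.matok on first use.
//
// A minimal usage sketch, based on the calls exercised in these tests:
//
//	mat := LoadMatrixFile("testdata/tokenizer.matok")
//	w := bytes.NewBuffer(make([]byte, 0, 2048))
//	mat.Transduce(strings.NewReader("Der alte Mann."), w)
//	tokens := strings.Split(w.String(), "\n")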
var mat *MatrixTokenizer

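// TestMatrixFullTokenizer builds a matrix tokenizer from a simple foma
// transducer and checks basic token and end-of-text behavior.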
func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}

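// TestMatrixReadWriteTokenizer serializes a matrix tokenizer with WriteTo,
// re-parses it with ParseMatrix, and checks that both instances are
// structurally identical and match the same inputs.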
func TestMatrixReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.True(tmatch(mat, "bau"))
	assert.True(tmatch(mat, "bad"))
	assert.True(tmatch(mat, "wald gehen"))
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.True(tmatch(mat2, "bau"))
	assert.True(tmatch(mat2, "bad"))
	assert.True(tmatch(mat2, "wald gehen"))
}

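// TestMatrixIgnorableMCS checks the handling of multi-character symbols
// that occur in sigma but not in the net.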
func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states, which is why it
	// no longer works correctly.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Is only unambiguous when transducing strictly greedy!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}

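// xTestMatrixReadWriteFullTokenizer is disabled (x-prefixed); it
// round-trips the full tokenizer through WriteTo and ParseMatrix.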
func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}

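// TestMatrixFullTokenizerTransduce checks transduction with the full
// tokenizer, including sentence boundaries marked by empty lines.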
func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}

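// TestMatrixFullTokenizerMatrixSentenceSplitter checks end-of-sentence
// detection; sentences are separated by double newlines in the output.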
func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

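// TestMatrixFullTokenizerTokenSplitter checks token boundary decisions,
// largely mirroring the KorAP-Tokenizer test suite; deviations are noted
// in the comments.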
func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2 () {
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1 () {
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3 () {
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1

	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// waste example
	tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IPtest
	tokens = ttokenize(mat, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&GT;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
					.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
					.printOffsets(true)
					.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

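// TestMatrixFullTokenizerXML checks that XML tags, including attributes
// and self-closing tags, are kept as single tokens.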
func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))
}

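// TestMatokDatokEquivalence checks that the matrix (.matok) and the
// double array (.datok) representations produce identical output for
// the shared sample text s.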
func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

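// TestMatrixFullTokenizerCallbackTransduce checks transduction of
// multiple sentences in a single call.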
func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}

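// TestMatrixFullTokenizerTextTreatment checks the handling of text
// delimiters (\x04) and redundant newlines between texts.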
func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}

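// TestMatrixFullTokenizerLongText checks that a longer multi-paragraph
// input is transduced completely.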
func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}

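// TestMatrixTrimming checks that leading whitespace is trimmed and not
// emitted as part of a token.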
func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}

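// BenchmarkMatrixTransduce measures repeated transduction of the shared
// sample text s with the matrix tokenizer.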
func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}