blob: e7aa154f1b74ee0acdc36aaf651975dcdd3a3dea [file] [log] [blame]
Akron1c34ce62021-09-23 23:27:39 +02001package datok
2
3import (
4 "bytes"
Akron28031b72021-10-02 13:07:25 +02005 "fmt"
6 "os"
Akron1c34ce62021-09-23 23:27:39 +02007 "strings"
8 "testing"
9
10 "github.com/stretchr/testify/assert"
11)
12
// s is a multi-line German sample text. It strings together the kinds of
// input exercised by the tests in this file: abbreviations, host names,
// e-mail addresses, URLs, IP addresses, dates, file names, repeated
// punctuation, omission words and contractions.
//
// NOTE(review): no use of s is visible in this section; presumably it is
// consumed further down the file — confirm before changing it.
//
// This is a raw string literal, so the backslashes in front of the double
// quotes are part of the text and are not escape sequences.
var s = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`
22
Akron9fb63af2021-10-28 01:15:53 +020023var mat *MatrixTokenizer
24
Akronc9c0eae2021-10-22 19:49:43 +020025func TestMatrixFullTokenizer(t *testing.T) {
Akron1c34ce62021-09-23 23:27:39 +020026 assert := assert.New(t)
27 foma := LoadFomaFile("testdata/simpletok.fst")
28 assert.NotNil(foma)
29
30 mat := foma.ToMatrix()
31
32 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
33 b := make([]byte, 0, 2048)
34 w := bytes.NewBuffer(b)
35 var tokens []string
36 mat.Transduce(r, w)
37 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +020038 assert.Equal(len(tokens), 11)
Akron1c34ce62021-09-23 23:27:39 +020039 assert.Equal("wald", tokens[0])
40 assert.Equal("gehen", tokens[1])
41 assert.Equal("Da", tokens[2])
42 assert.Equal("kann", tokens[3])
43 assert.Equal("man", tokens[4])
44 assert.Equal("was", tokens[5])
45 assert.Equal("\"erleben\"", tokens[6])
46 assert.Equal("!", tokens[7])
Akron5c82a922021-09-24 19:11:29 +020047
48 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
49 w.Reset()
50 mat.Transduce(r, w)
51 tokens = strings.Split(w.String(), "\n")
52 assert.Equal("In", tokens[0])
53 assert.Equal("den", tokens[1])
54 assert.Equal("Wald", tokens[2])
55 assert.Equal("gehen", tokens[3])
56 assert.Equal("?", tokens[4])
57 assert.Equal("--", tokens[5])
58
59 r = strings.NewReader(" g? -- D")
60 w.Reset()
61 mat.Transduce(r, w)
62 tokens = strings.Split(w.String(), "\n")
63 assert.Equal("g", tokens[0])
64 assert.Equal("?", tokens[1])
65 assert.Equal("--", tokens[2])
66 assert.Equal("D", tokens[3])
67 assert.Equal("", tokens[4])
68 assert.Equal("", tokens[5])
Akrona854faa2021-10-22 19:31:08 +020069 assert.Equal("", tokens[6])
70 assert.Equal(7, len(tokens))
Akron5c82a922021-09-24 19:11:29 +020071}
72
Akronc9c0eae2021-10-22 19:49:43 +020073func TestMatrixReadWriteTokenizer(t *testing.T) {
Akron16c312e2021-09-26 13:11:12 +020074 assert := assert.New(t)
75 foma := LoadFomaFile("testdata/simpletok.fst")
76 assert.NotNil(foma)
77
78 mat := foma.ToMatrix()
Akron28031b72021-10-02 13:07:25 +020079 assert.NotNil(mat)
Akron16c312e2021-09-26 13:11:12 +020080
81 assert.True(tmatch(mat, "bau"))
82 assert.True(tmatch(mat, "bad"))
83 assert.True(tmatch(mat, "wald gehen"))
84 b := make([]byte, 0, 1024)
85 buf := bytes.NewBuffer(b)
86 n, err := mat.WriteTo(buf)
87 assert.Nil(err)
Akron28031b72021-10-02 13:07:25 +020088 assert.Equal(int64(230), n)
Akron16c312e2021-09-26 13:11:12 +020089 mat2 := ParseMatrix(buf)
90 assert.NotNil(mat2)
91 assert.Equal(mat.sigma, mat2.sigma)
92 assert.Equal(mat.epsilon, mat2.epsilon)
93 assert.Equal(mat.unknown, mat2.unknown)
94 assert.Equal(mat.identity, mat2.identity)
95 assert.Equal(mat.stateCount, mat2.stateCount)
96 assert.Equal(len(mat.array), len(mat2.array))
97 assert.Equal(mat.array, mat2.array)
98 assert.True(tmatch(mat2, "bau"))
99 assert.True(tmatch(mat2, "bad"))
100 assert.True(tmatch(mat2, "wald gehen"))
101}
102
Akrone396a932021-10-19 01:06:13 +0200103func TestMatrixIgnorableMCS(t *testing.T) {
104 assert := assert.New(t)
105
106 // This test relies on final states. That's why it is
107 // not working correctly anymore.
108
109 // File has MCS in sigma but not in net
110 tok := LoadFomaFile("testdata/ignorable_mcs.fst")
111 assert.NotNil(tok)
112 mat := tok.ToMatrix()
113 assert.NotNil(mat)
114
115 b := make([]byte, 0, 2048)
116 w := bytes.NewBuffer(b)
117 var tokens []string
118
119 // Is only unambigous when transducing strictly greedy!
120 assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
121 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200122 assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
Akrone396a932021-10-19 01:06:13 +0200123 assert.Equal("a", tokens[0])
124 assert.Equal("b", tokens[1])
125 assert.Equal("<ab>a", tokens[2])
Akrona854faa2021-10-22 19:31:08 +0200126 assert.Equal(6, len(tokens))
Akrone396a932021-10-19 01:06:13 +0200127}
128
Akronc9c0eae2021-10-22 19:49:43 +0200129func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
Akron28031b72021-10-02 13:07:25 +0200130 assert := assert.New(t)
131 foma := LoadFomaFile("testdata/tokenizer.fst")
132 assert.NotNil(foma)
133
134 mat := foma.ToMatrix()
135 assert.NotNil(foma)
136
137 tb := make([]byte, 0, 2048)
138 w := bytes.NewBuffer(tb)
139
140 assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
Akrona854faa2021-10-22 19:31:08 +0200141 assert.Equal("der\nalte\nbaum\n\n\n", w.String())
Akron28031b72021-10-02 13:07:25 +0200142
143 b := make([]byte, 0, 1024)
144 buf := bytes.NewBuffer(b)
145 _, err := mat.WriteTo(buf)
146 assert.Nil(err)
147 w.Reset()
148 // assert.Equal(int64(248), n)
149
150 mat2 := ParseMatrix(buf)
151 assert.NotNil(mat2)
152 assert.Equal(mat.sigma, mat2.sigma)
153 assert.Equal(mat.epsilon, mat2.epsilon)
154 assert.Equal(mat.unknown, mat2.unknown)
155 assert.Equal(mat.identity, mat2.identity)
156 assert.Equal(mat.stateCount, mat2.stateCount)
157 assert.Equal(len(mat.array), len(mat2.array))
158 // assert.Equal(mat.array, mat2.array)
159
160 assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
Akrona854faa2021-10-22 19:31:08 +0200161 assert.Equal("der\nalte\nbaum\n\n\n", w.String())
Akron28031b72021-10-02 13:07:25 +0200162}
163
Akronc9c0eae2021-10-22 19:49:43 +0200164func TestMatrixFullTokenizerTransduce(t *testing.T) {
Akron28031b72021-10-02 13:07:25 +0200165 assert := assert.New(t)
166
Akron9fb63af2021-10-28 01:15:53 +0200167 if mat == nil {
168 mat = LoadMatrixFile("testdata/tokenizer.matok")
169 }
Akron28031b72021-10-02 13:07:25 +0200170
171 assert.NotNil(mat)
172
173 b := make([]byte, 0, 2048)
174 w := bytes.NewBuffer(b)
175 var tokens []string
176
177 assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))
178
179 tokens = strings.Split(w.String(), "\n")
Akrona854faa2021-10-22 19:31:08 +0200180 assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
Akron28031b72021-10-02 13:07:25 +0200181 assert.Equal("tra", tokens[0])
182 assert.Equal(".", tokens[1])
183 assert.Equal("", tokens[2])
184 assert.Equal("u", tokens[3])
185 assert.Equal("Du", tokens[4])
186 assert.Equal("?", tokens[5])
187 assert.Equal("", tokens[6])
188 assert.Equal("", tokens[7])
Akrona854faa2021-10-22 19:31:08 +0200189 assert.Equal(9, len(tokens))
Akron28031b72021-10-02 13:07:25 +0200190
191 w.Reset()
192 assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
Akrona854faa2021-10-22 19:31:08 +0200193 assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
Akron28031b72021-10-02 13:07:25 +0200194}
195
Akronc9c0eae2021-10-22 19:49:43 +0200196func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
Akron5c82a922021-09-24 19:11:29 +0200197 assert := assert.New(t)
Akron9fb63af2021-10-28 01:15:53 +0200198
199 if mat == nil {
200 mat = LoadMatrixFile("testdata/tokenizer.matok")
201 }
Akron5c82a922021-09-24 19:11:29 +0200202
203 b := make([]byte, 0, 2048)
204 w := bytes.NewBuffer(b)
205 var sentences []string
206
207 // testSentSplitterSimple
208 assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
209 sentences = strings.Split(w.String(), "\n\n")
210
Akrona854faa2021-10-22 19:31:08 +0200211 assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
Akron5c82a922021-09-24 19:11:29 +0200212 assert.Equal("Der\nalte\nMann\n.", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200213 assert.Equal("\n", sentences[1])
Akron5c82a922021-09-24 19:11:29 +0200214 assert.Equal(len(sentences), 2)
215
216 w.Reset()
Akrona854faa2021-10-22 19:31:08 +0200217 assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
218 sentences = strings.Split(w.String(), "\n\n")
219 assert.Equal(len(sentences), 2)
220 assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
221 assert.Equal("\n", sentences[1])
222
223 w.Reset()
Akron5c82a922021-09-24 19:11:29 +0200224 assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
225 sentences = strings.Split(w.String(), "\n\n")
226 assert.Equal(len(sentences), 2)
227 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200228 assert.Equal("\n", sentences[1])
Akron5c82a922021-09-24 19:11:29 +0200229
Akron28031b72021-10-02 13:07:25 +0200230 w.Reset()
231 assert.True(mat.Transduce(strings.NewReader(""), w))
232 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200233 assert.Equal(len(sentences), 2)
234 assert.Equal("", sentences[0])
235 assert.Equal("", sentences[1])
Akron5c82a922021-09-24 19:11:29 +0200236
Akron28031b72021-10-02 13:07:25 +0200237 w.Reset()
238 assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
239 sentences = strings.Split(w.String(), "\n\n")
240 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200241
Akron28031b72021-10-02 13:07:25 +0200242 w.Reset()
243 assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
244 sentences = strings.Split(w.String(), "\n\n")
245 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200246
Akron28031b72021-10-02 13:07:25 +0200247 w.Reset()
248 assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
249 sentences = strings.Split(w.String(), "\n\n")
250 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200251 assert.Equal("\n", sentences[1])
Akron28031b72021-10-02 13:07:25 +0200252 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200253
Akron28031b72021-10-02 13:07:25 +0200254 w.Reset()
255 assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
256 sentences = strings.Split(w.String(), "\n\n")
Akrona854faa2021-10-22 19:31:08 +0200257 assert.Equal("\n", sentences[1])
Akron28031b72021-10-02 13:07:25 +0200258 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200259
Akron28031b72021-10-02 13:07:25 +0200260 w.Reset()
261 assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
262 sentences = strings.Split(w.String(), "\n\n")
263 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200264
Akron28031b72021-10-02 13:07:25 +0200265 w.Reset()
266 assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
267 sentences = strings.Split(w.String(), "\n\n")
268 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200269
Akron28031b72021-10-02 13:07:25 +0200270 w.Reset()
271 assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
272 sentences = strings.Split(w.String(), "\n\n")
273 assert.Equal(len(sentences), 2)
274 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
Akrona854faa2021-10-22 19:31:08 +0200275 assert.Equal("\n", sentences[1])
Akron5c82a922021-09-24 19:11:29 +0200276
Akron28031b72021-10-02 13:07:25 +0200277 w.Reset()
278 assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
279 sentences = strings.Split(w.String(), "\n\n")
280 assert.Equal(len(sentences), 3)
281 assert.Equal("Ausschalten\n!!!", sentences[0])
282 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
Akrona854faa2021-10-22 19:31:08 +0200283 assert.Equal("\n", sentences[2])
Akron5c82a922021-09-24 19:11:29 +0200284
Akron28031b72021-10-02 13:07:25 +0200285 w.Reset()
286 assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
287 sentences = strings.Split(w.String(), "\n\n")
288 assert.Equal(len(sentences), 2)
Akron5c82a922021-09-24 19:11:29 +0200289 /*
290 Test:
291 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
292 */
Akron1c34ce62021-09-23 23:27:39 +0200293}
Akron28031b72021-10-02 13:07:25 +0200294
Akronc9c0eae2021-10-22 19:49:43 +0200295func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
Akron28031b72021-10-02 13:07:25 +0200296 assert := assert.New(t)
297
Akron9fb63af2021-10-28 01:15:53 +0200298 if mat == nil {
299 mat = LoadMatrixFile("testdata/tokenizer.matok")
300 }
Akron28031b72021-10-02 13:07:25 +0200301
302 b := make([]byte, 0, 2048)
303 w := bytes.NewBuffer(b)
304 var tokens []string
305
306 // testTokenizerSimple
307 tokens = ttokenize(mat, w, "Der alte Mann")
308 assert.Equal(tokens[0], "Der")
309 assert.Equal(tokens[1], "alte")
310 assert.Equal(tokens[2], "Mann")
311 assert.Equal(len(tokens), 3)
312
313 tokens = ttokenize(mat, w, "Der alte Mann.")
314 assert.Equal(tokens[0], "Der")
315 assert.Equal(tokens[1], "alte")
316 assert.Equal(tokens[2], "Mann")
317 assert.Equal(tokens[3], ".")
318 assert.Equal(len(tokens), 4)
319
320 // testTokenizerAbbr
321 tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
322 assert.Equal(tokens[0], "Der")
323 assert.Equal(tokens[1], "Vorsitzende")
324 assert.Equal(tokens[2], "der")
325 assert.Equal(tokens[3], "F.D.P.")
326 assert.Equal(tokens[4], "hat")
327 assert.Equal(tokens[5], "gewählt")
328 assert.Equal(len(tokens), 6)
329 // Ignored in KorAP-Tokenizer
330
331 // testTokenizerHost1
332 tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
333 assert.Equal(tokens[0], "Gefunden")
334 assert.Equal(tokens[1], "auf")
335 assert.Equal(tokens[2], "wikipedia.org")
336 assert.Equal(len(tokens), 3)
337
338 // testTokenizerWwwHost
339 tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
340 assert.Equal("Gefunden", tokens[0])
341 assert.Equal("auf", tokens[1])
342 assert.Equal("www.wikipedia.org", tokens[2])
343 assert.Equal(3, len(tokens))
344
345 // testTokenizerWwwUrl
346 tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
347 assert.Equal("www.info.biz/info", tokens[3])
348
349 // testTokenizerFtpHost
350 /*
351 tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
352 assert.Equal("Kann", tokens[0])
353 assert.Equal("von", tokens[1])
354 assert.Equal("ftp.download.org", tokens[2])
355 assert.Equal(5, len(tokens))
356 // Ignored in KorAP-Tokenizer
357 */
358
359 // testTokenizerDash
360 tokens = ttokenize(mat, w, "Das war -- spitze")
361 assert.Equal(tokens[0], "Das")
362 assert.Equal(tokens[1], "war")
363 assert.Equal(tokens[2], "--")
364 assert.Equal(tokens[3], "spitze")
365 assert.Equal(len(tokens), 4)
366
367 // testTokenizerEmail1
368 tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
369 assert.Equal(tokens[0], "Ich")
370 assert.Equal(tokens[1], "bin")
371 assert.Equal(tokens[2], "unter")
372 assert.Equal(tokens[3], "korap@ids-mannheim.de")
373 assert.Equal(tokens[4], "erreichbar")
374 assert.Equal(tokens[5], ".")
375 assert.Equal(len(tokens), 6)
376
377 // testTokenizerEmail2
378 tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
379 assert.Equal(tokens[0], "Oder")
380 assert.Equal(tokens[1], "unter")
381 assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
382 assert.Equal(tokens[3], ".")
383 assert.Equal(len(tokens), 4)
384
385 // testTokenizerEmail3
386 tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
387 assert.Equal(tokens[0], "Oder")
388 assert.Equal(tokens[1], "unter")
389 assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
390 assert.Equal(tokens[3], ".")
391 assert.Equal(len(tokens), 4)
392 // Ignored in KorAP-Tokenizer
393
394 // testTokenizerDoNotAcceptQuotedEmailNames
395 tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
396 assert.Equal("\"", tokens[0])
397 assert.Equal("John", tokens[1])
398 assert.Equal("Doe", tokens[2])
399 assert.Equal("\"", tokens[3])
400 assert.Equal("@xx", tokens[4])
401 assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
402 assert.Equal("com", tokens[6])
403 assert.Equal(7, len(tokens))
404
405 // testTokenizerTwitter
406 tokens = ttokenize(mat, w, "Folgt @korap und #korap")
407 assert.Equal(tokens[0], "Folgt")
408 assert.Equal(tokens[1], "@korap")
409 assert.Equal(tokens[2], "und")
410 assert.Equal(tokens[3], "#korap")
411 assert.Equal(len(tokens), 4)
412
413 // testTokenizerWeb1
414 tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
415 assert.Equal(tokens[0], "Unsere")
416 assert.Equal(tokens[1], "Website")
417 assert.Equal(tokens[2], "ist")
418 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
419 assert.Equal(len(tokens), 4)
420
421 // testTokenizerWeb2
422 tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
423 assert.Equal(tokens[0], "Wir")
424 assert.Equal(tokens[1], "sind")
425 assert.Equal(tokens[2], "auch")
426 assert.Equal(tokens[3], "im")
427 assert.Equal(tokens[4], "Internet")
428 assert.Equal(tokens[5], "(")
429 assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
430 assert.Equal(tokens[7], ")")
431 assert.Equal(len(tokens), 8)
432 // Ignored in KorAP-Tokenizer
433
434 // testTokenizerWeb3
435 tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
436 assert.Equal(tokens[0], "Die")
437 assert.Equal(tokens[1], "Adresse")
438 assert.Equal(tokens[2], "ist")
439 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
440 assert.Equal(tokens[4], ".")
441 assert.Equal(len(tokens), 5)
442 // Ignored in KorAP-Tokenizer
443
444 // testTokenizerServer
445 tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
446 assert.Equal(tokens[0], "Unser")
447 assert.Equal(tokens[1], "Server")
448 assert.Equal(tokens[2], "ist")
449 assert.Equal(tokens[3], "10.0.10.51")
450 assert.Equal(tokens[4], ".")
451 assert.Equal(len(tokens), 5)
452
453 // testTokenizerNum
454 tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
455 assert.Equal(tokens[0], "Zu")
456 assert.Equal(tokens[1], "50,4%")
457 assert.Equal(tokens[2], "ist")
458 assert.Equal(tokens[3], "es")
459 assert.Equal(tokens[4], "sicher")
460 assert.Equal(len(tokens), 5)
461 // Differs from KorAP-Tokenizer
462
463 // testTokenizerDate
464 tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
465 assert.Equal(tokens[0], "Der")
466 assert.Equal(tokens[1], "Termin")
467 assert.Equal(tokens[2], "ist")
468 assert.Equal(tokens[3], "am")
469 assert.Equal(tokens[4], "5.9.2018")
470 assert.Equal(len(tokens), 5)
471
472 tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
473 assert.Equal(tokens[0], "Der")
474 assert.Equal(tokens[1], "Termin")
475 assert.Equal(tokens[2], "ist")
476 assert.Equal(tokens[3], "am")
477 assert.Equal(tokens[4], "5/9/2018")
478 assert.Equal(len(tokens), 5)
479
480 // testTokenizerDateRange
481 /*
482 tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
483 assert.Equal(tokens[0], "Der")
484 assert.Equal(tokens[1], "Termin")
485 assert.Equal(tokens[2], "war")
486 assert.Equal(tokens[3], "vom")
487 assert.Equal(tokens[4], "4.")
488 assert.Equal(tokens[5], "-")
489 assert.Equal(tokens[6], "5.9.2018")
490 assert.Equal(len(tokens), 7)
491 // Ignored in KorAP-Tokenizer
492 */
493
494 // testTokenizerEmoji1
495 tokens = ttokenize(mat, w, "Das ist toll! ;)")
496 assert.Equal(tokens[0], "Das")
497 assert.Equal(tokens[1], "ist")
498 assert.Equal(tokens[2], "toll")
499 assert.Equal(tokens[3], "!")
500 assert.Equal(tokens[4], ";)")
501 assert.Equal(len(tokens), 5)
502
503 // testTokenizerRef1
504 tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
505 assert.Equal(tokens[0], "Kupietz")
506 assert.Equal(tokens[1], "und")
507 assert.Equal(tokens[2], "Schmidt")
508 assert.Equal(tokens[3], "(2018)")
509 assert.Equal(tokens[4], ":")
510 assert.Equal(tokens[5], "Korpuslinguistik")
511 assert.Equal(len(tokens), 6)
512 // Differs from KorAP-Tokenizer!
513
514 // testTokenizerRef2 () {
515 tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
516 assert.Equal(tokens[0], "Kupietz")
517 assert.Equal(tokens[1], "und")
518 assert.Equal(tokens[2], "Schmidt")
519 assert.Equal(tokens[3], "[2018]")
520 assert.Equal(tokens[4], ":")
521 assert.Equal(tokens[5], "Korpuslinguistik")
522 assert.Equal(len(tokens), 6)
523 // Differs from KorAP-Tokenizer!
524
525 // testTokenizerOmission1 () {
526 tokens = ttokenize(mat, w, "Er ist ein A****loch!")
527 assert.Equal(tokens[0], "Er")
528 assert.Equal(tokens[1], "ist")
529 assert.Equal(tokens[2], "ein")
530 assert.Equal(tokens[3], "A****loch")
531 assert.Equal(tokens[4], "!")
532 assert.Equal(len(tokens), 5)
533
534 // testTokenizerOmission2
535 tokens = ttokenize(mat, w, "F*ck!")
536 assert.Equal(tokens[0], "F*ck")
537 assert.Equal(tokens[1], "!")
538 assert.Equal(len(tokens), 2)
539
540 // testTokenizerOmission3 () {
541 tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
542 assert.Equal(tokens[0], "Dieses")
543 assert.Equal(tokens[1], "verf*****")
544 assert.Equal(tokens[2], "Kleid")
545 assert.Equal(tokens[3], "!")
546 assert.Equal(len(tokens), 4)
547
548 // Probably interpreted as HOST
549 // testTokenizerFileExtension1
550 tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
551 assert.Equal(tokens[0], "Ich")
552 assert.Equal(tokens[1], "habe")
553 assert.Equal(tokens[2], "die")
554 assert.Equal(tokens[3], "readme.txt")
555 assert.Equal(tokens[4], "heruntergeladen")
556 assert.Equal(len(tokens), 5)
557
558 // Probably interpreted as HOST
559 // testTokenizerFileExtension2
560 tokens = ttokenize(mat, w, "Nimm die README.TXT!")
561 assert.Equal(tokens[0], "Nimm")
562 assert.Equal(tokens[1], "die")
563 assert.Equal(tokens[2], "README.TXT")
564 assert.Equal(tokens[3], "!")
565 assert.Equal(len(tokens), 4)
566
567 // Probably interpreted as HOST
568 // testTokenizerFileExtension3
569 tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
570 assert.Equal(tokens[0], "Zeig")
571 assert.Equal(tokens[1], "mir")
572 assert.Equal(tokens[2], "profile.jpeg")
573 assert.Equal(len(tokens), 3)
574
575 // testTokenizerFile1
576
577 tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
578 assert.Equal(tokens[0], "Zeig")
579 assert.Equal(tokens[1], "mir")
580 assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
581 assert.Equal(len(tokens), 3)
582
583 // testTokenizerFile2
584 tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
585 assert.Equal(tokens[0], "Gehe")
586 assert.Equal(tokens[1], "zu")
587 assert.Equal(tokens[2], "/Dokumente/profile.docx")
588 assert.Equal(len(tokens), 3)
589
590 // testTokenizerFile3
591 tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
592 assert.Equal(tokens[0], "Zeig")
593 assert.Equal(tokens[1], "mir")
594 assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
595 assert.Equal(len(tokens), 3)
596 // Ignored in KorAP-Tokenizer
597
598 // testTokenizerPunct
599 tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
600 assert.Equal(tokens[0], "Er")
601 assert.Equal(tokens[1], "sagte")
602 assert.Equal(tokens[2], ":")
603 assert.Equal(tokens[3], "\"")
604 assert.Equal(tokens[4], "Es")
605 assert.Equal(tokens[5], "geht")
606 assert.Equal(tokens[6], "mir")
607 assert.Equal(tokens[7], "gut")
608 assert.Equal(tokens[8], "!")
609 assert.Equal(tokens[9], "\"")
610 assert.Equal(tokens[10], ",")
611 assert.Equal(tokens[11], "daraufhin")
612 assert.Equal(tokens[12], "ging")
613 assert.Equal(tokens[13], "er")
614 assert.Equal(tokens[14], ".")
615 assert.Equal(len(tokens), 15)
616
617 // testTokenizerPlusAmpersand
618 tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
619 assert.Equal(tokens[0], "&quot;")
620 assert.Equal(tokens[1], "Das")
621 assert.Equal(tokens[2], "ist")
622 assert.Equal(tokens[3], "von")
623 assert.Equal(tokens[4], "C&A")
624 assert.Equal(tokens[5], "!")
625 assert.Equal(tokens[6], "&quot;")
626 assert.Equal(len(tokens), 7)
627
628 // testTokenizerLongEnd
629 tokens = ttokenize(mat, w, "Siehst Du?!!?")
630 assert.Equal(tokens[0], "Siehst")
631 assert.Equal(tokens[1], "Du")
632 assert.Equal(tokens[2], "?!!?")
633 assert.Equal(len(tokens), 3)
634
635 // testTokenizerIrishO
636 tokens = ttokenize(mat, w, "Peter O'Toole")
637 assert.Equal(tokens[0], "Peter")
638 assert.Equal(tokens[1], "O'Toole")
639 assert.Equal(len(tokens), 2)
640
641 // testTokenizerAbr
642 tokens = ttokenize(mat, w, "Früher bzw. später ...")
643 assert.Equal(tokens[0], "Früher")
644 assert.Equal(tokens[1], "bzw.")
645 assert.Equal(tokens[2], "später")
646 assert.Equal(tokens[3], "...")
647 assert.Equal(len(tokens), 4)
648
649 // testTokenizerUppercaseRule
650 tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
651 assert.Equal(tokens[0], "Es")
652 assert.Equal(tokens[1], "war")
653 assert.Equal(tokens[2], "spät")
654 assert.Equal(tokens[3], ".")
655 assert.Equal(tokens[4], "Morgen")
656 assert.Equal(tokens[5], "ist")
657 assert.Equal(tokens[6], "es")
658 assert.Equal(tokens[7], "früh")
659 assert.Equal(tokens[8], ".")
660 assert.Equal(len(tokens), 9)
661 // Ignored in KorAP-Tokenizer
662
663 // testTokenizerOrd
664 tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
665 assert.Equal(tokens[0], "Sie")
666 assert.Equal(tokens[1], "erreichte")
667 assert.Equal(tokens[2], "den")
668 assert.Equal(tokens[3], "1.")
669 assert.Equal(tokens[4], "Platz")
670 assert.Equal(tokens[5], "!")
671 assert.Equal(len(tokens), 6)
672
673 // testNoZipOuputArchive
674 tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
675 assert.Equal(tokens[0], "Archive")
676 assert.Equal(tokens[1], ":")
677 assert.Equal(tokens[2], "Ich")
678 assert.Equal(tokens[3], "bin")
679 assert.Equal(tokens[4], "kein")
680 assert.Equal(tokens[5], "zip")
681 assert.Equal(6, len(tokens))
682
683 // testTokenizerStrasse
684 tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
685 assert.Equal(tokens[4], "Weststr.")
686 assert.Equal(8, len(tokens))
687
688 // germanTokenizerKnowsGermanOmissionWords
689 tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
690 assert.Equal("D'dorf", tokens[0])
691 assert.Equal("Ku'damm", tokens[1])
692 assert.Equal("Lu'hafen", tokens[2])
693 assert.Equal("M'gladbach", tokens[3])
694 assert.Equal("W'schaft", tokens[4])
695 assert.Equal(5, len(tokens))
696
697 // germanTokenizerDoesNOTSeparateGermanContractions
698 tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
699 assert.Equal("mach's", tokens[0])
700 assert.Equal("macht's", tokens[1])
701 assert.Equal("was'n", tokens[2])
702 assert.Equal("ist's", tokens[3])
703 assert.Equal("haste", tokens[4])
704 assert.Equal("willste", tokens[5])
705 assert.Equal("kannste", tokens[6])
706 assert.Equal("biste", tokens[7])
707 assert.Equal("kriegste", tokens[8])
708 assert.Equal(9, len(tokens))
709
Akron78dba062021-10-28 19:30:46 +0200710 tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
711 assert.Equal("Es", tokens[0])
712 assert.Equal("ist", tokens[1])
713 assert.Equal("gleich", tokens[2])
714 assert.Equal("2:30", tokens[3])
715 assert.Equal("Uhr", tokens[4])
716 assert.Equal(".", tokens[5])
717 assert.Equal(6, len(tokens))
718
Akron17984c82021-10-30 11:44:37 +0200719 tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
720 assert.Equal("Sie", tokens[0])
721 assert.Equal("schwamm", tokens[1])
722 assert.Equal("die", tokens[2])
723 assert.Equal("Strecke", tokens[3])
724 assert.Equal("in", tokens[4])
725 assert.Equal("00:00:57,34", tokens[5])
726 assert.Equal("00:57,341", tokens[6])
727 assert.Equal("0:57", tokens[7])
728 assert.Equal("Stunden", tokens[8])
729 assert.Equal(".", tokens[9])
730 assert.Equal(10, len(tokens))
731
Akronf1106ec2021-11-05 13:04:44 +0100732 // waste example
733 tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
734 assert.Equal(tokens[0], "Am")
735 assert.Equal(tokens[1], "24.1.1806")
736 assert.Equal(tokens[2], "feierte")
737 assert.Equal(tokens[3], "E.")
738 assert.Equal(tokens[4], "T.")
739 assert.Equal(tokens[5], "A.")
740 assert.Equal(tokens[6], "Hoffmann")
741 assert.Equal(tokens[7], "seinen")
742 assert.Equal(tokens[8], "30.")
743 assert.Equal(tokens[9], "Geburtstag")
744 assert.Equal(tokens[10], ".")
745 assert.Equal(11, len(tokens))
746
Akron9135b202021-11-06 13:16:07 +0100747 // IPtest
748 tokens = ttokenize(mat, w, "Meine IP ist 192.178.168.55.")
749 assert.Equal(tokens[0], "Meine")
750 assert.Equal(tokens[1], "IP")
751 assert.Equal(tokens[2], "ist")
752 assert.Equal(tokens[3], "192.178.168.55")
753 assert.Equal(tokens[4], ".")
754 assert.Equal(5, len(tokens))
755
Akron6742b962021-11-09 01:17:20 +0100756 // XML entities
757 tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
758 assert.Equal(tokens[0], "Das")
759 assert.Equal(tokens[1], "ist")
760 assert.Equal(tokens[2], "&nbsp;")
761 assert.Equal(tokens[3], "1:30")
762 assert.Equal(tokens[4], "Stunden")
763 assert.Equal(tokens[5], "&")
764 assert.Equal(tokens[6], "20")
765 assert.Equal(tokens[7], "Minuten")
766 assert.Equal(tokens[8], "zu")
767 assert.Equal(tokens[9], "spät")
768 assert.Equal(tokens[10], "&GT;")
769 assert.Equal(tokens[11], ".")
770 assert.Equal(12, len(tokens))
771
Akron936c0f52021-12-07 11:30:53 +0100772 // Plusampersand compounds (1)
Akrone62e8eb2021-12-03 11:59:53 +0100773 tokens = ttokenize(mat, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
774 assert.Equal(tokens[0], "Die")
775 assert.Equal(tokens[1], "2G+-Regel")
776 assert.Equal(tokens[2], "soll")
777 assert.Equal(tokens[3], "weitere")
778 assert.Equal(tokens[4], "Covid-19-Erkrankungen")
779 assert.Equal(tokens[5], "reduzieren")
780 assert.Equal(tokens[6], ".")
781 assert.Equal(7, len(tokens))
782
Akron936c0f52021-12-07 11:30:53 +0100783 // Plusampersand compounds (2)
784 tokens = ttokenize(mat, w, "Der Neu-C++-Programmierer.")
785 assert.Equal(tokens[0], "Der")
786 assert.Equal(tokens[1], "Neu-C++-Programmierer")
787 assert.Equal(tokens[2], ".")
788 assert.Equal(3, len(tokens))
789
Akron54ed7e72022-01-04 12:05:00 +0100790 // z.B.
791 tokens = ttokenize(mat, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
792 assert.Equal(tokens[0], "Dies")
793 assert.Equal(tokens[1], "sind")
794 assert.Equal(tokens[2], "z.")
795 assert.Equal(tokens[3], "B.")
796 assert.Equal(tokens[4], "zwei")
797 assert.Equal(tokens[5], "Wörter")
798 assert.Equal(tokens[6], "-")
799 assert.Equal(tokens[7], "z.")
800 assert.Equal(tokens[8], "B.")
801 assert.Equal(tokens[9], "auch")
802 assert.Equal(tokens[10], ".")
803 assert.Equal(11, len(tokens))
804
Akron28031b72021-10-02 13:07:25 +0200805 /*
806 @Test
807 public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
808 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
809 tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
810 assert.Equal("'ve", tokens[1]);
811 assert.Equal("'ll", tokens[3]);
812 assert.Equal("'d", tokens[5]);
813 assert.Equal("'m", tokens[7]);
814 assert.Equal("'re", tokens[9]);
815 assert.Equal("'s", tokens[11]);
816 assert.Equal("is", tokens[12]);
817 assert.Equal("n't", tokens[13]);
818 assert.Equal(14, len(tokens));
819 }
820
821 @Test
822 public void frenchTokenizerKnowsFrenchAbbreviations () {
823 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
824 tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
825 assert.Equal("Approx.", tokens[0]);
826 assert.Equal("juill.", tokens[2]);
827 assert.Equal("prof.", tokens[5]);
828 assert.Equal("exerc.", tokens[15]);
829 assert.Equal("no.", tokens[16]);
830 assert.Equal("pp.", tokens[21]);
831 }
832
833 @Test
834 public void frenchTokenizerKnowsFrenchContractions () {
835 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
836 tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
837 assert.Equal("J'", tokens[0]);
838 assert.Equal("j'", tokens[2]);
839 assert.Equal("qu'", tokens[4]);
840 assert.Equal("d'", tokens[6]);
841 assert.Equal("jusqu'", tokens[8]);
842 assert.Equal("Aujourd'hui", tokens[10]);
843 assert.Equal("D'", tokens[11]); // ’
844 assert.Equal("Quelqu'un", tokens[13]); // ’
845 assert.Equal("Presqu'île", tokens[14]); // ’
846 }
847
848 @Test
849 public void frenchTokenizerKnowsFrenchClitics () {
850 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
851 tokens = tokenize(dat, w, "suis-je sont-elles ")
852 assert.Equal("suis", tokens[0]);
853 assert.Equal("-je", tokens[1]);
854 assert.Equal("sont", tokens[2]);
855 assert.Equal("-elles", tokens[3]);
856 }
857
858 @Test
859 public void testEnglishTokenizerScienceAbbreviations () {
860 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
861 tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
862 assert.Equal("Approx.", tokens[0]);
863 assert.Equal("in", tokens[1]);
864 assert.Equal("Sept.", tokens[2]);
865 assert.Equal("1954", tokens[3]);
866 assert.Equal(",", tokens[4]);
867 assert.Equal("Assoc.", tokens[5]);
868 assert.Equal("Prof.", tokens[6]);
869 assert.Equal("Dr.", tokens[7]);
870 assert.Equal("R.", tokens[8]);
871 assert.Equal("J.", tokens[9]);
872 assert.Equal("Ewing", tokens[10]);
873 assert.Equal("reviewed", tokens[11]);
874 assert.Equal("articles", tokens[12]);
875 assert.Equal("on", tokens[13]);
876 assert.Equal("Enzymol.", tokens[14]);
877 assert.Equal("Bacteriol.", tokens[15]);
878 assert.Equal("effects", tokens[16]);
879 assert.Equal("later", tokens[17]);
880 assert.Equal("published", tokens[18]);
881 assert.Equal("in", tokens[19]);
882 assert.Equal("Nutr.", tokens[20]);
883 assert.Equal("Rheumatol.", tokens[21]);
884 assert.Equal("No.", tokens[22]);
885 assert.Equal("12", tokens[23]);
886 assert.Equal("and", tokens[24]);
887 assert.Equal("Nº.", tokens[25]);
888 assert.Equal("13.", tokens[26]);
889 assert.Equal(",", tokens[27]);
890 assert.Equal("pp.", tokens[28]);
891 assert.Equal("17-18", tokens[29]);
892 assert.Equal(".", tokens[30]);
893 }
894
895 @Test
896 public void englishTokenizerCanGuessWhetherIIsAbbrev () {
897 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
898 tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
899 assert.Equal("I.", tokens[1]);
900 assert.Equal("I", tokens[8]);
901 assert.Equal(".", tokens[9]);
902 assert.Equal("I", tokens[12]);
903 assert.Equal(".", tokens[13]);
904 }
905
906 @Test
907 public void testZipOuputArchive () {
908
909 final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
910 System.setOut(new PrintStream(clearOut));
911 tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
912 assert.Equal(0, len(tokens));
913 }
914 */
915 /*
916
917 @Test
918 public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
919 DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
920 .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
921 .printOffsets(true)
922 .build();
923 Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
924 assert.Equal("Text1", tokens[0].getType());
925 assert.Equal(len(tokens), 9 );
926 }
927 */
928}
929
Akronc9c0eae2021-10-22 19:49:43 +0200930func TestMatrixFullTokenizerXML(t *testing.T) {
Akron28031b72021-10-02 13:07:25 +0200931 assert := assert.New(t)
932
Akron9fb63af2021-10-28 01:15:53 +0200933 if mat == nil {
934 mat = LoadMatrixFile("testdata/tokenizer.matok")
935 }
Akron28031b72021-10-02 13:07:25 +0200936
Akron28031b72021-10-02 13:07:25 +0200937 assert.NotNil(mat)
938
939 b := make([]byte, 0, 2048)
940 w := bytes.NewBuffer(b)
941 var tokens []string
942
943 tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
944 assert.Equal("Das", tokens[0])
945 assert.Equal("<b>", tokens[1])
946 assert.Equal("beste", tokens[2])
947 assert.Equal("</b>", tokens[3])
948 assert.Equal("Fußballspiel", tokens[4])
949 assert.Equal(5, len(tokens))
950
951 tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
952 assert.Equal("Das", tokens[0])
953 assert.Equal("<b class=\"c\">", tokens[1])
954 assert.Equal("beste", tokens[2])
955 assert.Equal("</b>", tokens[3])
956 assert.Equal("Fußballspiel", tokens[4])
957 assert.Equal(5, len(tokens))
958
959 tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
960 assert.Equal("der", tokens[0])
961 assert.Equal("<x y=\"alte \">", tokens[1])
962 assert.Equal("<x x>", tokens[2])
963 assert.Equal("alte", tokens[3])
964 assert.Equal("</x>", tokens[4])
965 assert.Equal("etc.", tokens[5])
966 assert.Equal("et", tokens[6])
967 assert.Equal(".", tokens[7])
968 assert.Equal("Mann", tokens[8])
969 assert.Equal(".", tokens[9])
970 assert.Equal(10, len(tokens))
Akron066d99c2021-10-28 19:04:59 +0200971
972 tokens = ttokenize(mat, w, "das<br class=\"br\" />ging.")
973 assert.Equal("das", tokens[0])
974 assert.Equal("<br class=\"br\" />", tokens[1])
975 assert.Equal("ging", tokens[2])
976 assert.Equal(".", tokens[3])
977 assert.Equal(4, len(tokens))
Akron28031b72021-10-02 13:07:25 +0200978}
979
Akronabcb6a52021-10-09 15:52:08 +0200980func TestMatokDatokEquivalence(t *testing.T) {
981 assert := assert.New(t)
982
Akron9fb63af2021-10-28 01:15:53 +0200983 if mat == nil {
984 mat = LoadMatrixFile("testdata/tokenizer.matok")
985 }
Akronabcb6a52021-10-09 15:52:08 +0200986 dat := LoadDatokFile("testdata/tokenizer.datok")
987
988 r := strings.NewReader(s)
989
990 tb := make([]byte, 0, 2048)
991 w := bytes.NewBuffer(tb)
992
993 // Transduce with double array representation
994 dat.Transduce(r, w)
995
996 datStr := w.String()
997
998 r.Reset(s)
999 w.Reset()
1000
1001 // Transduce with matrix representation
1002 mat.Transduce(r, w)
1003
1004 matStr := w.String()
1005
1006 assert.Equal(datStr, matStr)
1007}
1008
Akronc9c0eae2021-10-22 19:49:43 +02001009func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
Akrone396a932021-10-19 01:06:13 +02001010 assert := assert.New(t)
1011
Akron9fb63af2021-10-28 01:15:53 +02001012 if mat == nil {
1013 mat = LoadMatrixFile("testdata/tokenizer.matok")
1014 }
Akrone396a932021-10-19 01:06:13 +02001015
1016 assert.NotNil(mat)
1017
1018 b := make([]byte, 0, 2048)
1019 w := bytes.NewBuffer(b)
Akrone396a932021-10-19 01:06:13 +02001020
1021 assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))
Akrona854faa2021-10-22 19:31:08 +02001022
1023 matStr := w.String()
1024
1025 assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
1026}
1027
Akronc9c0eae2021-10-22 19:49:43 +02001028func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
Akrona854faa2021-10-22 19:31:08 +02001029 assert := assert.New(t)
1030
Akron9fb63af2021-10-28 01:15:53 +02001031 if mat == nil {
1032 mat = LoadMatrixFile("testdata/tokenizer.matok")
1033 }
Akrona854faa2021-10-22 19:31:08 +02001034
1035 assert.NotNil(mat)
1036
1037 b := make([]byte, 0, 2048)
1038 w := bytes.NewBuffer(b)
1039
1040 assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
1041 matStr := w.String()
1042 assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
Akronf6bdfdb2021-10-23 15:56:53 +02001043}
Akrona854faa2021-10-22 19:31:08 +02001044
Akron22c565a2021-11-28 17:31:36 +01001045func TestMatrixFullTokenizerLongText(t *testing.T) {
1046 assert := assert.New(t)
1047
1048 if mat == nil {
1049 mat = LoadMatrixFile("testdata/tokenizer.matok")
1050 }
1051
1052 assert.NotNil(mat)
1053
1054 b := make([]byte, 0, 2048)
1055 w := bytes.NewBuffer(b)
1056
1057 text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane
1058
1059Copyright laws are changing all over the world. Be sure to check the
1060copyright laws for your country before downloading or redistributing
1061this or any other Project Gutenberg eBook.
1062
1063This header should be the first thing seen when viewing this Project
1064Gutenberg file. Please do not remove it. Do not change or edit the
1065header without written permission.
1066
1067Please read the "legal small print," and other information about the
1068eBook and Project Gutenberg at the bottom of this file. Included is
1069important information about your specific rights and restrictions in
1070how the file may be used. You can also find out about how to make a
1071donation to Project Gutenberg, and how to get involved.
1072
1073
1074**Welcome To The World of Free Plain Vanilla Electronic Texts**
1075
1076**eBooks Readable By Both Humans and By Computers, Since 1971**
1077
1078*****These eBooks Were Prepared By Thousands of Volunteers!*****
1079
1080
1081Title: Effi Briest
1082
1083Author: Theodor Fontane
1084
1085Release Date: March, 2004 [EBook #5323]
1086`
1087
1088 assert.True(mat.Transduce(strings.NewReader(text), w))
1089
1090 assert.True(strings.Contains(w.String(), "Release"))
1091}
1092
Akronf6bdfdb2021-10-23 15:56:53 +02001093func TestMatrixTrimming(t *testing.T) {
1094 assert := assert.New(t)
1095
Akron9fb63af2021-10-28 01:15:53 +02001096 if mat == nil {
1097 mat = LoadMatrixFile("testdata/tokenizer.matok")
1098 }
Akronf6bdfdb2021-10-23 15:56:53 +02001099
1100 assert.NotNil(mat)
1101
1102 b := make([]byte, 0, 2048)
1103 w := bytes.NewBuffer(b)
1104
1105 assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
1106 matStr := w.String()
1107 assert.Equal("Erste\n.\n\n\n", matStr)
Akrone396a932021-10-19 01:06:13 +02001108}
1109
Akronc9c0eae2021-10-22 19:49:43 +02001110func BenchmarkMatrixTransduce(b *testing.B) {
Akron28031b72021-10-02 13:07:25 +02001111 bu := make([]byte, 0, 2048)
1112 w := bytes.NewBuffer(bu)
1113
Akron28031b72021-10-02 13:07:25 +02001114 r := strings.NewReader(s)
1115
Akron094a4e82021-10-02 18:37:00 +02001116 mat := LoadMatrixFile("testdata/tokenizer.matok")
Akron28031b72021-10-02 13:07:25 +02001117
1118 b.ResetTimer()
1119
1120 for i := 0; i < b.N; i++ {
1121 w.Reset()
1122 r.Reset(s)
1123 ok := mat.Transduce(r, w)
1124 if !ok {
1125 fmt.Println("Fail!")
1126 fmt.Println(w.String())
1127 os.Exit(1)
1128 }
1129 }
1130}