package datok

import (
	"bytes"
	"fmt"
	"os"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

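// s is a shared sample text mixing abbreviations, URLs, email addresses,
// dates, omission words, and contractions. It is reused by
// TestMatokDatokEquivalence and BenchmarkMatrixTransduce below.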
var s string = `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. "Das ist von C&A!" Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`

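// mat caches the full tokenizer matrix across test functions; tests
// initialize it lazily, so testdata/tokenizer.matok is only parsed once.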
var mat *MatrixTokenizer

func TestMatrixFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 11)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])
	assert.Equal("!", tokens[7])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	mat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal(7, len(tokens))
}

func TestMatrixReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/simpletok.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	assert.True(tmatch(mat, "bau"))
	assert.True(tmatch(mat, "bad"))
	assert.True(tmatch(mat, "wald gehen"))
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := mat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(230), n)
	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	assert.Equal(mat.array, mat2.array)
	assert.True(tmatch(mat2, "bau"))
	assert.True(tmatch(mat2, "bad"))
	assert.True(tmatch(mat2, "wald gehen"))
}

func TestMatrixIgnorableMCS(t *testing.T) {
	assert := assert.New(t)

	// This test relies on final states, which is why it no longer
	// works correctly.

	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	mat := tok.ToMatrix()
	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Only unambiguous when transducing strictly greedily!
	assert.True(mat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(6, len(tokens))
}

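// The x prefix keeps this function from matching the TestXxx pattern,
// so the Go test runner skips it.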
func xTestMatrixReadWriteFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	foma := LoadFomaFile("testdata/tokenizer.fst")
	assert.NotNil(foma)

	mat := foma.ToMatrix()
	assert.NotNil(mat)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	assert.True(mat.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	_, err := mat.WriteTo(buf)
	assert.Nil(err)
	w.Reset()
	// assert.Equal(int64(248), n)

	mat2 := ParseMatrix(buf)
	assert.NotNil(mat2)
	assert.Equal(mat.sigma, mat2.sigma)
	assert.Equal(mat.epsilon, mat2.epsilon)
	assert.Equal(mat.unknown, mat2.unknown)
	assert.Equal(mat.identity, mat2.identity)
	assert.Equal(mat.stateCount, mat2.stateCount)
	assert.Equal(len(mat.array), len(mat2.array))
	// assert.Equal(mat.array, mat2.array)

	assert.True(mat2.Transduce(strings.NewReader("der alte baum"), w))
	assert.Equal("der\nalte\nbaum\n\n\n", w.String())
}

func TestMatrixFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(mat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(9, len(tokens))

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n\n", w.String())
}

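// A sentence boundary surfaces as an empty line in the transducer output,
// so splitting the output on "\n\n" yields one element per sentence plus
// a trailing rest.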
func TestMatrixFullTokenizerMatrixSentenceSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(mat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der F.D.P. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nF.D.P.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("\n", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("\n", sentences[1])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("\n", sentences[2])

	w.Reset()
	assert.True(mat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

func TestMatrixFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(mat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(mat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(mat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(mat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(mat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(mat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(mat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(mat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(mat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(mat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(mat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(mat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(mat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(mat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(mat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(mat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(mat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(mat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(mat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(mat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(mat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(mat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(mat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(mat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(mat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(mat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(mat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(mat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(mat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(mat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(mat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(mat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(mat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(mat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(mat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(mat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(mat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(mat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(mat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(mat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(mat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	tokens = ttokenize(mat, w, "Es ist gleich 2:30 Uhr.")
	assert.Equal("Es", tokens[0])
	assert.Equal("ist", tokens[1])
	assert.Equal("gleich", tokens[2])
	assert.Equal("2:30", tokens[3])
	assert.Equal("Uhr", tokens[4])
	assert.Equal(".", tokens[5])
	assert.Equal(6, len(tokens))

	tokens = ttokenize(mat, w, "Sie schwamm die Strecke in 00:00:57,34 00:57,341 0:57 Stunden.")
	assert.Equal("Sie", tokens[0])
	assert.Equal("schwamm", tokens[1])
	assert.Equal("die", tokens[2])
	assert.Equal("Strecke", tokens[3])
	assert.Equal("in", tokens[4])
	assert.Equal("00:00:57,34", tokens[5])
	assert.Equal("00:57,341", tokens[6])
	assert.Equal("0:57", tokens[7])
	assert.Equal("Stunden", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	// waste example
	tokens = ttokenize(mat, w, "Am 24.1.1806 feierte E. T. A. Hoffmann seinen 30. Geburtstag.")
	assert.Equal(tokens[0], "Am")
	assert.Equal(tokens[1], "24.1.1806")
	assert.Equal(tokens[2], "feierte")
	assert.Equal(tokens[3], "E.")
	assert.Equal(tokens[4], "T.")
	assert.Equal(tokens[5], "A.")
	assert.Equal(tokens[6], "Hoffmann")
	assert.Equal(tokens[7], "seinen")
	assert.Equal(tokens[8], "30.")
	assert.Equal(tokens[9], "Geburtstag")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// IP test
	tokens = ttokenize(mat, w, "Meine IP ist 192.178.168.55.")
	assert.Equal(tokens[0], "Meine")
	assert.Equal(tokens[1], "IP")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "192.178.168.55")
	assert.Equal(tokens[4], ".")
	assert.Equal(5, len(tokens))

	// XML entities
	tokens = ttokenize(mat, w, "Das ist&nbsp;1:30 Stunden&20 Minuten zu spät &GT;.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "&nbsp;")
	assert.Equal(tokens[3], "1:30")
	assert.Equal(tokens[4], "Stunden")
	assert.Equal(tokens[5], "&")
	assert.Equal(tokens[6], "20")
	assert.Equal(tokens[7], "Minuten")
	assert.Equal(tokens[8], "zu")
	assert.Equal(tokens[9], "spät")
	assert.Equal(tokens[10], "&GT;")
	assert.Equal(tokens[11], ".")
	assert.Equal(12, len(tokens))

	// Plusampersand compounds (1)
	tokens = ttokenize(mat, w, "Die 2G+-Regel soll weitere Covid-19-Erkrankungen reduzieren.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "2G+-Regel")
	assert.Equal(tokens[2], "soll")
	assert.Equal(tokens[3], "weitere")
	assert.Equal(tokens[4], "Covid-19-Erkrankungen")
	assert.Equal(tokens[5], "reduzieren")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	// Plusampersand compounds (2)
	tokens = ttokenize(mat, w, "Der Neu-C++-Programmierer.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Neu-C++-Programmierer")
	assert.Equal(tokens[2], ".")
	assert.Equal(3, len(tokens))

	// z.B.
	tokens = ttokenize(mat, w, "Dies sind z.B. zwei Wörter - z. B. auch.")
	assert.Equal(tokens[0], "Dies")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "z.")
	assert.Equal(tokens[3], "B.")
	assert.Equal(tokens[4], "zwei")
	assert.Equal(tokens[5], "Wörter")
	assert.Equal(tokens[6], "-")
	assert.Equal(tokens[7], "z.")
	assert.Equal(tokens[8], "B.")
	assert.Equal(tokens[9], "auch")
	assert.Equal(tokens[10], ".")
	assert.Equal(11, len(tokens))

	// Single quote handling
	tokens = ttokenize(mat, w, "Es heißt 'Leitungssportteams' und nicht anders.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "heißt")
	assert.Equal(tokens[2], "'")
	assert.Equal(tokens[3], "Leitungssportteams")
	assert.Equal(tokens[4], "'")
	assert.Equal(tokens[5], "und")
	assert.Equal(tokens[6], "nicht")
	assert.Equal(tokens[7], "anders")
	assert.Equal(tokens[8], ".")
	assert.Equal(9, len(tokens))

	// Apostrophe handling
	tokens = ttokenize(mat, w, "Das ist Nils’ Einkaufskorb bei McDonald's.")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "Nils’")
	assert.Equal(tokens[3], "Einkaufskorb")
	assert.Equal(tokens[4], "bei")
	assert.Equal(tokens[5], "McDonald's")
	assert.Equal(tokens[6], ".")
	assert.Equal(7, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}

func TestMatrixFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(mat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(mat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))

	tokens = ttokenize(mat, w, "das<br class=\"br\" />ging.")
	assert.Equal("das", tokens[0])
	assert.Equal("<br class=\"br\" />", tokens[1])
	assert.Equal("ging", tokens[2])
	assert.Equal(".", tokens[3])
	assert.Equal(4, len(tokens))
}

func TestMatokDatokEquivalence(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}
	dat := LoadDatokFile("testdata/tokenizer.datok")

	r := strings.NewReader(s)

	tb := make([]byte, 0, 2048)
	w := bytes.NewBuffer(tb)

	// Transduce with double array representation
	dat.Transduce(r, w)

	datStr := w.String()

	r.Reset(s)
	w.Reset()

	// Transduce with matrix representation
	mat.Transduce(r, w)

	matStr := w.String()

	assert.Equal(datStr, matStr)
}

func TestMatrixFullTokenizerCallbackTransduce(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w))

	matStr := w.String()

	assert.Equal("Der\nalte\nBaum\n.\n\nEr\nwar\nschon\nalt\n.\n\n\n", matStr)
}

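// A minimal usage sketch (an editorial addition, not part of the original
// suite): it mirrors the callback test above and assumes
// testdata/tokenizer.matok is available. Tokens are written one per line;
// an empty line marks a sentence boundary.
func ExampleMatrixTokenizer_Transduce() {
	mat := LoadMatrixFile("testdata/tokenizer.matok")
	w := bytes.NewBuffer(make([]byte, 0, 2048))
	mat.Transduce(strings.NewReader("Der alte Baum. Er war schon alt."), w)
	fmt.Print(w.String())
	// Output:
	// Der
	// alte
	// Baum
	// .
	//
	// Er
	// war
	// schon
	// alt
	// .
}
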
func TestMatrixFullTokenizerTextTreatment(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader("Erste.\n\n\n\n\x04\x0aNächst.\x04"), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\nNächst\n.\n\n\n", matStr)
}

func TestMatrixFullTokenizerLongText(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	text := `The Project Gutenberg EBook of Effi Briest, by Theodor Fontane

Copyright laws are changing all over the world. Be sure to check the
copyright laws for your country before downloading or redistributing
this or any other Project Gutenberg eBook.

This header should be the first thing seen when viewing this Project
Gutenberg file. Please do not remove it. Do not change or edit the
header without written permission.

Please read the "legal small print," and other information about the
eBook and Project Gutenberg at the bottom of this file. Included is
important information about your specific rights and restrictions in
how the file may be used. You can also find out about how to make a
donation to Project Gutenberg, and how to get involved.


**Welcome To The World of Free Plain Vanilla Electronic Texts**

**eBooks Readable By Both Humans and By Computers, Since 1971**

*****These eBooks Were Prepared By Thousands of Volunteers!*****


Title: Effi Briest

Author: Theodor Fontane

Release Date: March, 2004 [EBook #5323]
`

	assert.True(mat.Transduce(strings.NewReader(text), w))

	assert.True(strings.Contains(w.String(), "Release"))
}

func TestMatrixTrimming(t *testing.T) {
	assert := assert.New(t)

	if mat == nil {
		mat = LoadMatrixFile("testdata/tokenizer.matok")
	}

	assert.NotNil(mat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)

	assert.True(mat.Transduce(strings.NewReader(" Erste."), w))
	matStr := w.String()
	assert.Equal("Erste\n.\n\n\n", matStr)
}

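// Run with: go test -bench=BenchmarkMatrixTransduce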
func BenchmarkMatrixTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	r := strings.NewReader(s)

	mat := LoadMatrixFile("testdata/tokenizer.matok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := mat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}