package datokenizer

import (
	"bytes"
	"fmt"
	"os"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

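// tmatch transduces s through dat, discarding the output, and reports
// whether the transduction succeeded.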
func tmatch(dat *DaTokenizer, s string) bool {
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	return dat.Transduce(strings.NewReader(s), w)
}

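// ttokenize resets w, transduces str through dat, and returns the
// resulting tokens, split on one or more newlines (with the trailing
// empty segment dropped). On failure it returns an empty slice.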
func ttokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := dat.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}

func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.False(tmatch(dat, "baum"))
	assert.False(tmatch(dat, "baua"))
}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.True(tmatch(dat, "wahlamt"))
	assert.True(tmatch(dat, "bauen"))
	assert.True(tmatch(dat, "wahlen"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(296), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(tmatch(dat2, "bau"))
	assert.True(tmatch(dat2, "bad"))
	assert.True(tmatch(dat2, "wald gehen"))

	assert.Equal(dat.TransCount(), 17)
	assert.Equal(dat2.TransCount(), 17)
}
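
// XTestSaveLoad is a hedged round-trip sketch (X-prefixed, so `go test`
// skips it, like XTestFullTokenizerBuild below). It assumes that Save,
// as used in XTestFullTokenizerBuild, writes the same format that
// WriteTo emits and that LoadDatokFile reads; the .datok path here is
// hypothetical.
func XTestSaveLoad(t *testing.T) {
	assert := assert.New(t)
	dat := LoadFomaFile("testdata/simpletok.fst").ToDoubleArray()
	_, err := dat.Save("testdata/simpletok.datok")
	assert.Nil(err)
	dat2 := LoadDatokFile("testdata/simpletok.datok")
	assert.True(tmatch(dat2, "bau")) // behaves like the in-memory original
}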
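// TestIgnorableMCS checks that a multi-character symbol (MCS) that is
// listed in sigma but unused in the network is handled gracefully.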
func TestIgnorableMCS(t *testing.T) {
	assert := assert.New(t)
	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Only unambiguous when transducing strictly greedily!
	assert.True(dat.Transduce(strings.NewReader("ab<ab>"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>", tokens[2])
	assert.Equal(4, len(tokens))
	assert.Equal(dat.TransCount(), 15)
}

func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 145)
	assert.Equal(len(dat.sigma), 140)
	assert.True(len(dat.array) > 3600000)
	assert.True(dat.maxSize > 3600000)
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

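// The X prefix keeps `go test` from running this builder, which
// regenerates the full double array from the foma source (presumably
// too costly for a regular test run).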
func XTestFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer.fst")
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)
	// n, err := dat.Save("testdata/tokenizer.datok")
	// assert.Nil(err)
	// assert.True(n > 500)
}

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

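// TestFullTokenizerSentenceSplitter relies on the output convention:
// tokens are separated by single newlines and sentence boundaries by an
// empty line, so splitting on "\n\n" yields the sentences.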
func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}
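
// ExampleDaTokenizer_Transduce is a minimal usage sketch of the
// Transduce API, assuming testdata/tokenizer.datok is present (as in
// the tests above). Tokens are written one per line; an empty line
// marks a sentence boundary. The expected output follows the
// assertions in TestFullTokenizerSentenceSplitter.
func ExampleDaTokenizer_Transduce() {
	dat := LoadDatokFile("testdata/tokenizer.datok")
	if dat == nil {
		fmt.Println("unable to load testdata/tokenizer.datok")
		return
	}
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	dat.Transduce(strings.NewReader("Der alte Mann."), w)
	fmt.Print(w.String())
	// Output:
	// Der
	// alte
	// Mann
	// .
}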
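// TestFullTokenizerTokenSplitter mirrors the KorAP-Tokenizer test
// suite; known deviations are marked inline.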
func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1

	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

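// TestLoadFactor1 checks that a larger transducer still packs densely;
// LoadFactor appears to report the percentage of occupied cells in the
// double array.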
func TestLoadFactor1(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/abbr_bench.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.LoadFactor() > 88)
}

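// TestFullTokenizerXML checks that XML tags, including tags carrying
// attributes, are kept as single tokens.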
func TestFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))
}

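// BenchmarkTransduce measures end-to-end tokenization throughput on a
// fixed sample covering abbreviations, URLs, emails, dates, and
// contractions; see the result history at the end of this file.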
func BenchmarkTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
	Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
	Der Termin ist am 5.9.2018.
	Ich habe die readme.txt heruntergeladen.
	Ausschalten!!! Hast Du nicht gehört???
	Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
	Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
	Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
	Mach's macht's was'n ist's haste willste kannste biste kriegste.`
	r := strings.NewReader(s)

	dat := LoadDatokFile("testdata/tokenizer.datok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := dat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}

// This benchmark is deprecated, as the datok file changes over time
func XBenchmarkLoadDatokFile(b *testing.B) {
	for i := 0; i < b.N; i++ {
		dat := LoadDatokFile("testdata/tokenizer.datok")
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

func BenchmarkToDoubleArray(b *testing.B) {
	tok := LoadFomaFile("testdata/simple_bench.fst")
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

func BenchmarkToDoubleArrayLarger(b *testing.B) {
	tok := LoadFomaFile("testdata/abbr_bench.fst")
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

// 2021-08-11 (go 1.16)
// go test -bench=. -test.benchmem
// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
// 2021-08-12 (go 1.16)
// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
// 2021-08-16
// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
// 2021-08-17
// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op
// 2021-09-02 - New tokenizer - fixed loading
// BenchmarkTransduce-4 40149 31515 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 51043 22586 ns/op 10702 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 3 396009639 ns/op 6352293 B/op 2575 allocs/op
// BenchmarkTransduce-4 38698 31900 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 50644 21569 ns/op 11151 B/op 14 allocs/op
// BenchmarkToDoubleArrayLarger-4 3 441260766 ns/op 6942336 B/op 30 allocs/op
// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
// 2021-09-02 - xCheckSkip() with .9
// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op
// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op