package datok

import (
	"bytes"
	"fmt"
	"os"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

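// tmatch reports whether the tokenizer can successfully
// transduce the given string.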
func tmatch(tok Tokenizer, s string) bool {
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	return tok.Transduce(strings.NewReader(s), w)
}

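// ttokenize transduces a string and returns the resulting tokens,
// collapsing newline runs (token and sentence boundaries) and dropping
// the trailing empty segment; it returns an empty slice if the
// transduction fails.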
func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := tok.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}
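
// The transducer output encodes one token per line and marks sentence
// boundaries with an empty line (see e.g. "Der\nalte\nMann\n.\n\n" in the
// assertions below). tsentencize is a minimal sentence-level counterpart
// to ttokenize, sketched from the splitting pattern used in
// TestFullTokenizerSentenceSplitter; it is illustrative and not used by
// the tests.
func tsentencize(tok Tokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	if !tok.Transduce(strings.NewReader(str), w) {
		return []string{}
	}
	// A double newline separates sentences in the output stream.
	return strings.Split(w.String(), "\n\n")
}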

func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.False(tmatch(dat, "baum"))
	assert.True(tmatch(dat, "baua"))
}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.True(tmatch(dat, "wahlamt"))
	assert.True(tmatch(dat, "bauen"))
	assert.True(tmatch(dat, "wahlen"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal(len(tokens), 10)
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(296), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(tmatch(dat2, "bau"))
	assert.True(tmatch(dat2, "bad"))
	assert.True(tmatch(dat2, "wald gehen"))

	assert.Equal(dat.TransCount(), 17)
	assert.Equal(dat2.TransCount(), 17)
}
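
// A file-based round-trip sketch complementing the buffer-based test above.
// It assumes WriteTo and ParseDatok accept arbitrary io.Writer/io.Reader
// values (the test only exercises *bytes.Buffer); the X prefix keeps it out
// of the regular test run, following the convention used elsewhere in this
// file.
func XTestReadWriteTokenizerFile(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	f, err := os.CreateTemp("", "datok")
	assert.Nil(err)
	defer os.Remove(f.Name())
	defer f.Close()

	_, err = dat.WriteTo(f)
	assert.Nil(err)
	_, err = f.Seek(0, 0)
	assert.Nil(err)

	dat2 := ParseDatok(f)
	assert.NotNil(dat2)
	assert.True(tmatch(dat2, "bau"))
}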

func TestIgnorableMCS(t *testing.T) {

	// This test relies on final states, which is why it
	// no longer works correctly.

	assert := assert.New(t)
	// File has MCS in sigma but not in net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// This is only unambiguous when transducing strictly greedily!
	assert.True(dat.Transduce(strings.NewReader("ab<ab>a"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>a\n\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>a", tokens[2])
	assert.Equal(5, len(tokens))
	assert.Equal(dat.TransCount(), 15)
}

func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 145)
	assert.Equal(len(dat.sigma), 140)
	assert.True(len(dat.array) > 3600000)
	assert.True(dat.maxSize > 3600000)
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

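// Both testdata files encode the same simple tokenizer in the two
// available representations: a double array ("DATOK") and a matrix
// ("MATOK").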
func TestTokenizerBranch(t *testing.T) {
	assert := assert.New(t)
	tok := LoadTokenizerFile("testdata/simpletok.datok")
	assert.NotNil(tok)
	assert.Equal(tok.Type(), "DATOK")

	tok = LoadTokenizerFile("testdata/simpletok.matok")
	assert.NotNil(tok)
	assert.Equal(tok.Type(), "MATOK")
}

func XTestFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer.fst")
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)
	// n, err := dat.Save("testdata/tokenizer.datok")
	// assert.Nil(err)
	// assert.True(n > 500)
}
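
// When the Save lines above are re-enabled, the resulting
// testdata/tokenizer.datok is presumably the file consumed by
// LoadDatokFile in the tests below.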

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1

	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

func TestLoadFactor1(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/abbr_bench.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.LoadFactor() > 88)
}

func TestFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))
}

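// BenchmarkTransduce measures transduction throughput on a paragraph
// covering most of the token classes exercised in the tests above.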
func BenchmarkTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`
	r := strings.NewReader(s)

	dat := LoadDatokFile("testdata/tokenizer.datok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := dat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}

// This benchmark is deprecated, as the datok file changes over time
func XBenchmarkLoadDatokFile(b *testing.B) {
	for i := 0; i < b.N; i++ {
		dat := LoadDatokFile("testdata/tokenizer.datok")
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

func BenchmarkToDoubleArray(b *testing.B) {
	tok := LoadFomaFile("testdata/simple_bench.fst")
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

func BenchmarkToDoubleArrayLarger(b *testing.B) {
	tok := LoadFomaFile("testdata/abbr_bench.fst")
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

// 2021-08-11 (go 1.16)
// go test -bench=. -test.benchmem
//   BenchmarkTransduce-4              19069      60609 ns/op     11048 B/op    137 allocs/op
// 2021-08-12 (go 1.16)
//   BenchmarkTransduce-4              20833      55241 ns/op      9676 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4              4  258418169 ns/op  29916470 B/op   5697 allocs/op
//   BenchmarkTransduce-4              19430      58133 ns/op     18696 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4              8  139071939 ns/op 203158377 B/op   5742 allocs/op
// 2021-08-16
//   BenchmarkTransduce-4              22251      49989 ns/op     17370 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4              8  138937532 ns/op 203158327 B/op   5742 allocs/op
//   BenchmarkTransduce-4              22005      48665 ns/op     17472 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4              7  143143934 ns/op 203158450 B/op   5743 allocs/op
//   BenchmarkTransduce-4              34939      34363 ns/op     14056 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4              7  149511609 ns/op 203217193 B/op   5915 allocs/op
// 2021-08-17
//   BenchmarkTransduce-4              31204      32678 ns/op     14752 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          44138      26850 ns/op     10704 B/op     29 allocs/op
//   BenchmarkTransduce-4              29376      34562 ns/op     15157 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          54441      21355 ns/op     10704 B/op     29 allocs/op
// 2021-09-02 - New tokenizer - fixed loading
//   BenchmarkTransduce-4              40149      31515 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          51043      22586 ns/op     10702 B/op     29 allocs/op
//   BenchmarkToDoubleArrayLarger-4        3  396009639 ns/op   6352293 B/op   2575 allocs/op
//   BenchmarkTransduce-4              38698      31900 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          50644      21569 ns/op     11151 B/op     14 allocs/op
//   BenchmarkToDoubleArrayLarger-4        3  441260766 ns/op   6942336 B/op     30 allocs/op
//   BenchmarkTransduce-4              39966      30835 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          50720      24863 ns/op     11091 B/op     46 allocs/op
//   BenchmarkToDoubleArrayLarger-4        3  432523828 ns/op   6413381 B/op   5122 allocs/op
// 2021-09-02 - xCheckSkip() with .9
//   BenchmarkTransduce-4              36325      38501 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          66858      19286 ns/op     10607 B/op     29 allocs/op
//   BenchmarkToDoubleArrayLarger-4       18   67428011 ns/op   6360604 B/op   2578 allocs/op
// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
//   BenchmarkTransduce-4              37105      27714 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          76600      15973 ns/op     10703 B/op     29 allocs/op
//   BenchmarkToDoubleArrayLarger-4       21   55161934 ns/op   6357889 B/op   2578 allocs/op
// 2021-09-30 - Go 1.17.1
//   BenchmarkTransduce-4              47222      25962 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          69192      17355 ns/op     10704 B/op     29 allocs/op
//   BenchmarkToDoubleArrayLarger-4       16   65042885 ns/op   6357794 B/op   2576 allocs/op
//   BenchmarkTransduceMatrix-4        45404      25156 ns/op      8240 B/op      3 allocs/op
// 2021-10-02
//   BenchmarkTransduce-4              47676      25398 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          71919      16083 ns/op     10702 B/op     29 allocs/op
//   BenchmarkToDoubleArrayLarger-4       16   68012819 ns/op   6357920 B/op   2578 allocs/op
//   BenchmarkTransduceMatrix-4        51529      23678 ns/op      8240 B/op      3 allocs/op
// 2021-10-12 - Introduction of Callbacks in Matrix
//   BenchmarkTransduce-4              46947      26043 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4          65192      16501 ns/op     10703 B/op     29 allocs/op
//   BenchmarkToDoubleArrayLarger-4       15   69263576 ns/op   6357859 B/op   2577 allocs/op
//   BenchmarkTransduceMatrix-4        49928      26313 ns/op     12408 B/op      6 allocs/op
// 2021-10-18 - Introduction of Callbacks in DA
//   BenchmarkTransduce-4              41055      30058 ns/op     12408 B/op      6 allocs/op
//   BenchmarkToDoubleArray-4          64672      17659 ns/op     10703 B/op     29 allocs/op
//   BenchmarkToDoubleArrayLarger-4       15   71640553 ns/op   6357865 B/op   2577 allocs/op
//   BenchmarkTransduceMatrix-4        47036      26009 ns/op     12408 B/op      6 allocs/op