package datokenizer

import (
	"bytes"
	"fmt"
	"os"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

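// tmatch transduces a single string and reports whether the
// transduction was successful.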
func tmatch(dat *DaTokenizer, s string) bool {
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	return dat.Transduce(strings.NewReader(s), w)
}

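// ttokenize transduces a string and returns the resulting tokens,
// splitting the transducer output on newlines and dropping the
// trailing empty segment.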
func ttokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := dat.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}

func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.False(tmatch(dat, "baum"))
	assert.False(tmatch(dat, "baua"))
}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.True(tmatch(dat, "wahlamt"))
	assert.True(tmatch(dat, "bauen"))
	assert.True(tmatch(dat, "wahlen"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))

	assert.True(dat.LoadFactor() >= 75)

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(208), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(tmatch(dat2, "bau"))
	assert.True(tmatch(dat2, "bad"))
	assert.True(tmatch(dat2, "wald gehen"))

	assert.Equal(dat.TransCount(), 17)
	assert.Equal(dat2.TransCount(), 17)
}

func TestIgnorableMCS(t *testing.T) {
	assert := assert.New(t)
	// File has an MCS (multi-character symbol) in sigma but not in the net
	tok := LoadFomaFile("testdata/ignorable_mcs.fst")
	assert.NotNil(tok)
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// Only unambiguous when transducing strictly greedily!
	assert.True(dat.Transduce(strings.NewReader("ab<ab>"), w))
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("a\nb\n<ab>\n", w.String())
	assert.Equal("a", tokens[0])
	assert.Equal("b", tokens[1])
	assert.Equal("<ab>", tokens[2])
	assert.Equal(4, len(tokens))
	assert.Equal(dat.TransCount(), 15)
}

func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 145)
	assert.Equal(len(dat.sigma), 140)
	assert.True(len(dat.array) > 3600000)
	assert.True(dat.maxSize > 3600000)
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

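// Prefixed with X so the test runner skips it: rebuilding the full
// double array from testdata/tokenizer.fst takes a while, and the other
// tests rely on the prebuilt testdata/tokenizer.datok instead.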
func XTestFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer.fst")
	dat := tok.ToDoubleArray()
	assert.NotNil(dat)
	// n, err := dat.Save("testdata/tokenizer.datok")
	// assert.Nil(err)
	// assert.True(n > 500)
}

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

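// ExampleDaTokenizer_Transduce is a minimal usage sketch (not ported from
// KorAP-Tokenizer): it assumes the prebuilt testdata/tokenizer.datok used
// by the tests above and mirrors the expectation asserted in
// TestFullTokenizerSentenceSplitter below.
func ExampleDaTokenizer_Transduce() {
	dat := LoadDatokFile("testdata/tokenizer.datok")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	dat.Transduce(strings.NewReader("Der alte Mann."), w)
	fmt.Print(w.String())
	// Output:
	// Der
	// alte
	// Mann
	// .
}
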
func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
					.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
					.printOffsets(true)
					.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}

func TestFullTokenizerXML(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b>", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
	assert.Equal("Das", tokens[0])
	assert.Equal("<b class=\"c\">", tokens[1])
	assert.Equal("beste", tokens[2])
	assert.Equal("</b>", tokens[3])
	assert.Equal("Fußballspiel", tokens[4])
	assert.Equal(5, len(tokens))

	tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
	assert.Equal("der", tokens[0])
	assert.Equal("<x y=\"alte \">", tokens[1])
	assert.Equal("<x x>", tokens[2])
	assert.Equal("alte", tokens[3])
	assert.Equal("</x>", tokens[4])
	assert.Equal("etc.", tokens[5])
	assert.Equal("et", tokens[6])
	assert.Equal(".", tokens[7])
	assert.Equal("Mann", tokens[8])
	assert.Equal(".", tokens[9])
	assert.Equal(10, len(tokens))
}

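// BenchmarkTransduce measures a full transduction run over a mixed sample
// text; results are collected in the comment block at the end of this file
// (run with: go test -bench=. -test.benchmem).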
func BenchmarkTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
Der Termin ist am 5.9.2018.
Ich habe die readme.txt heruntergeladen.
Ausschalten!!! Hast Du nicht gehört???
Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
Mach's macht's was'n ist's haste willste kannste biste kriegste.`
	r := strings.NewReader(s)

	dat := LoadDatokFile("testdata/tokenizer.datok")

	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		w.Reset()
		r.Reset(s)
		ok := dat.Transduce(r, w)
		if !ok {
			fmt.Println("Fail!")
			fmt.Println(w.String())
			os.Exit(1)
		}
	}
}

// This benchmark is deprecated, as the datok file changes over time
func XBenchmarkLoadDatokFile(b *testing.B) {
	for i := 0; i < b.N; i++ {
		dat := LoadDatokFile("testdata/tokenizer.datok")
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

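// The following benchmarks measure double-array construction from foma
// FSTs of different sizes (simple_bench.fst and abbr_bench.fst).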
func BenchmarkToDoubleArray(b *testing.B) {
	tok := LoadFomaFile("testdata/simple_bench.fst")
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

func BenchmarkToDoubleArrayLarger(b *testing.B) {
	tok := LoadFomaFile("testdata/abbr_bench.fst")
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		dat := tok.ToDoubleArray()
		if dat == nil {
			fmt.Println("Fail!")
			os.Exit(1)
		}
	}
}

// 2021-08-11 (go 1.16)
// go test -bench=. -test.benchmem
//   BenchmarkTransduce-4               19069     60609 ns/op     11048 B/op    137 allocs/op
// 2021-08-12 (go 1.16)
//   BenchmarkTransduce-4               20833     55241 ns/op      9676 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4               4 258418169 ns/op  29916470 B/op   5697 allocs/op
//   BenchmarkTransduce-4               19430     58133 ns/op     18696 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4               8 139071939 ns/op 203158377 B/op   5742 allocs/op
// 2021-08-16
//   BenchmarkTransduce-4               22251     49989 ns/op     17370 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4               8 138937532 ns/op 203158327 B/op   5742 allocs/op
//   BenchmarkTransduce-4               22005     48665 ns/op     17472 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4               7 143143934 ns/op 203158450 B/op   5743 allocs/op
//   BenchmarkTransduce-4               34939     34363 ns/op     14056 B/op      3 allocs/op
//   BenchmarkLoadDatokFile-4               7 149511609 ns/op 203217193 B/op   5915 allocs/op
// 2021-08-17
//   BenchmarkTransduce-4               31204     32678 ns/op     14752 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4           44138     26850 ns/op     10704 B/op     29 allocs/op
//   BenchmarkTransduce-4               29376     34562 ns/op     15157 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4           54441     21355 ns/op     10704 B/op     29 allocs/op
// 2021-09-02 - New tokenizer - fixed loading
//   BenchmarkTransduce-4               40149     31515 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4           51043     22586 ns/op     10702 B/op     29 allocs/op
//   BenchmarkToDoubleArrayLarger-4         3 396009639 ns/op   6352293 B/op   2575 allocs/op
//   BenchmarkTransduce-4               38698     31900 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4           50644     21569 ns/op     11151 B/op     14 allocs/op
//   BenchmarkToDoubleArrayLarger-4         3 441260766 ns/op   6942336 B/op     30 allocs/op
//   BenchmarkTransduce-4               39966     30835 ns/op      8240 B/op      3 allocs/op
//   BenchmarkToDoubleArray-4           50720     24863 ns/op     11091 B/op     46 allocs/op
//   BenchmarkToDoubleArrayLarger-4         3 432523828 ns/op   6413381 B/op   5122 allocs/op