blob: 1ebb167c5c8aa3ad2fa9d225de7d5a3ac6322b08 [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akronbd406802021-08-11 18:39:13 +02005 "fmt"
6 "os"
Akron03ca4252021-08-11 13:32:53 +02007 "regexp"
Akron3f8571a2021-08-05 11:18:10 +02008 "strings"
Akron8ef408b2021-08-02 22:11:04 +02009 "testing"
10
11 "github.com/stretchr/testify/assert"
12)
13
Akron1c34ce62021-09-23 23:27:39 +020014func tmatch(tok Tokenizer, s string) bool {
Akronec835ad2021-08-11 18:23:22 +020015 b := make([]byte, 0, 2048)
16 w := bytes.NewBuffer(b)
Akron1c34ce62021-09-23 23:27:39 +020017 return tok.Transduce(strings.NewReader(s), w)
Akronec835ad2021-08-11 18:23:22 +020018}
19
Akron1c34ce62021-09-23 23:27:39 +020020func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
Akronec835ad2021-08-11 18:23:22 +020021 w.Reset()
Akron1c34ce62021-09-23 23:27:39 +020022 ok := tok.Transduce(strings.NewReader(str), w)
Akronec835ad2021-08-11 18:23:22 +020023 if !ok {
24 return []string{}
25 }
26 obj := regexp.MustCompile("\n+")
27
28 tokens := obj.Split(w.String(), -1)
29 return tokens[:len(tokens)-1]
30}
31
Akron8ef408b2021-08-02 22:11:04 +020032func TestSimpleString(t *testing.T) {
33 assert := assert.New(t)
34
35 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020036 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020037 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020038 assert.True(tmatch(dat, "bau"))
39 assert.True(tmatch(dat, "bauamt"))
40 assert.False(tmatch(dat, "baum"))
Akron0630be52021-08-28 09:06:16 +020041 assert.False(tmatch(dat, "baua"))
Akron8ef408b2021-08-02 22:11:04 +020042}
Akron75ebe7f2021-08-03 10:34:10 +020043
44func TestSimpleBranches(t *testing.T) {
45 assert := assert.New(t)
46
47 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020048 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020049 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020050 assert.False(tmatch(dat, "bau"))
51 assert.True(tmatch(dat, "bauamt"))
52 assert.True(tmatch(dat, "wahlamt"))
53 assert.True(tmatch(dat, "bauen"))
54 assert.True(tmatch(dat, "wahlen"))
55 assert.False(tmatch(dat, "baum"))
Akron75ebe7f2021-08-03 10:34:10 +020056}
Akron730a79c2021-08-03 11:05:29 +020057
58func TestSimpleTokenizer(t *testing.T) {
59 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020060 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020061 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020062 assert.True(tmatch(dat, "bau"))
63 assert.True(tmatch(dat, "bad"))
64 assert.True(tmatch(dat, "wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020065}
Akron740f3d72021-08-03 12:12:34 +020066
Akron068874c2021-08-04 15:19:56 +020067func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020068 assert := assert.New(t)
69 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020070 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020071
72 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
73 b := make([]byte, 0, 2048)
74 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020075 var tokens []string
Akron524c5432021-08-05 14:14:27 +020076 dat.Transduce(r, w)
77 tokens = strings.Split(w.String(), "\n")
Akron5c82a922021-09-24 19:11:29 +020078 assert.Equal(len(tokens), 10)
Akron3f8571a2021-08-05 11:18:10 +020079 assert.Equal("wald", tokens[0])
80 assert.Equal("gehen", tokens[1])
81 assert.Equal("Da", tokens[2])
82 assert.Equal("kann", tokens[3])
83 assert.Equal("man", tokens[4])
84 assert.Equal("was", tokens[5])
85 assert.Equal("\"erleben\"", tokens[6])
86
Akron524c5432021-08-05 14:14:27 +020087 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
88 w.Reset()
89 dat.Transduce(r, w)
90 tokens = strings.Split(w.String(), "\n")
91 assert.Equal("In", tokens[0])
92 assert.Equal("den", tokens[1])
93 assert.Equal("Wald", tokens[2])
94 assert.Equal("gehen", tokens[3])
95 assert.Equal("?", tokens[4])
96 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020097
Akron524c5432021-08-05 14:14:27 +020098 r = strings.NewReader(" g? -- D")
99 w.Reset()
100 dat.Transduce(r, w)
101 tokens = strings.Split(w.String(), "\n")
102 assert.Equal("g", tokens[0])
103 assert.Equal("?", tokens[1])
104 assert.Equal("--", tokens[2])
105 assert.Equal("D", tokens[3])
106 assert.Equal("", tokens[4])
Akron6e70dc82021-08-11 11:33:18 +0200107 assert.Equal("", tokens[5])
108 assert.Equal(6, len(tokens))
Akron068874c2021-08-04 15:19:56 +0200109}
110
Akron3f8571a2021-08-05 11:18:10 +0200111func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +0200112 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +0200113 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +0200114 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +0200115 assert.True(tmatch(dat, "bau"))
116 assert.True(tmatch(dat, "bad"))
117 assert.True(tmatch(dat, "wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200118
Akron3f8571a2021-08-05 11:18:10 +0200119 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +0200120 buf := bytes.NewBuffer(b)
121 n, err := dat.WriteTo(buf)
122 assert.Nil(err)
Akron29e306f2021-09-02 18:29:56 +0200123 assert.Equal(int64(296), n)
Akron3f8571a2021-08-05 11:18:10 +0200124
125 dat2 := ParseDatok(buf)
126 assert.NotNil(dat2)
127 assert.Equal(dat.array, dat2.array)
128 assert.Equal(dat.sigma, dat2.sigma)
129 assert.Equal(dat.epsilon, dat2.epsilon)
130 assert.Equal(dat.unknown, dat2.unknown)
131 assert.Equal(dat.identity, dat2.identity)
132 assert.Equal(dat.final, dat2.final)
133 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
Akronec835ad2021-08-11 18:23:22 +0200134 assert.True(tmatch(dat2, "bau"))
135 assert.True(tmatch(dat2, "bad"))
136 assert.True(tmatch(dat2, "wald gehen"))
Akron4fa28b32021-08-27 10:55:41 +0200137
Akron92704eb2021-08-27 10:59:46 +0200138 assert.Equal(dat.TransCount(), 17)
139 assert.Equal(dat2.TransCount(), 17)
Akron6247a5d2021-08-03 19:18:28 +0200140}
141
Akron31f3c062021-08-27 10:15:13 +0200142func TestIgnorableMCS(t *testing.T) {
143 assert := assert.New(t)
144 // File has MCS in sigma but not in net
145 tok := LoadFomaFile("testdata/ignorable_mcs.fst")
146 assert.NotNil(tok)
147 dat := tok.ToDoubleArray()
148 assert.NotNil(dat)
149
150 b := make([]byte, 0, 2048)
151 w := bytes.NewBuffer(b)
152 var tokens []string
153
154 // Is only unambigous when transducing strictly greedy!
155 assert.True(dat.Transduce(strings.NewReader("ab<ab>"), w))
156 tokens = strings.Split(w.String(), "\n")
157 assert.Equal("a\nb\n<ab>\n", w.String())
158 assert.Equal("a", tokens[0])
159 assert.Equal("b", tokens[1])
160 assert.Equal("<ab>", tokens[2])
161 assert.Equal(4, len(tokens))
Akron92704eb2021-08-27 10:59:46 +0200162 assert.Equal(dat.TransCount(), 15)
Akron31f3c062021-08-27 10:15:13 +0200163}
164
Akron6247a5d2021-08-03 19:18:28 +0200165func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200166 assert := assert.New(t)
Akron3a063ef2021-08-05 19:36:35 +0200167 dat := LoadDatokFile("testdata/tokenizer.datok")
168 assert.NotNil(dat)
169 assert.True(dat.LoadFactor() >= 70)
170 assert.Equal(dat.epsilon, 1)
171 assert.Equal(dat.unknown, 2)
172 assert.Equal(dat.identity, 3)
Akron4c2a1ad2021-08-31 00:35:53 +0200173 assert.Equal(dat.final, 145)
174 assert.Equal(len(dat.sigma), 140)
Akronf1a16502021-08-16 15:24:38 +0200175 assert.True(len(dat.array) > 3600000)
176 assert.True(dat.maxSize > 3600000)
Akronec835ad2021-08-11 18:23:22 +0200177 assert.True(tmatch(dat, "bau"))
178 assert.True(tmatch(dat, "bad"))
179 assert.True(tmatch(dat, "wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200180}
Akron3f8571a2021-08-05 11:18:10 +0200181
Akron941f2152021-09-26 15:14:25 +0200182func TestTokenizerBranch(t *testing.T) {
183 assert := assert.New(t)
184 tok := LoadTokenizerFile("testdata/simpletok.datok")
185 assert.NotNil(tok)
186 assert.Equal(tok.Type(), "DATOK")
187
188 tok = LoadTokenizerFile("testdata/simpletok.matok")
189 assert.NotNil(tok)
190 assert.Equal(tok.Type(), "MATOK")
191}
192
Akrona0bded52021-08-11 15:48:02 +0200193func XTestFullTokenizerBuild(t *testing.T) {
194 assert := assert.New(t)
195 tok := LoadFomaFile("testdata/tokenizer.fst")
196 dat := tok.ToDoubleArray()
Akronde18e902021-08-27 09:34:12 +0200197 assert.NotNil(dat)
198 // n, err := dat.Save("testdata/tokenizer.datok")
199 // assert.Nil(err)
200 // assert.True(n > 500)
Akrona0bded52021-08-11 15:48:02 +0200201}
202
Akron3f8571a2021-08-05 11:18:10 +0200203func TestFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200204 assert := assert.New(t)
205
Akrona0bded52021-08-11 15:48:02 +0200206 dat := LoadDatokFile("testdata/tokenizer.datok")
Akron3610f102021-08-08 14:13:25 +0200207 assert.NotNil(dat)
208
Akron3610f102021-08-08 14:13:25 +0200209 b := make([]byte, 0, 2048)
210 w := bytes.NewBuffer(b)
211 var tokens []string
212
Akron03ca4252021-08-11 13:32:53 +0200213 assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))
Akron3610f102021-08-08 14:13:25 +0200214
215 tokens = strings.Split(w.String(), "\n")
Akron1594cb82021-08-11 11:14:56 +0200216 assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
Akron3610f102021-08-08 14:13:25 +0200217 assert.Equal("tra", tokens[0])
218 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200219 assert.Equal("", tokens[2])
Akronc5d8d432021-08-10 16:48:44 +0200220 assert.Equal("u", tokens[3])
Akronb4bbb472021-08-09 11:49:38 +0200221 assert.Equal("Du", tokens[4])
222 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200223 assert.Equal("", tokens[6])
Akron1594cb82021-08-11 11:14:56 +0200224 assert.Equal("", tokens[7])
225 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200226
227 w.Reset()
228 assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
229 assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
Akron3f8571a2021-08-05 11:18:10 +0200230}
Akronb7e1f132021-08-10 11:52:31 +0200231
232func TestFullTokenizerSentenceSplitter(t *testing.T) {
233 assert := assert.New(t)
234 dat := LoadDatokFile("testdata/tokenizer.datok")
235 assert.NotNil(dat)
236
237 b := make([]byte, 0, 2048)
238 w := bytes.NewBuffer(b)
239 var sentences []string
240
241 // testSentSplitterSimple
Akron1594cb82021-08-11 11:14:56 +0200242 assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
243 sentences = strings.Split(w.String(), "\n\n")
244
245 assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
246 assert.Equal("Der\nalte\nMann\n.", sentences[0])
247 assert.Equal("", sentences[1])
248 assert.Equal(len(sentences), 2)
249
250 w.Reset()
251 assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
252 sentences = strings.Split(w.String(), "\n\n")
253 assert.Equal(len(sentences), 2)
254 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
255 assert.Equal("", sentences[1])
256
257 w.Reset()
258 assert.True(dat.Transduce(strings.NewReader(""), w))
Akronb7e1f132021-08-10 11:52:31 +0200259 sentences = strings.Split(w.String(), "\n\n")
260 assert.Equal(len(sentences), 1)
Akron6e70dc82021-08-11 11:33:18 +0200261 assert.Equal("\n", sentences[0])
Akron1594cb82021-08-11 11:14:56 +0200262
263 w.Reset()
264 assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
265 sentences = strings.Split(w.String(), "\n\n")
266 assert.Equal(len(sentences), 2)
267
268 w.Reset()
269 assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
270 sentences = strings.Split(w.String(), "\n\n")
271 assert.Equal(len(sentences), 2)
272
Akron6e70dc82021-08-11 11:33:18 +0200273 w.Reset()
274 assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
275 sentences = strings.Split(w.String(), "\n\n")
276 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
277 assert.Equal("", sentences[1])
278 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200279
Akron6e70dc82021-08-11 11:33:18 +0200280 w.Reset()
281 assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
282 sentences = strings.Split(w.String(), "\n\n")
283 assert.Equal("", sentences[1])
284 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200285
Akron6e70dc82021-08-11 11:33:18 +0200286 w.Reset()
287 assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
288 sentences = strings.Split(w.String(), "\n\n")
289 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200290
Akron6e70dc82021-08-11 11:33:18 +0200291 w.Reset()
292 assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
293 sentences = strings.Split(w.String(), "\n\n")
294 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200295
Akron6e70dc82021-08-11 11:33:18 +0200296 w.Reset()
297 assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
298 sentences = strings.Split(w.String(), "\n\n")
299 assert.Equal(len(sentences), 2)
300 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
301 assert.Equal("", sentences[1])
Akron1594cb82021-08-11 11:14:56 +0200302
Akron6e70dc82021-08-11 11:33:18 +0200303 w.Reset()
304 assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
305 sentences = strings.Split(w.String(), "\n\n")
306 assert.Equal(len(sentences), 3)
307 assert.Equal("Ausschalten\n!!!", sentences[0])
308 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
309 assert.Equal("", sentences[2])
Akron1594cb82021-08-11 11:14:56 +0200310
Akron4af79f12021-08-11 14:48:17 +0200311 w.Reset()
312 assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
313 sentences = strings.Split(w.String(), "\n\n")
314 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200315
316 /*
317 Test:
318 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
319 */
Akronb7e1f132021-08-10 11:52:31 +0200320}
Akron03ca4252021-08-11 13:32:53 +0200321
Akron03ca4252021-08-11 13:32:53 +0200322func TestFullTokenizerTokenSplitter(t *testing.T) {
323 assert := assert.New(t)
324 dat := LoadDatokFile("testdata/tokenizer.datok")
325 assert.NotNil(dat)
326
327 b := make([]byte, 0, 2048)
328 w := bytes.NewBuffer(b)
329 var tokens []string
330
331 // testTokenizerSimple
Akronec835ad2021-08-11 18:23:22 +0200332 tokens = ttokenize(dat, w, "Der alte Mann")
Akron03ca4252021-08-11 13:32:53 +0200333 assert.Equal(tokens[0], "Der")
334 assert.Equal(tokens[1], "alte")
335 assert.Equal(tokens[2], "Mann")
336 assert.Equal(len(tokens), 3)
337
Akronec835ad2021-08-11 18:23:22 +0200338 tokens = ttokenize(dat, w, "Der alte Mann.")
Akron03ca4252021-08-11 13:32:53 +0200339 assert.Equal(tokens[0], "Der")
340 assert.Equal(tokens[1], "alte")
341 assert.Equal(tokens[2], "Mann")
342 assert.Equal(tokens[3], ".")
343 assert.Equal(len(tokens), 4)
344
345 // testTokenizerAbbr
Akronec835ad2021-08-11 18:23:22 +0200346 tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
Akron03ca4252021-08-11 13:32:53 +0200347 assert.Equal(tokens[0], "Der")
348 assert.Equal(tokens[1], "Vorsitzende")
349 assert.Equal(tokens[2], "der")
350 assert.Equal(tokens[3], "F.D.P.")
351 assert.Equal(tokens[4], "hat")
352 assert.Equal(tokens[5], "gewählt")
353 assert.Equal(len(tokens), 6)
354 // Ignored in KorAP-Tokenizer
355
356 // testTokenizerHost1
Akronec835ad2021-08-11 18:23:22 +0200357 tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200358 assert.Equal(tokens[0], "Gefunden")
359 assert.Equal(tokens[1], "auf")
360 assert.Equal(tokens[2], "wikipedia.org")
361 assert.Equal(len(tokens), 3)
362
363 // testTokenizerWwwHost
Akronec835ad2021-08-11 18:23:22 +0200364 tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200365 assert.Equal("Gefunden", tokens[0])
366 assert.Equal("auf", tokens[1])
367 assert.Equal("www.wikipedia.org", tokens[2])
368 assert.Equal(3, len(tokens))
369
370 // testTokenizerWwwUrl
Akronec835ad2021-08-11 18:23:22 +0200371 tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
Akron03ca4252021-08-11 13:32:53 +0200372 assert.Equal("www.info.biz/info", tokens[3])
373
374 // testTokenizerFtpHost
375 /*
376 tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
377 assert.Equal("Kann", tokens[0])
378 assert.Equal("von", tokens[1])
379 assert.Equal("ftp.download.org", tokens[2])
380 assert.Equal(5, len(tokens))
381 // Ignored in KorAP-Tokenizer
382 */
383
384 // testTokenizerDash
Akronec835ad2021-08-11 18:23:22 +0200385 tokens = ttokenize(dat, w, "Das war -- spitze")
Akron03ca4252021-08-11 13:32:53 +0200386 assert.Equal(tokens[0], "Das")
387 assert.Equal(tokens[1], "war")
388 assert.Equal(tokens[2], "--")
389 assert.Equal(tokens[3], "spitze")
390 assert.Equal(len(tokens), 4)
391
392 // testTokenizerEmail1
Akronec835ad2021-08-11 18:23:22 +0200393 tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
Akron03ca4252021-08-11 13:32:53 +0200394 assert.Equal(tokens[0], "Ich")
395 assert.Equal(tokens[1], "bin")
396 assert.Equal(tokens[2], "unter")
397 assert.Equal(tokens[3], "korap@ids-mannheim.de")
398 assert.Equal(tokens[4], "erreichbar")
399 assert.Equal(tokens[5], ".")
400 assert.Equal(len(tokens), 6)
401
402 // testTokenizerEmail2
Akronec835ad2021-08-11 18:23:22 +0200403 tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
Akron03ca4252021-08-11 13:32:53 +0200404 assert.Equal(tokens[0], "Oder")
405 assert.Equal(tokens[1], "unter")
406 assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
407 assert.Equal(tokens[3], ".")
408 assert.Equal(len(tokens), 4)
409
410 // testTokenizerEmail3
Akronec835ad2021-08-11 18:23:22 +0200411 tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
Akron03ca4252021-08-11 13:32:53 +0200412 assert.Equal(tokens[0], "Oder")
413 assert.Equal(tokens[1], "unter")
414 assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
415 assert.Equal(tokens[3], ".")
416 assert.Equal(len(tokens), 4)
417 // Ignored in KorAP-Tokenizer
418
419 // testTokenizerDoNotAcceptQuotedEmailNames
Akronec835ad2021-08-11 18:23:22 +0200420 tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
Akron03ca4252021-08-11 13:32:53 +0200421 assert.Equal("\"", tokens[0])
422 assert.Equal("John", tokens[1])
423 assert.Equal("Doe", tokens[2])
424 assert.Equal("\"", tokens[3])
425 assert.Equal("@xx", tokens[4])
426 assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
427 assert.Equal("com", tokens[6])
428 assert.Equal(7, len(tokens))
429
430 // testTokenizerTwitter
Akronec835ad2021-08-11 18:23:22 +0200431 tokens = ttokenize(dat, w, "Folgt @korap und #korap")
Akron03ca4252021-08-11 13:32:53 +0200432 assert.Equal(tokens[0], "Folgt")
433 assert.Equal(tokens[1], "@korap")
434 assert.Equal(tokens[2], "und")
435 assert.Equal(tokens[3], "#korap")
436 assert.Equal(len(tokens), 4)
437
438 // testTokenizerWeb1
Akronec835ad2021-08-11 18:23:22 +0200439 tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
Akron03ca4252021-08-11 13:32:53 +0200440 assert.Equal(tokens[0], "Unsere")
441 assert.Equal(tokens[1], "Website")
442 assert.Equal(tokens[2], "ist")
443 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
444 assert.Equal(len(tokens), 4)
445
446 // testTokenizerWeb2
Akronec835ad2021-08-11 18:23:22 +0200447 tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
Akron03ca4252021-08-11 13:32:53 +0200448 assert.Equal(tokens[0], "Wir")
449 assert.Equal(tokens[1], "sind")
450 assert.Equal(tokens[2], "auch")
451 assert.Equal(tokens[3], "im")
452 assert.Equal(tokens[4], "Internet")
453 assert.Equal(tokens[5], "(")
454 assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
455 assert.Equal(tokens[7], ")")
456 assert.Equal(len(tokens), 8)
457 // Ignored in KorAP-Tokenizer
458
459 // testTokenizerWeb3
Akronec835ad2021-08-11 18:23:22 +0200460 tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
Akron03ca4252021-08-11 13:32:53 +0200461 assert.Equal(tokens[0], "Die")
462 assert.Equal(tokens[1], "Adresse")
463 assert.Equal(tokens[2], "ist")
464 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
465 assert.Equal(tokens[4], ".")
466 assert.Equal(len(tokens), 5)
467 // Ignored in KorAP-Tokenizer
468
469 // testTokenizerServer
Akronec835ad2021-08-11 18:23:22 +0200470 tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
Akron03ca4252021-08-11 13:32:53 +0200471 assert.Equal(tokens[0], "Unser")
472 assert.Equal(tokens[1], "Server")
473 assert.Equal(tokens[2], "ist")
474 assert.Equal(tokens[3], "10.0.10.51")
475 assert.Equal(tokens[4], ".")
476 assert.Equal(len(tokens), 5)
477
478 // testTokenizerNum
Akronec835ad2021-08-11 18:23:22 +0200479 tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
Akron03ca4252021-08-11 13:32:53 +0200480 assert.Equal(tokens[0], "Zu")
481 assert.Equal(tokens[1], "50,4%")
482 assert.Equal(tokens[2], "ist")
483 assert.Equal(tokens[3], "es")
484 assert.Equal(tokens[4], "sicher")
485 assert.Equal(len(tokens), 5)
486 // Differs from KorAP-Tokenizer
487
488 // testTokenizerDate
Akronec835ad2021-08-11 18:23:22 +0200489 tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
Akron03ca4252021-08-11 13:32:53 +0200490 assert.Equal(tokens[0], "Der")
491 assert.Equal(tokens[1], "Termin")
492 assert.Equal(tokens[2], "ist")
493 assert.Equal(tokens[3], "am")
494 assert.Equal(tokens[4], "5.9.2018")
495 assert.Equal(len(tokens), 5)
496
Akronec835ad2021-08-11 18:23:22 +0200497 tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
Akron03ca4252021-08-11 13:32:53 +0200498 assert.Equal(tokens[0], "Der")
499 assert.Equal(tokens[1], "Termin")
500 assert.Equal(tokens[2], "ist")
501 assert.Equal(tokens[3], "am")
502 assert.Equal(tokens[4], "5/9/2018")
503 assert.Equal(len(tokens), 5)
504
505 // testTokenizerDateRange
506 /*
507 tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
508 assert.Equal(tokens[0], "Der")
509 assert.Equal(tokens[1], "Termin")
510 assert.Equal(tokens[2], "war")
511 assert.Equal(tokens[3], "vom")
512 assert.Equal(tokens[4], "4.")
513 assert.Equal(tokens[5], "-")
514 assert.Equal(tokens[6], "5.9.2018")
515 assert.Equal(len(tokens), 7)
516 // Ignored in KorAP-Tokenizer
517 */
518
519 // testTokenizerEmoji1
Akronec835ad2021-08-11 18:23:22 +0200520 tokens = ttokenize(dat, w, "Das ist toll! ;)")
Akron03ca4252021-08-11 13:32:53 +0200521 assert.Equal(tokens[0], "Das")
522 assert.Equal(tokens[1], "ist")
523 assert.Equal(tokens[2], "toll")
524 assert.Equal(tokens[3], "!")
525 assert.Equal(tokens[4], ";)")
526 assert.Equal(len(tokens), 5)
527
528 // testTokenizerRef1
Akronec835ad2021-08-11 18:23:22 +0200529 tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200530 assert.Equal(tokens[0], "Kupietz")
531 assert.Equal(tokens[1], "und")
532 assert.Equal(tokens[2], "Schmidt")
533 assert.Equal(tokens[3], "(2018)")
534 assert.Equal(tokens[4], ":")
535 assert.Equal(tokens[5], "Korpuslinguistik")
536 assert.Equal(len(tokens), 6)
537 // Differs from KorAP-Tokenizer!
538
539 // testTokenizerRef2 () {
Akronec835ad2021-08-11 18:23:22 +0200540 tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200541 assert.Equal(tokens[0], "Kupietz")
542 assert.Equal(tokens[1], "und")
543 assert.Equal(tokens[2], "Schmidt")
544 assert.Equal(tokens[3], "[2018]")
545 assert.Equal(tokens[4], ":")
546 assert.Equal(tokens[5], "Korpuslinguistik")
547 assert.Equal(len(tokens), 6)
548 // Differs from KorAP-Tokenizer!
549
550 // testTokenizerOmission1 () {
Akronec835ad2021-08-11 18:23:22 +0200551 tokens = ttokenize(dat, w, "Er ist ein A****loch!")
Akron03ca4252021-08-11 13:32:53 +0200552 assert.Equal(tokens[0], "Er")
553 assert.Equal(tokens[1], "ist")
554 assert.Equal(tokens[2], "ein")
555 assert.Equal(tokens[3], "A****loch")
556 assert.Equal(tokens[4], "!")
557 assert.Equal(len(tokens), 5)
558
559 // testTokenizerOmission2
Akronec835ad2021-08-11 18:23:22 +0200560 tokens = ttokenize(dat, w, "F*ck!")
Akron03ca4252021-08-11 13:32:53 +0200561 assert.Equal(tokens[0], "F*ck")
562 assert.Equal(tokens[1], "!")
563 assert.Equal(len(tokens), 2)
564
565 // testTokenizerOmission3 () {
Akronec835ad2021-08-11 18:23:22 +0200566 tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
Akron03ca4252021-08-11 13:32:53 +0200567 assert.Equal(tokens[0], "Dieses")
568 assert.Equal(tokens[1], "verf*****")
569 assert.Equal(tokens[2], "Kleid")
570 assert.Equal(tokens[3], "!")
571 assert.Equal(len(tokens), 4)
572
573 // Probably interpreted as HOST
574 // testTokenizerFileExtension1
Akronec835ad2021-08-11 18:23:22 +0200575 tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
Akron03ca4252021-08-11 13:32:53 +0200576 assert.Equal(tokens[0], "Ich")
577 assert.Equal(tokens[1], "habe")
578 assert.Equal(tokens[2], "die")
579 assert.Equal(tokens[3], "readme.txt")
580 assert.Equal(tokens[4], "heruntergeladen")
581 assert.Equal(len(tokens), 5)
582
583 // Probably interpreted as HOST
584 // testTokenizerFileExtension2
Akronec835ad2021-08-11 18:23:22 +0200585 tokens = ttokenize(dat, w, "Nimm die README.TXT!")
Akron03ca4252021-08-11 13:32:53 +0200586 assert.Equal(tokens[0], "Nimm")
587 assert.Equal(tokens[1], "die")
588 assert.Equal(tokens[2], "README.TXT")
589 assert.Equal(tokens[3], "!")
590 assert.Equal(len(tokens), 4)
591
592 // Probably interpreted as HOST
593 // testTokenizerFileExtension3
Akronec835ad2021-08-11 18:23:22 +0200594 tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
Akron03ca4252021-08-11 13:32:53 +0200595 assert.Equal(tokens[0], "Zeig")
596 assert.Equal(tokens[1], "mir")
597 assert.Equal(tokens[2], "profile.jpeg")
598 assert.Equal(len(tokens), 3)
599
600 // testTokenizerFile1
Akron03ca4252021-08-11 13:32:53 +0200601
Akronec835ad2021-08-11 18:23:22 +0200602 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200603 assert.Equal(tokens[0], "Zeig")
604 assert.Equal(tokens[1], "mir")
605 assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
606 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200607
Akrone8837b52021-08-11 17:29:58 +0200608 // testTokenizerFile2
Akronec835ad2021-08-11 18:23:22 +0200609 tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200610 assert.Equal(tokens[0], "Gehe")
611 assert.Equal(tokens[1], "zu")
612 assert.Equal(tokens[2], "/Dokumente/profile.docx")
613 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200614
Akrone8837b52021-08-11 17:29:58 +0200615 // testTokenizerFile3
Akronec835ad2021-08-11 18:23:22 +0200616 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
Akrone8837b52021-08-11 17:29:58 +0200617 assert.Equal(tokens[0], "Zeig")
618 assert.Equal(tokens[1], "mir")
619 assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
620 assert.Equal(len(tokens), 3)
621 // Ignored in KorAP-Tokenizer
Akron03ca4252021-08-11 13:32:53 +0200622
Akronfd92d7e2021-08-11 16:31:43 +0200623 // testTokenizerPunct
Akronec835ad2021-08-11 18:23:22 +0200624 tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
Akronfd92d7e2021-08-11 16:31:43 +0200625 assert.Equal(tokens[0], "Er")
626 assert.Equal(tokens[1], "sagte")
627 assert.Equal(tokens[2], ":")
628 assert.Equal(tokens[3], "\"")
629 assert.Equal(tokens[4], "Es")
630 assert.Equal(tokens[5], "geht")
631 assert.Equal(tokens[6], "mir")
632 assert.Equal(tokens[7], "gut")
633 assert.Equal(tokens[8], "!")
634 assert.Equal(tokens[9], "\"")
635 assert.Equal(tokens[10], ",")
636 assert.Equal(tokens[11], "daraufhin")
637 assert.Equal(tokens[12], "ging")
638 assert.Equal(tokens[13], "er")
639 assert.Equal(tokens[14], ".")
640 assert.Equal(len(tokens), 15)
Akron03ca4252021-08-11 13:32:53 +0200641
642 // testTokenizerPlusAmpersand
Akronec835ad2021-08-11 18:23:22 +0200643 tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
644 assert.Equal(tokens[0], "&quot;")
645 assert.Equal(tokens[1], "Das")
646 assert.Equal(tokens[2], "ist")
647 assert.Equal(tokens[3], "von")
648 assert.Equal(tokens[4], "C&A")
649 assert.Equal(tokens[5], "!")
650 assert.Equal(tokens[6], "&quot;")
651 assert.Equal(len(tokens), 7)
Akron03ca4252021-08-11 13:32:53 +0200652
653 // testTokenizerLongEnd
Akronec835ad2021-08-11 18:23:22 +0200654 tokens = ttokenize(dat, w, "Siehst Du?!!?")
Akron03ca4252021-08-11 13:32:53 +0200655 assert.Equal(tokens[0], "Siehst")
656 assert.Equal(tokens[1], "Du")
657 assert.Equal(tokens[2], "?!!?")
658 assert.Equal(len(tokens), 3)
659
660 // testTokenizerIrishO
Akronec835ad2021-08-11 18:23:22 +0200661 tokens = ttokenize(dat, w, "Peter O'Toole")
Akron03ca4252021-08-11 13:32:53 +0200662 assert.Equal(tokens[0], "Peter")
663 assert.Equal(tokens[1], "O'Toole")
664 assert.Equal(len(tokens), 2)
665
666 // testTokenizerAbr
Akronec835ad2021-08-11 18:23:22 +0200667 tokens = ttokenize(dat, w, "Früher bzw. später ...")
Akron03ca4252021-08-11 13:32:53 +0200668 assert.Equal(tokens[0], "Früher")
669 assert.Equal(tokens[1], "bzw.")
670 assert.Equal(tokens[2], "später")
671 assert.Equal(tokens[3], "...")
672 assert.Equal(len(tokens), 4)
673
674 // testTokenizerUppercaseRule
Akronec835ad2021-08-11 18:23:22 +0200675 tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
Akron03ca4252021-08-11 13:32:53 +0200676 assert.Equal(tokens[0], "Es")
677 assert.Equal(tokens[1], "war")
678 assert.Equal(tokens[2], "spät")
679 assert.Equal(tokens[3], ".")
680 assert.Equal(tokens[4], "Morgen")
681 assert.Equal(tokens[5], "ist")
682 assert.Equal(tokens[6], "es")
683 assert.Equal(tokens[7], "früh")
684 assert.Equal(tokens[8], ".")
685 assert.Equal(len(tokens), 9)
686 // Ignored in KorAP-Tokenizer
687
688 // testTokenizerOrd
Akronec835ad2021-08-11 18:23:22 +0200689 tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
Akrona0bded52021-08-11 15:48:02 +0200690 assert.Equal(tokens[0], "Sie")
691 assert.Equal(tokens[1], "erreichte")
692 assert.Equal(tokens[2], "den")
693 assert.Equal(tokens[3], "1.")
694 assert.Equal(tokens[4], "Platz")
695 assert.Equal(tokens[5], "!")
696 assert.Equal(len(tokens), 6)
Akron03ca4252021-08-11 13:32:53 +0200697
698 // testNoZipOuputArchive
Akronec835ad2021-08-11 18:23:22 +0200699 tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
Akron03ca4252021-08-11 13:32:53 +0200700 assert.Equal(tokens[0], "Archive")
701 assert.Equal(tokens[1], ":")
702 assert.Equal(tokens[2], "Ich")
703 assert.Equal(tokens[3], "bin")
704 assert.Equal(tokens[4], "kein")
705 assert.Equal(tokens[5], "zip")
706 assert.Equal(6, len(tokens))
707
708 // testTokenizerStrasse
Akronec835ad2021-08-11 18:23:22 +0200709 tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
Akron4af79f12021-08-11 14:48:17 +0200710 assert.Equal(tokens[4], "Weststr.")
711 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200712
713 // germanTokenizerKnowsGermanOmissionWords
Akronec835ad2021-08-11 18:23:22 +0200714 tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
Akron03ca4252021-08-11 13:32:53 +0200715 assert.Equal("D'dorf", tokens[0])
716 assert.Equal("Ku'damm", tokens[1])
717 assert.Equal("Lu'hafen", tokens[2])
718 assert.Equal("M'gladbach", tokens[3])
719 assert.Equal("W'schaft", tokens[4])
720 assert.Equal(5, len(tokens))
721
722 // germanTokenizerDoesNOTSeparateGermanContractions
Akronec835ad2021-08-11 18:23:22 +0200723 tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
Akron03ca4252021-08-11 13:32:53 +0200724 assert.Equal("mach's", tokens[0])
725 assert.Equal("macht's", tokens[1])
726 assert.Equal("was'n", tokens[2])
727 assert.Equal("ist's", tokens[3])
728 assert.Equal("haste", tokens[4])
729 assert.Equal("willste", tokens[5])
730 assert.Equal("kannste", tokens[6])
731 assert.Equal("biste", tokens[7])
732 assert.Equal("kriegste", tokens[8])
733 assert.Equal(9, len(tokens))
734
735 /*
736 @Test
737 public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
738 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
739 tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
740 assert.Equal("'ve", tokens[1]);
741 assert.Equal("'ll", tokens[3]);
742 assert.Equal("'d", tokens[5]);
743 assert.Equal("'m", tokens[7]);
744 assert.Equal("'re", tokens[9]);
745 assert.Equal("'s", tokens[11]);
746 assert.Equal("is", tokens[12]);
747 assert.Equal("n't", tokens[13]);
748 assert.Equal(14, len(tokens));
749 }
750
751 @Test
752 public void frenchTokenizerKnowsFrenchAbbreviations () {
753 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
754 tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
755 assert.Equal("Approx.", tokens[0]);
756 assert.Equal("juill.", tokens[2]);
757 assert.Equal("prof.", tokens[5]);
758 assert.Equal("exerc.", tokens[15]);
759 assert.Equal("no.", tokens[16]);
760 assert.Equal("pp.", tokens[21]);
761 }
762
763 @Test
764 public void frenchTokenizerKnowsFrenchContractions () {
765 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
766 tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
767 assert.Equal("J'", tokens[0]);
768 assert.Equal("j'", tokens[2]);
769 assert.Equal("qu'", tokens[4]);
770 assert.Equal("d'", tokens[6]);
771 assert.Equal("jusqu'", tokens[8]);
772 assert.Equal("Aujourd'hui", tokens[10]);
773 assert.Equal("D'", tokens[11]); // ’
774 assert.Equal("Quelqu'un", tokens[13]); // ’
775 assert.Equal("Presqu'île", tokens[14]); // ’
776 }
777
778 @Test
779 public void frenchTokenizerKnowsFrenchClitics () {
780 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
781 tokens = tokenize(dat, w, "suis-je sont-elles ")
782 assert.Equal("suis", tokens[0]);
783 assert.Equal("-je", tokens[1]);
784 assert.Equal("sont", tokens[2]);
785 assert.Equal("-elles", tokens[3]);
786 }
787
788 @Test
789 public void testEnglishTokenizerScienceAbbreviations () {
790 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
791 tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
792 assert.Equal("Approx.", tokens[0]);
793 assert.Equal("in", tokens[1]);
794 assert.Equal("Sept.", tokens[2]);
795 assert.Equal("1954", tokens[3]);
796 assert.Equal(",", tokens[4]);
797 assert.Equal("Assoc.", tokens[5]);
798 assert.Equal("Prof.", tokens[6]);
799 assert.Equal("Dr.", tokens[7]);
800 assert.Equal("R.", tokens[8]);
801 assert.Equal("J.", tokens[9]);
802 assert.Equal("Ewing", tokens[10]);
803 assert.Equal("reviewed", tokens[11]);
804 assert.Equal("articles", tokens[12]);
805 assert.Equal("on", tokens[13]);
806 assert.Equal("Enzymol.", tokens[14]);
807 assert.Equal("Bacteriol.", tokens[15]);
808 assert.Equal("effects", tokens[16]);
809 assert.Equal("later", tokens[17]);
810 assert.Equal("published", tokens[18]);
811 assert.Equal("in", tokens[19]);
812 assert.Equal("Nutr.", tokens[20]);
813 assert.Equal("Rheumatol.", tokens[21]);
814 assert.Equal("No.", tokens[22]);
815 assert.Equal("12", tokens[23]);
816 assert.Equal("and", tokens[24]);
817 assert.Equal("Nº.", tokens[25]);
818 assert.Equal("13.", tokens[26]);
819 assert.Equal(",", tokens[27]);
820 assert.Equal("pp.", tokens[28]);
821 assert.Equal("17-18", tokens[29]);
822 assert.Equal(".", tokens[30]);
823 }
824
825 @Test
826 public void englishTokenizerCanGuessWhetherIIsAbbrev () {
827 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
828 tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
829 assert.Equal("I.", tokens[1]);
830 assert.Equal("I", tokens[8]);
831 assert.Equal(".", tokens[9]);
832 assert.Equal("I", tokens[12]);
833 assert.Equal(".", tokens[13]);
834 }
835
836 @Test
837 public void testZipOuputArchive () {
838
839 final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
840 System.setOut(new PrintStream(clearOut));
841 tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
842 assert.Equal(0, len(tokens));
843 }
844 */
845 /*
846
847 @Test
848 public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
849 DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
850 .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
851 .printOffsets(true)
852 .build();
853 Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
854 assert.Equal("Text1", tokens[0].getType());
855 assert.Equal(len(tokens), 9 );
856 }
857 */
858}
Akronbd406802021-08-11 18:39:13 +0200859
Akron29e306f2021-09-02 18:29:56 +0200860func TestLoadFactor1(t *testing.T) {
861 assert := assert.New(t)
862 tok := LoadFomaFile("testdata/abbr_bench.fst")
863 dat := tok.ToDoubleArray()
864 assert.True(dat.LoadFactor() > 88)
865}
866
Akron4c2a1ad2021-08-31 00:35:53 +0200867func TestFullTokenizerXML(t *testing.T) {
868 assert := assert.New(t)
869
870 dat := LoadDatokFile("testdata/tokenizer.datok")
871 assert.NotNil(dat)
872
873 b := make([]byte, 0, 2048)
874 w := bytes.NewBuffer(b)
875 var tokens []string
876
877 tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
878 assert.Equal("Das", tokens[0])
879 assert.Equal("<b>", tokens[1])
880 assert.Equal("beste", tokens[2])
881 assert.Equal("</b>", tokens[3])
882 assert.Equal("Fußballspiel", tokens[4])
883 assert.Equal(5, len(tokens))
884
885 tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
886 assert.Equal("Das", tokens[0])
887 assert.Equal("<b class=\"c\">", tokens[1])
888 assert.Equal("beste", tokens[2])
889 assert.Equal("</b>", tokens[3])
890 assert.Equal("Fußballspiel", tokens[4])
891 assert.Equal(5, len(tokens))
892
893 tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
894 assert.Equal("der", tokens[0])
895 assert.Equal("<x y=\"alte \">", tokens[1])
896 assert.Equal("<x x>", tokens[2])
897 assert.Equal("alte", tokens[3])
898 assert.Equal("</x>", tokens[4])
899 assert.Equal("etc.", tokens[5])
900 assert.Equal("et", tokens[6])
901 assert.Equal(".", tokens[7])
902 assert.Equal("Mann", tokens[8])
903 assert.Equal(".", tokens[9])
904 assert.Equal(10, len(tokens))
905}
906
Akronbd406802021-08-11 18:39:13 +0200907func BenchmarkTransduce(b *testing.B) {
908 bu := make([]byte, 0, 2048)
909 w := bytes.NewBuffer(bu)
910
911 s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
912 Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
913 Der Termin ist am 5.9.2018.
914 Ich habe die readme.txt heruntergeladen.
915 Ausschalten!!! Hast Du nicht gehört???
916 Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
917 Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
918 Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
919 Mach's macht's was'n ist's haste willste kannste biste kriegste.`
920 r := strings.NewReader(s)
921
922 dat := LoadDatokFile("testdata/tokenizer.datok")
923
Akrondf37a552021-09-02 12:16:08 +0200924 b.ResetTimer()
925
Akronbd406802021-08-11 18:39:13 +0200926 for i := 0; i < b.N; i++ {
927 w.Reset()
928 r.Reset(s)
929 ok := dat.Transduce(r, w)
930 if !ok {
931 fmt.Println("Fail!")
932 fmt.Println(w.String())
933 os.Exit(1)
934 }
935 }
Akronbd406802021-08-11 18:39:13 +0200936}
Akronbb4aac52021-08-13 00:52:27 +0200937
Akron6f1c16c2021-08-17 10:45:42 +0200938// This test is deprecated as the datok file changes over time
939func XBenchmarkLoadDatokFile(b *testing.B) {
Akronbb4aac52021-08-13 00:52:27 +0200940 for i := 0; i < b.N; i++ {
941 dat := LoadDatokFile("testdata/tokenizer.datok")
942 if dat == nil {
943 fmt.Println("Fail!")
944 os.Exit(1)
945 }
946 }
947}
948
Akron6f1c16c2021-08-17 10:45:42 +0200949func BenchmarkToDoubleArray(b *testing.B) {
950 tok := LoadFomaFile("testdata/simple_bench.fst")
Akrondf37a552021-09-02 12:16:08 +0200951 b.ResetTimer()
Akron6f1c16c2021-08-17 10:45:42 +0200952 for i := 0; i < b.N; i++ {
953 dat := tok.ToDoubleArray()
954 if dat == nil {
955 fmt.Println("Fail!")
956 os.Exit(1)
957 }
958 }
959}
960
Akron7b1faa62021-09-02 16:10:21 +0200961func BenchmarkToDoubleArrayLarger(b *testing.B) {
962 tok := LoadFomaFile("testdata/abbr_bench.fst")
963 b.ResetTimer()
964 for i := 0; i < b.N; i++ {
965 dat := tok.ToDoubleArray()
966 if dat == nil {
967 fmt.Println("Fail!")
968 os.Exit(1)
969 }
970 }
971}
972
Akronbb4aac52021-08-13 00:52:27 +0200973// 2021-08-11 (go 1.16)
974// go test -bench=. -test.benchmem
975// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
Akronf1a16502021-08-16 15:24:38 +0200976// 2021-08-12 (go 1.16)
Akronbb4aac52021-08-13 00:52:27 +0200977// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
978// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
979// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
980// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
Akronf1a16502021-08-16 15:24:38 +0200981// 2021-08-16
982// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
983// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
984// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
985// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
Akronea46e8a2021-08-17 00:36:31 +0200986// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
987// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
Akron6f1c16c2021-08-17 10:45:42 +0200988// 2021-08-17
989// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
990// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
Akronde18e902021-08-27 09:34:12 +0200991// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
992// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op
Akrondf37a552021-09-02 12:16:08 +0200993// 2021-09-02 - New tokenizer - fixed loading
Akron7b1faa62021-09-02 16:10:21 +0200994// BenchmarkTransduce-4 40149 31515 ns/op 8240 B/op 3 allocs/op
995// BenchmarkToDoubleArray-4 51043 22586 ns/op 10702 B/op 29 allocs/op
996// BenchmarkToDoubleArrayLarger-4 3 396009639 ns/op 6352293 B/op 2575 allocs/op
997// BenchmarkTransduce-4 38698 31900 ns/op 8240 B/op 3 allocs/op
998// BenchmarkToDoubleArray-4 50644 21569 ns/op 11151 B/op 14 allocs/op
999// BenchmarkToDoubleArrayLarger-4 3 441260766 ns/op 6942336 B/op 30 allocs/op
1000// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
1001// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
1002// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
Akron679b4862021-09-02 16:59:26 +02001003// 2021-09-02 - xCheckSkip() with .9
1004// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
1005// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
1006// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op
Akron29e306f2021-09-02 18:29:56 +02001007// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
1008// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
1009// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
1010// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op