blob: 8c2b8948c4701568366be027963b7620199cb4ee [file] [log] [blame]
Akron7f1097f2021-09-21 16:00:29 +02001package datok
Akron8ef408b2021-08-02 22:11:04 +02002
3import (
Akron6247a5d2021-08-03 19:18:28 +02004 "bytes"
Akronbd406802021-08-11 18:39:13 +02005 "fmt"
6 "os"
Akron03ca4252021-08-11 13:32:53 +02007 "regexp"
Akron3f8571a2021-08-05 11:18:10 +02008 "strings"
Akron8ef408b2021-08-02 22:11:04 +02009 "testing"
10
11 "github.com/stretchr/testify/assert"
12)
13
Akron1c34ce62021-09-23 23:27:39 +020014func tmatch(tok Tokenizer, s string) bool {
Akronec835ad2021-08-11 18:23:22 +020015 b := make([]byte, 0, 2048)
16 w := bytes.NewBuffer(b)
Akron1c34ce62021-09-23 23:27:39 +020017 return tok.Transduce(strings.NewReader(s), w)
Akronec835ad2021-08-11 18:23:22 +020018}
19
Akron1c34ce62021-09-23 23:27:39 +020020func ttokenize(tok Tokenizer, w *bytes.Buffer, str string) []string {
Akronec835ad2021-08-11 18:23:22 +020021 w.Reset()
Akron1c34ce62021-09-23 23:27:39 +020022 ok := tok.Transduce(strings.NewReader(str), w)
Akronec835ad2021-08-11 18:23:22 +020023 if !ok {
24 return []string{}
25 }
26 obj := regexp.MustCompile("\n+")
27
28 tokens := obj.Split(w.String(), -1)
29 return tokens[:len(tokens)-1]
30}
31
Akron8ef408b2021-08-02 22:11:04 +020032func TestSimpleString(t *testing.T) {
33 assert := assert.New(t)
34
35 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020036 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020037 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020038 assert.True(tmatch(dat, "bau"))
39 assert.True(tmatch(dat, "bauamt"))
40 assert.False(tmatch(dat, "baum"))
Akron0630be52021-08-28 09:06:16 +020041 assert.False(tmatch(dat, "baua"))
Akron8ef408b2021-08-02 22:11:04 +020042}
Akron75ebe7f2021-08-03 10:34:10 +020043
44func TestSimpleBranches(t *testing.T) {
45 assert := assert.New(t)
46
47 // (bau | wahl) (amt | en)
Akron64ffd9a2021-08-03 19:55:21 +020048 tok := LoadFomaFile("testdata/wahlamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020049 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020050 assert.False(tmatch(dat, "bau"))
51 assert.True(tmatch(dat, "bauamt"))
52 assert.True(tmatch(dat, "wahlamt"))
53 assert.True(tmatch(dat, "bauen"))
54 assert.True(tmatch(dat, "wahlen"))
55 assert.False(tmatch(dat, "baum"))
Akron75ebe7f2021-08-03 10:34:10 +020056}
Akron730a79c2021-08-03 11:05:29 +020057
58func TestSimpleTokenizer(t *testing.T) {
59 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +020060 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +020061 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +020062 assert.True(tmatch(dat, "bau"))
63 assert.True(tmatch(dat, "bad"))
64 assert.True(tmatch(dat, "wald gehen"))
Akron730a79c2021-08-03 11:05:29 +020065}
Akron740f3d72021-08-03 12:12:34 +020066
Akron068874c2021-08-04 15:19:56 +020067func TestSimpleTokenizerTransduce(t *testing.T) {
Akron84d68e62021-08-04 17:06:52 +020068 assert := assert.New(t)
69 tok := LoadFomaFile("testdata/simpletok.fst")
Akron84d68e62021-08-04 17:06:52 +020070 dat := tok.ToDoubleArray()
Akron3f8571a2021-08-05 11:18:10 +020071
72 r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
73 b := make([]byte, 0, 2048)
74 w := bytes.NewBuffer(b)
Akron524c5432021-08-05 14:14:27 +020075 var tokens []string
Akron524c5432021-08-05 14:14:27 +020076 dat.Transduce(r, w)
77 tokens = strings.Split(w.String(), "\n")
Akron5c82a922021-09-24 19:11:29 +020078 assert.Equal(len(tokens), 10)
Akron3f8571a2021-08-05 11:18:10 +020079 assert.Equal("wald", tokens[0])
80 assert.Equal("gehen", tokens[1])
81 assert.Equal("Da", tokens[2])
82 assert.Equal("kann", tokens[3])
83 assert.Equal("man", tokens[4])
84 assert.Equal("was", tokens[5])
85 assert.Equal("\"erleben\"", tokens[6])
86
Akron524c5432021-08-05 14:14:27 +020087 r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
88 w.Reset()
89 dat.Transduce(r, w)
90 tokens = strings.Split(w.String(), "\n")
91 assert.Equal("In", tokens[0])
92 assert.Equal("den", tokens[1])
93 assert.Equal("Wald", tokens[2])
94 assert.Equal("gehen", tokens[3])
95 assert.Equal("?", tokens[4])
96 assert.Equal("--", tokens[5])
Akron3f8571a2021-08-05 11:18:10 +020097
Akron524c5432021-08-05 14:14:27 +020098 r = strings.NewReader(" g? -- D")
99 w.Reset()
100 dat.Transduce(r, w)
101 tokens = strings.Split(w.String(), "\n")
102 assert.Equal("g", tokens[0])
103 assert.Equal("?", tokens[1])
104 assert.Equal("--", tokens[2])
105 assert.Equal("D", tokens[3])
106 assert.Equal("", tokens[4])
Akron6e70dc82021-08-11 11:33:18 +0200107 assert.Equal("", tokens[5])
108 assert.Equal(6, len(tokens))
Akron068874c2021-08-04 15:19:56 +0200109}
110
Akron3f8571a2021-08-05 11:18:10 +0200111func TestReadWriteTokenizer(t *testing.T) {
Akron740f3d72021-08-03 12:12:34 +0200112 assert := assert.New(t)
Akron64ffd9a2021-08-03 19:55:21 +0200113 tok := LoadFomaFile("testdata/simpletok.fst")
Akronf2120ca2021-08-03 16:26:41 +0200114 dat := tok.ToDoubleArray()
Akronec835ad2021-08-11 18:23:22 +0200115 assert.True(tmatch(dat, "bau"))
116 assert.True(tmatch(dat, "bad"))
117 assert.True(tmatch(dat, "wald gehen"))
Akron6247a5d2021-08-03 19:18:28 +0200118
Akron3f8571a2021-08-05 11:18:10 +0200119 b := make([]byte, 0, 1024)
Akron6247a5d2021-08-03 19:18:28 +0200120 buf := bytes.NewBuffer(b)
121 n, err := dat.WriteTo(buf)
122 assert.Nil(err)
Akron29e306f2021-09-02 18:29:56 +0200123 assert.Equal(int64(296), n)
Akron3f8571a2021-08-05 11:18:10 +0200124
125 dat2 := ParseDatok(buf)
126 assert.NotNil(dat2)
127 assert.Equal(dat.array, dat2.array)
128 assert.Equal(dat.sigma, dat2.sigma)
129 assert.Equal(dat.epsilon, dat2.epsilon)
130 assert.Equal(dat.unknown, dat2.unknown)
131 assert.Equal(dat.identity, dat2.identity)
132 assert.Equal(dat.final, dat2.final)
133 assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
Akronec835ad2021-08-11 18:23:22 +0200134 assert.True(tmatch(dat2, "bau"))
135 assert.True(tmatch(dat2, "bad"))
136 assert.True(tmatch(dat2, "wald gehen"))
Akron4fa28b32021-08-27 10:55:41 +0200137
Akron92704eb2021-08-27 10:59:46 +0200138 assert.Equal(dat.TransCount(), 17)
139 assert.Equal(dat2.TransCount(), 17)
Akron6247a5d2021-08-03 19:18:28 +0200140}
141
Akron31f3c062021-08-27 10:15:13 +0200142func TestIgnorableMCS(t *testing.T) {
143 assert := assert.New(t)
144 // File has MCS in sigma but not in net
145 tok := LoadFomaFile("testdata/ignorable_mcs.fst")
146 assert.NotNil(tok)
147 dat := tok.ToDoubleArray()
148 assert.NotNil(dat)
149
150 b := make([]byte, 0, 2048)
151 w := bytes.NewBuffer(b)
152 var tokens []string
153
154 // Is only unambigous when transducing strictly greedy!
155 assert.True(dat.Transduce(strings.NewReader("ab<ab>"), w))
156 tokens = strings.Split(w.String(), "\n")
157 assert.Equal("a\nb\n<ab>\n", w.String())
158 assert.Equal("a", tokens[0])
159 assert.Equal("b", tokens[1])
160 assert.Equal("<ab>", tokens[2])
161 assert.Equal(4, len(tokens))
Akron92704eb2021-08-27 10:59:46 +0200162 assert.Equal(dat.TransCount(), 15)
Akron31f3c062021-08-27 10:15:13 +0200163}
164
Akron6247a5d2021-08-03 19:18:28 +0200165func TestFullTokenizer(t *testing.T) {
Akron3a063ef2021-08-05 19:36:35 +0200166 assert := assert.New(t)
Akron3a063ef2021-08-05 19:36:35 +0200167 dat := LoadDatokFile("testdata/tokenizer.datok")
168 assert.NotNil(dat)
169 assert.True(dat.LoadFactor() >= 70)
170 assert.Equal(dat.epsilon, 1)
171 assert.Equal(dat.unknown, 2)
172 assert.Equal(dat.identity, 3)
Akron4c2a1ad2021-08-31 00:35:53 +0200173 assert.Equal(dat.final, 145)
174 assert.Equal(len(dat.sigma), 140)
Akronf1a16502021-08-16 15:24:38 +0200175 assert.True(len(dat.array) > 3600000)
176 assert.True(dat.maxSize > 3600000)
Akronec835ad2021-08-11 18:23:22 +0200177 assert.True(tmatch(dat, "bau"))
178 assert.True(tmatch(dat, "bad"))
179 assert.True(tmatch(dat, "wald gehen"))
Akron740f3d72021-08-03 12:12:34 +0200180}
Akron3f8571a2021-08-05 11:18:10 +0200181
Akrona0bded52021-08-11 15:48:02 +0200182func XTestFullTokenizerBuild(t *testing.T) {
183 assert := assert.New(t)
184 tok := LoadFomaFile("testdata/tokenizer.fst")
185 dat := tok.ToDoubleArray()
Akronde18e902021-08-27 09:34:12 +0200186 assert.NotNil(dat)
187 // n, err := dat.Save("testdata/tokenizer.datok")
188 // assert.Nil(err)
189 // assert.True(n > 500)
Akrona0bded52021-08-11 15:48:02 +0200190}
191
Akron3f8571a2021-08-05 11:18:10 +0200192func TestFullTokenizerTransduce(t *testing.T) {
Akron3610f102021-08-08 14:13:25 +0200193 assert := assert.New(t)
194
Akrona0bded52021-08-11 15:48:02 +0200195 dat := LoadDatokFile("testdata/tokenizer.datok")
Akron3610f102021-08-08 14:13:25 +0200196 assert.NotNil(dat)
197
Akron3610f102021-08-08 14:13:25 +0200198 b := make([]byte, 0, 2048)
199 w := bytes.NewBuffer(b)
200 var tokens []string
201
Akron03ca4252021-08-11 13:32:53 +0200202 assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))
Akron3610f102021-08-08 14:13:25 +0200203
204 tokens = strings.Split(w.String(), "\n")
Akron1594cb82021-08-11 11:14:56 +0200205 assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
Akron3610f102021-08-08 14:13:25 +0200206 assert.Equal("tra", tokens[0])
207 assert.Equal(".", tokens[1])
Akronb4bbb472021-08-09 11:49:38 +0200208 assert.Equal("", tokens[2])
Akronc5d8d432021-08-10 16:48:44 +0200209 assert.Equal("u", tokens[3])
Akronb4bbb472021-08-09 11:49:38 +0200210 assert.Equal("Du", tokens[4])
211 assert.Equal("?", tokens[5])
Akron3610f102021-08-08 14:13:25 +0200212 assert.Equal("", tokens[6])
Akron1594cb82021-08-11 11:14:56 +0200213 assert.Equal("", tokens[7])
214 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200215
216 w.Reset()
217 assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
218 assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
Akron3f8571a2021-08-05 11:18:10 +0200219}
Akronb7e1f132021-08-10 11:52:31 +0200220
221func TestFullTokenizerSentenceSplitter(t *testing.T) {
222 assert := assert.New(t)
223 dat := LoadDatokFile("testdata/tokenizer.datok")
224 assert.NotNil(dat)
225
226 b := make([]byte, 0, 2048)
227 w := bytes.NewBuffer(b)
228 var sentences []string
229
230 // testSentSplitterSimple
Akron1594cb82021-08-11 11:14:56 +0200231 assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
232 sentences = strings.Split(w.String(), "\n\n")
233
234 assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
235 assert.Equal("Der\nalte\nMann\n.", sentences[0])
236 assert.Equal("", sentences[1])
237 assert.Equal(len(sentences), 2)
238
239 w.Reset()
240 assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
241 sentences = strings.Split(w.String(), "\n\n")
242 assert.Equal(len(sentences), 2)
243 assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
244 assert.Equal("", sentences[1])
245
246 w.Reset()
247 assert.True(dat.Transduce(strings.NewReader(""), w))
Akronb7e1f132021-08-10 11:52:31 +0200248 sentences = strings.Split(w.String(), "\n\n")
249 assert.Equal(len(sentences), 1)
Akron6e70dc82021-08-11 11:33:18 +0200250 assert.Equal("\n", sentences[0])
Akron1594cb82021-08-11 11:14:56 +0200251
252 w.Reset()
253 assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
254 sentences = strings.Split(w.String(), "\n\n")
255 assert.Equal(len(sentences), 2)
256
257 w.Reset()
258 assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
259 sentences = strings.Split(w.String(), "\n\n")
260 assert.Equal(len(sentences), 2)
261
Akron6e70dc82021-08-11 11:33:18 +0200262 w.Reset()
263 assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
264 sentences = strings.Split(w.String(), "\n\n")
265 assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
266 assert.Equal("", sentences[1])
267 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200268
Akron6e70dc82021-08-11 11:33:18 +0200269 w.Reset()
270 assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
271 sentences = strings.Split(w.String(), "\n\n")
272 assert.Equal("", sentences[1])
273 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200274
Akron6e70dc82021-08-11 11:33:18 +0200275 w.Reset()
276 assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
277 sentences = strings.Split(w.String(), "\n\n")
278 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200279
Akron6e70dc82021-08-11 11:33:18 +0200280 w.Reset()
281 assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
282 sentences = strings.Split(w.String(), "\n\n")
283 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200284
Akron6e70dc82021-08-11 11:33:18 +0200285 w.Reset()
286 assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
287 sentences = strings.Split(w.String(), "\n\n")
288 assert.Equal(len(sentences), 2)
289 assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
290 assert.Equal("", sentences[1])
Akron1594cb82021-08-11 11:14:56 +0200291
Akron6e70dc82021-08-11 11:33:18 +0200292 w.Reset()
293 assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
294 sentences = strings.Split(w.String(), "\n\n")
295 assert.Equal(len(sentences), 3)
296 assert.Equal("Ausschalten\n!!!", sentences[0])
297 assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
298 assert.Equal("", sentences[2])
Akron1594cb82021-08-11 11:14:56 +0200299
Akron4af79f12021-08-11 14:48:17 +0200300 w.Reset()
301 assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
302 sentences = strings.Split(w.String(), "\n\n")
303 assert.Equal(len(sentences), 2)
Akron1594cb82021-08-11 11:14:56 +0200304
305 /*
306 Test:
307 "\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
308 */
Akronb7e1f132021-08-10 11:52:31 +0200309}
Akron03ca4252021-08-11 13:32:53 +0200310
Akron03ca4252021-08-11 13:32:53 +0200311func TestFullTokenizerTokenSplitter(t *testing.T) {
312 assert := assert.New(t)
313 dat := LoadDatokFile("testdata/tokenizer.datok")
314 assert.NotNil(dat)
315
316 b := make([]byte, 0, 2048)
317 w := bytes.NewBuffer(b)
318 var tokens []string
319
320 // testTokenizerSimple
Akronec835ad2021-08-11 18:23:22 +0200321 tokens = ttokenize(dat, w, "Der alte Mann")
Akron03ca4252021-08-11 13:32:53 +0200322 assert.Equal(tokens[0], "Der")
323 assert.Equal(tokens[1], "alte")
324 assert.Equal(tokens[2], "Mann")
325 assert.Equal(len(tokens), 3)
326
Akronec835ad2021-08-11 18:23:22 +0200327 tokens = ttokenize(dat, w, "Der alte Mann.")
Akron03ca4252021-08-11 13:32:53 +0200328 assert.Equal(tokens[0], "Der")
329 assert.Equal(tokens[1], "alte")
330 assert.Equal(tokens[2], "Mann")
331 assert.Equal(tokens[3], ".")
332 assert.Equal(len(tokens), 4)
333
334 // testTokenizerAbbr
Akronec835ad2021-08-11 18:23:22 +0200335 tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
Akron03ca4252021-08-11 13:32:53 +0200336 assert.Equal(tokens[0], "Der")
337 assert.Equal(tokens[1], "Vorsitzende")
338 assert.Equal(tokens[2], "der")
339 assert.Equal(tokens[3], "F.D.P.")
340 assert.Equal(tokens[4], "hat")
341 assert.Equal(tokens[5], "gewählt")
342 assert.Equal(len(tokens), 6)
343 // Ignored in KorAP-Tokenizer
344
345 // testTokenizerHost1
Akronec835ad2021-08-11 18:23:22 +0200346 tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200347 assert.Equal(tokens[0], "Gefunden")
348 assert.Equal(tokens[1], "auf")
349 assert.Equal(tokens[2], "wikipedia.org")
350 assert.Equal(len(tokens), 3)
351
352 // testTokenizerWwwHost
Akronec835ad2021-08-11 18:23:22 +0200353 tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
Akron03ca4252021-08-11 13:32:53 +0200354 assert.Equal("Gefunden", tokens[0])
355 assert.Equal("auf", tokens[1])
356 assert.Equal("www.wikipedia.org", tokens[2])
357 assert.Equal(3, len(tokens))
358
359 // testTokenizerWwwUrl
Akronec835ad2021-08-11 18:23:22 +0200360 tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
Akron03ca4252021-08-11 13:32:53 +0200361 assert.Equal("www.info.biz/info", tokens[3])
362
363 // testTokenizerFtpHost
364 /*
365 tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
366 assert.Equal("Kann", tokens[0])
367 assert.Equal("von", tokens[1])
368 assert.Equal("ftp.download.org", tokens[2])
369 assert.Equal(5, len(tokens))
370 // Ignored in KorAP-Tokenizer
371 */
372
373 // testTokenizerDash
Akronec835ad2021-08-11 18:23:22 +0200374 tokens = ttokenize(dat, w, "Das war -- spitze")
Akron03ca4252021-08-11 13:32:53 +0200375 assert.Equal(tokens[0], "Das")
376 assert.Equal(tokens[1], "war")
377 assert.Equal(tokens[2], "--")
378 assert.Equal(tokens[3], "spitze")
379 assert.Equal(len(tokens), 4)
380
381 // testTokenizerEmail1
Akronec835ad2021-08-11 18:23:22 +0200382 tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
Akron03ca4252021-08-11 13:32:53 +0200383 assert.Equal(tokens[0], "Ich")
384 assert.Equal(tokens[1], "bin")
385 assert.Equal(tokens[2], "unter")
386 assert.Equal(tokens[3], "korap@ids-mannheim.de")
387 assert.Equal(tokens[4], "erreichbar")
388 assert.Equal(tokens[5], ".")
389 assert.Equal(len(tokens), 6)
390
391 // testTokenizerEmail2
Akronec835ad2021-08-11 18:23:22 +0200392 tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
Akron03ca4252021-08-11 13:32:53 +0200393 assert.Equal(tokens[0], "Oder")
394 assert.Equal(tokens[1], "unter")
395 assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
396 assert.Equal(tokens[3], ".")
397 assert.Equal(len(tokens), 4)
398
399 // testTokenizerEmail3
Akronec835ad2021-08-11 18:23:22 +0200400 tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
Akron03ca4252021-08-11 13:32:53 +0200401 assert.Equal(tokens[0], "Oder")
402 assert.Equal(tokens[1], "unter")
403 assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
404 assert.Equal(tokens[3], ".")
405 assert.Equal(len(tokens), 4)
406 // Ignored in KorAP-Tokenizer
407
408 // testTokenizerDoNotAcceptQuotedEmailNames
Akronec835ad2021-08-11 18:23:22 +0200409 tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
Akron03ca4252021-08-11 13:32:53 +0200410 assert.Equal("\"", tokens[0])
411 assert.Equal("John", tokens[1])
412 assert.Equal("Doe", tokens[2])
413 assert.Equal("\"", tokens[3])
414 assert.Equal("@xx", tokens[4])
415 assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
416 assert.Equal("com", tokens[6])
417 assert.Equal(7, len(tokens))
418
419 // testTokenizerTwitter
Akronec835ad2021-08-11 18:23:22 +0200420 tokens = ttokenize(dat, w, "Folgt @korap und #korap")
Akron03ca4252021-08-11 13:32:53 +0200421 assert.Equal(tokens[0], "Folgt")
422 assert.Equal(tokens[1], "@korap")
423 assert.Equal(tokens[2], "und")
424 assert.Equal(tokens[3], "#korap")
425 assert.Equal(len(tokens), 4)
426
427 // testTokenizerWeb1
Akronec835ad2021-08-11 18:23:22 +0200428 tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
Akron03ca4252021-08-11 13:32:53 +0200429 assert.Equal(tokens[0], "Unsere")
430 assert.Equal(tokens[1], "Website")
431 assert.Equal(tokens[2], "ist")
432 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
433 assert.Equal(len(tokens), 4)
434
435 // testTokenizerWeb2
Akronec835ad2021-08-11 18:23:22 +0200436 tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
Akron03ca4252021-08-11 13:32:53 +0200437 assert.Equal(tokens[0], "Wir")
438 assert.Equal(tokens[1], "sind")
439 assert.Equal(tokens[2], "auch")
440 assert.Equal(tokens[3], "im")
441 assert.Equal(tokens[4], "Internet")
442 assert.Equal(tokens[5], "(")
443 assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
444 assert.Equal(tokens[7], ")")
445 assert.Equal(len(tokens), 8)
446 // Ignored in KorAP-Tokenizer
447
448 // testTokenizerWeb3
Akronec835ad2021-08-11 18:23:22 +0200449 tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
Akron03ca4252021-08-11 13:32:53 +0200450 assert.Equal(tokens[0], "Die")
451 assert.Equal(tokens[1], "Adresse")
452 assert.Equal(tokens[2], "ist")
453 assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
454 assert.Equal(tokens[4], ".")
455 assert.Equal(len(tokens), 5)
456 // Ignored in KorAP-Tokenizer
457
458 // testTokenizerServer
Akronec835ad2021-08-11 18:23:22 +0200459 tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
Akron03ca4252021-08-11 13:32:53 +0200460 assert.Equal(tokens[0], "Unser")
461 assert.Equal(tokens[1], "Server")
462 assert.Equal(tokens[2], "ist")
463 assert.Equal(tokens[3], "10.0.10.51")
464 assert.Equal(tokens[4], ".")
465 assert.Equal(len(tokens), 5)
466
467 // testTokenizerNum
Akronec835ad2021-08-11 18:23:22 +0200468 tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
Akron03ca4252021-08-11 13:32:53 +0200469 assert.Equal(tokens[0], "Zu")
470 assert.Equal(tokens[1], "50,4%")
471 assert.Equal(tokens[2], "ist")
472 assert.Equal(tokens[3], "es")
473 assert.Equal(tokens[4], "sicher")
474 assert.Equal(len(tokens), 5)
475 // Differs from KorAP-Tokenizer
476
477 // testTokenizerDate
Akronec835ad2021-08-11 18:23:22 +0200478 tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
Akron03ca4252021-08-11 13:32:53 +0200479 assert.Equal(tokens[0], "Der")
480 assert.Equal(tokens[1], "Termin")
481 assert.Equal(tokens[2], "ist")
482 assert.Equal(tokens[3], "am")
483 assert.Equal(tokens[4], "5.9.2018")
484 assert.Equal(len(tokens), 5)
485
Akronec835ad2021-08-11 18:23:22 +0200486 tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
Akron03ca4252021-08-11 13:32:53 +0200487 assert.Equal(tokens[0], "Der")
488 assert.Equal(tokens[1], "Termin")
489 assert.Equal(tokens[2], "ist")
490 assert.Equal(tokens[3], "am")
491 assert.Equal(tokens[4], "5/9/2018")
492 assert.Equal(len(tokens), 5)
493
494 // testTokenizerDateRange
495 /*
496 tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
497 assert.Equal(tokens[0], "Der")
498 assert.Equal(tokens[1], "Termin")
499 assert.Equal(tokens[2], "war")
500 assert.Equal(tokens[3], "vom")
501 assert.Equal(tokens[4], "4.")
502 assert.Equal(tokens[5], "-")
503 assert.Equal(tokens[6], "5.9.2018")
504 assert.Equal(len(tokens), 7)
505 // Ignored in KorAP-Tokenizer
506 */
507
508 // testTokenizerEmoji1
Akronec835ad2021-08-11 18:23:22 +0200509 tokens = ttokenize(dat, w, "Das ist toll! ;)")
Akron03ca4252021-08-11 13:32:53 +0200510 assert.Equal(tokens[0], "Das")
511 assert.Equal(tokens[1], "ist")
512 assert.Equal(tokens[2], "toll")
513 assert.Equal(tokens[3], "!")
514 assert.Equal(tokens[4], ";)")
515 assert.Equal(len(tokens), 5)
516
517 // testTokenizerRef1
Akronec835ad2021-08-11 18:23:22 +0200518 tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200519 assert.Equal(tokens[0], "Kupietz")
520 assert.Equal(tokens[1], "und")
521 assert.Equal(tokens[2], "Schmidt")
522 assert.Equal(tokens[3], "(2018)")
523 assert.Equal(tokens[4], ":")
524 assert.Equal(tokens[5], "Korpuslinguistik")
525 assert.Equal(len(tokens), 6)
526 // Differs from KorAP-Tokenizer!
527
528 // testTokenizerRef2 () {
Akronec835ad2021-08-11 18:23:22 +0200529 tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
Akron03ca4252021-08-11 13:32:53 +0200530 assert.Equal(tokens[0], "Kupietz")
531 assert.Equal(tokens[1], "und")
532 assert.Equal(tokens[2], "Schmidt")
533 assert.Equal(tokens[3], "[2018]")
534 assert.Equal(tokens[4], ":")
535 assert.Equal(tokens[5], "Korpuslinguistik")
536 assert.Equal(len(tokens), 6)
537 // Differs from KorAP-Tokenizer!
538
539 // testTokenizerOmission1 () {
Akronec835ad2021-08-11 18:23:22 +0200540 tokens = ttokenize(dat, w, "Er ist ein A****loch!")
Akron03ca4252021-08-11 13:32:53 +0200541 assert.Equal(tokens[0], "Er")
542 assert.Equal(tokens[1], "ist")
543 assert.Equal(tokens[2], "ein")
544 assert.Equal(tokens[3], "A****loch")
545 assert.Equal(tokens[4], "!")
546 assert.Equal(len(tokens), 5)
547
548 // testTokenizerOmission2
Akronec835ad2021-08-11 18:23:22 +0200549 tokens = ttokenize(dat, w, "F*ck!")
Akron03ca4252021-08-11 13:32:53 +0200550 assert.Equal(tokens[0], "F*ck")
551 assert.Equal(tokens[1], "!")
552 assert.Equal(len(tokens), 2)
553
554 // testTokenizerOmission3 () {
Akronec835ad2021-08-11 18:23:22 +0200555 tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
Akron03ca4252021-08-11 13:32:53 +0200556 assert.Equal(tokens[0], "Dieses")
557 assert.Equal(tokens[1], "verf*****")
558 assert.Equal(tokens[2], "Kleid")
559 assert.Equal(tokens[3], "!")
560 assert.Equal(len(tokens), 4)
561
562 // Probably interpreted as HOST
563 // testTokenizerFileExtension1
Akronec835ad2021-08-11 18:23:22 +0200564 tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
Akron03ca4252021-08-11 13:32:53 +0200565 assert.Equal(tokens[0], "Ich")
566 assert.Equal(tokens[1], "habe")
567 assert.Equal(tokens[2], "die")
568 assert.Equal(tokens[3], "readme.txt")
569 assert.Equal(tokens[4], "heruntergeladen")
570 assert.Equal(len(tokens), 5)
571
572 // Probably interpreted as HOST
573 // testTokenizerFileExtension2
Akronec835ad2021-08-11 18:23:22 +0200574 tokens = ttokenize(dat, w, "Nimm die README.TXT!")
Akron03ca4252021-08-11 13:32:53 +0200575 assert.Equal(tokens[0], "Nimm")
576 assert.Equal(tokens[1], "die")
577 assert.Equal(tokens[2], "README.TXT")
578 assert.Equal(tokens[3], "!")
579 assert.Equal(len(tokens), 4)
580
581 // Probably interpreted as HOST
582 // testTokenizerFileExtension3
Akronec835ad2021-08-11 18:23:22 +0200583 tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
Akron03ca4252021-08-11 13:32:53 +0200584 assert.Equal(tokens[0], "Zeig")
585 assert.Equal(tokens[1], "mir")
586 assert.Equal(tokens[2], "profile.jpeg")
587 assert.Equal(len(tokens), 3)
588
589 // testTokenizerFile1
Akron03ca4252021-08-11 13:32:53 +0200590
Akronec835ad2021-08-11 18:23:22 +0200591 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200592 assert.Equal(tokens[0], "Zeig")
593 assert.Equal(tokens[1], "mir")
594 assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
595 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200596
Akrone8837b52021-08-11 17:29:58 +0200597 // testTokenizerFile2
Akronec835ad2021-08-11 18:23:22 +0200598 tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
Akrone8837b52021-08-11 17:29:58 +0200599 assert.Equal(tokens[0], "Gehe")
600 assert.Equal(tokens[1], "zu")
601 assert.Equal(tokens[2], "/Dokumente/profile.docx")
602 assert.Equal(len(tokens), 3)
Akron03ca4252021-08-11 13:32:53 +0200603
Akrone8837b52021-08-11 17:29:58 +0200604 // testTokenizerFile3
Akronec835ad2021-08-11 18:23:22 +0200605 tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
Akrone8837b52021-08-11 17:29:58 +0200606 assert.Equal(tokens[0], "Zeig")
607 assert.Equal(tokens[1], "mir")
608 assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
609 assert.Equal(len(tokens), 3)
610 // Ignored in KorAP-Tokenizer
Akron03ca4252021-08-11 13:32:53 +0200611
Akronfd92d7e2021-08-11 16:31:43 +0200612 // testTokenizerPunct
Akronec835ad2021-08-11 18:23:22 +0200613 tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
Akronfd92d7e2021-08-11 16:31:43 +0200614 assert.Equal(tokens[0], "Er")
615 assert.Equal(tokens[1], "sagte")
616 assert.Equal(tokens[2], ":")
617 assert.Equal(tokens[3], "\"")
618 assert.Equal(tokens[4], "Es")
619 assert.Equal(tokens[5], "geht")
620 assert.Equal(tokens[6], "mir")
621 assert.Equal(tokens[7], "gut")
622 assert.Equal(tokens[8], "!")
623 assert.Equal(tokens[9], "\"")
624 assert.Equal(tokens[10], ",")
625 assert.Equal(tokens[11], "daraufhin")
626 assert.Equal(tokens[12], "ging")
627 assert.Equal(tokens[13], "er")
628 assert.Equal(tokens[14], ".")
629 assert.Equal(len(tokens), 15)
Akron03ca4252021-08-11 13:32:53 +0200630
631 // testTokenizerPlusAmpersand
Akronec835ad2021-08-11 18:23:22 +0200632 tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
633 assert.Equal(tokens[0], "&quot;")
634 assert.Equal(tokens[1], "Das")
635 assert.Equal(tokens[2], "ist")
636 assert.Equal(tokens[3], "von")
637 assert.Equal(tokens[4], "C&A")
638 assert.Equal(tokens[5], "!")
639 assert.Equal(tokens[6], "&quot;")
640 assert.Equal(len(tokens), 7)
Akron03ca4252021-08-11 13:32:53 +0200641
642 // testTokenizerLongEnd
Akronec835ad2021-08-11 18:23:22 +0200643 tokens = ttokenize(dat, w, "Siehst Du?!!?")
Akron03ca4252021-08-11 13:32:53 +0200644 assert.Equal(tokens[0], "Siehst")
645 assert.Equal(tokens[1], "Du")
646 assert.Equal(tokens[2], "?!!?")
647 assert.Equal(len(tokens), 3)
648
649 // testTokenizerIrishO
Akronec835ad2021-08-11 18:23:22 +0200650 tokens = ttokenize(dat, w, "Peter O'Toole")
Akron03ca4252021-08-11 13:32:53 +0200651 assert.Equal(tokens[0], "Peter")
652 assert.Equal(tokens[1], "O'Toole")
653 assert.Equal(len(tokens), 2)
654
655 // testTokenizerAbr
Akronec835ad2021-08-11 18:23:22 +0200656 tokens = ttokenize(dat, w, "Früher bzw. später ...")
Akron03ca4252021-08-11 13:32:53 +0200657 assert.Equal(tokens[0], "Früher")
658 assert.Equal(tokens[1], "bzw.")
659 assert.Equal(tokens[2], "später")
660 assert.Equal(tokens[3], "...")
661 assert.Equal(len(tokens), 4)
662
663 // testTokenizerUppercaseRule
Akronec835ad2021-08-11 18:23:22 +0200664 tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
Akron03ca4252021-08-11 13:32:53 +0200665 assert.Equal(tokens[0], "Es")
666 assert.Equal(tokens[1], "war")
667 assert.Equal(tokens[2], "spät")
668 assert.Equal(tokens[3], ".")
669 assert.Equal(tokens[4], "Morgen")
670 assert.Equal(tokens[5], "ist")
671 assert.Equal(tokens[6], "es")
672 assert.Equal(tokens[7], "früh")
673 assert.Equal(tokens[8], ".")
674 assert.Equal(len(tokens), 9)
675 // Ignored in KorAP-Tokenizer
676
677 // testTokenizerOrd
Akronec835ad2021-08-11 18:23:22 +0200678 tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
Akrona0bded52021-08-11 15:48:02 +0200679 assert.Equal(tokens[0], "Sie")
680 assert.Equal(tokens[1], "erreichte")
681 assert.Equal(tokens[2], "den")
682 assert.Equal(tokens[3], "1.")
683 assert.Equal(tokens[4], "Platz")
684 assert.Equal(tokens[5], "!")
685 assert.Equal(len(tokens), 6)
Akron03ca4252021-08-11 13:32:53 +0200686
687 // testNoZipOuputArchive
Akronec835ad2021-08-11 18:23:22 +0200688 tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
Akron03ca4252021-08-11 13:32:53 +0200689 assert.Equal(tokens[0], "Archive")
690 assert.Equal(tokens[1], ":")
691 assert.Equal(tokens[2], "Ich")
692 assert.Equal(tokens[3], "bin")
693 assert.Equal(tokens[4], "kein")
694 assert.Equal(tokens[5], "zip")
695 assert.Equal(6, len(tokens))
696
697 // testTokenizerStrasse
Akronec835ad2021-08-11 18:23:22 +0200698 tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
Akron4af79f12021-08-11 14:48:17 +0200699 assert.Equal(tokens[4], "Weststr.")
700 assert.Equal(8, len(tokens))
Akron03ca4252021-08-11 13:32:53 +0200701
702 // germanTokenizerKnowsGermanOmissionWords
Akronec835ad2021-08-11 18:23:22 +0200703 tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
Akron03ca4252021-08-11 13:32:53 +0200704 assert.Equal("D'dorf", tokens[0])
705 assert.Equal("Ku'damm", tokens[1])
706 assert.Equal("Lu'hafen", tokens[2])
707 assert.Equal("M'gladbach", tokens[3])
708 assert.Equal("W'schaft", tokens[4])
709 assert.Equal(5, len(tokens))
710
711 // germanTokenizerDoesNOTSeparateGermanContractions
Akronec835ad2021-08-11 18:23:22 +0200712 tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
Akron03ca4252021-08-11 13:32:53 +0200713 assert.Equal("mach's", tokens[0])
714 assert.Equal("macht's", tokens[1])
715 assert.Equal("was'n", tokens[2])
716 assert.Equal("ist's", tokens[3])
717 assert.Equal("haste", tokens[4])
718 assert.Equal("willste", tokens[5])
719 assert.Equal("kannste", tokens[6])
720 assert.Equal("biste", tokens[7])
721 assert.Equal("kriegste", tokens[8])
722 assert.Equal(9, len(tokens))
723
724 /*
725 @Test
726 public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
727 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
728 tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
729 assert.Equal("'ve", tokens[1]);
730 assert.Equal("'ll", tokens[3]);
731 assert.Equal("'d", tokens[5]);
732 assert.Equal("'m", tokens[7]);
733 assert.Equal("'re", tokens[9]);
734 assert.Equal("'s", tokens[11]);
735 assert.Equal("is", tokens[12]);
736 assert.Equal("n't", tokens[13]);
737 assert.Equal(14, len(tokens));
738 }
739
740 @Test
741 public void frenchTokenizerKnowsFrenchAbbreviations () {
742 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
743 tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
744 assert.Equal("Approx.", tokens[0]);
745 assert.Equal("juill.", tokens[2]);
746 assert.Equal("prof.", tokens[5]);
747 assert.Equal("exerc.", tokens[15]);
748 assert.Equal("no.", tokens[16]);
749 assert.Equal("pp.", tokens[21]);
750 }
751
752 @Test
753 public void frenchTokenizerKnowsFrenchContractions () {
754 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
755 tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
756 assert.Equal("J'", tokens[0]);
757 assert.Equal("j'", tokens[2]);
758 assert.Equal("qu'", tokens[4]);
759 assert.Equal("d'", tokens[6]);
760 assert.Equal("jusqu'", tokens[8]);
761 assert.Equal("Aujourd'hui", tokens[10]);
762 assert.Equal("D'", tokens[11]); // ’
763 assert.Equal("Quelqu'un", tokens[13]); // ’
764 assert.Equal("Presqu'île", tokens[14]); // ’
765 }
766
767 @Test
768 public void frenchTokenizerKnowsFrenchClitics () {
769 DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
770 tokens = tokenize(dat, w, "suis-je sont-elles ")
771 assert.Equal("suis", tokens[0]);
772 assert.Equal("-je", tokens[1]);
773 assert.Equal("sont", tokens[2]);
774 assert.Equal("-elles", tokens[3]);
775 }
776
777 @Test
778 public void testEnglishTokenizerScienceAbbreviations () {
779 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
780 tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
781 assert.Equal("Approx.", tokens[0]);
782 assert.Equal("in", tokens[1]);
783 assert.Equal("Sept.", tokens[2]);
784 assert.Equal("1954", tokens[3]);
785 assert.Equal(",", tokens[4]);
786 assert.Equal("Assoc.", tokens[5]);
787 assert.Equal("Prof.", tokens[6]);
788 assert.Equal("Dr.", tokens[7]);
789 assert.Equal("R.", tokens[8]);
790 assert.Equal("J.", tokens[9]);
791 assert.Equal("Ewing", tokens[10]);
792 assert.Equal("reviewed", tokens[11]);
793 assert.Equal("articles", tokens[12]);
794 assert.Equal("on", tokens[13]);
795 assert.Equal("Enzymol.", tokens[14]);
796 assert.Equal("Bacteriol.", tokens[15]);
797 assert.Equal("effects", tokens[16]);
798 assert.Equal("later", tokens[17]);
799 assert.Equal("published", tokens[18]);
800 assert.Equal("in", tokens[19]);
801 assert.Equal("Nutr.", tokens[20]);
802 assert.Equal("Rheumatol.", tokens[21]);
803 assert.Equal("No.", tokens[22]);
804 assert.Equal("12", tokens[23]);
805 assert.Equal("and", tokens[24]);
806 assert.Equal("Nº.", tokens[25]);
807 assert.Equal("13.", tokens[26]);
808 assert.Equal(",", tokens[27]);
809 assert.Equal("pp.", tokens[28]);
810 assert.Equal("17-18", tokens[29]);
811 assert.Equal(".", tokens[30]);
812 }
813
814 @Test
815 public void englishTokenizerCanGuessWhetherIIsAbbrev () {
816 DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
817 tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
818 assert.Equal("I.", tokens[1]);
819 assert.Equal("I", tokens[8]);
820 assert.Equal(".", tokens[9]);
821 assert.Equal("I", tokens[12]);
822 assert.Equal(".", tokens[13]);
823 }
824
825 @Test
826 public void testZipOuputArchive () {
827
828 final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
829 System.setOut(new PrintStream(clearOut));
830 tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
831 assert.Equal(0, len(tokens));
832 }
833 */
834 /*
835
836 @Test
837 public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
838 DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
839 .tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
840 .printOffsets(true)
841 .build();
842 Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
843 assert.Equal("Text1", tokens[0].getType());
844 assert.Equal(len(tokens), 9 );
845 }
846 */
847}
Akronbd406802021-08-11 18:39:13 +0200848
Akron29e306f2021-09-02 18:29:56 +0200849func TestLoadFactor1(t *testing.T) {
850 assert := assert.New(t)
851 tok := LoadFomaFile("testdata/abbr_bench.fst")
852 dat := tok.ToDoubleArray()
853 assert.True(dat.LoadFactor() > 88)
854}
855
Akron4c2a1ad2021-08-31 00:35:53 +0200856func TestFullTokenizerXML(t *testing.T) {
857 assert := assert.New(t)
858
859 dat := LoadDatokFile("testdata/tokenizer.datok")
860 assert.NotNil(dat)
861
862 b := make([]byte, 0, 2048)
863 w := bytes.NewBuffer(b)
864 var tokens []string
865
866 tokens = ttokenize(dat, w, "Das <b>beste</b> Fußballspiel")
867 assert.Equal("Das", tokens[0])
868 assert.Equal("<b>", tokens[1])
869 assert.Equal("beste", tokens[2])
870 assert.Equal("</b>", tokens[3])
871 assert.Equal("Fußballspiel", tokens[4])
872 assert.Equal(5, len(tokens))
873
874 tokens = ttokenize(dat, w, "Das <b class=\"c\">beste</b> Fußballspiel")
875 assert.Equal("Das", tokens[0])
876 assert.Equal("<b class=\"c\">", tokens[1])
877 assert.Equal("beste", tokens[2])
878 assert.Equal("</b>", tokens[3])
879 assert.Equal("Fußballspiel", tokens[4])
880 assert.Equal(5, len(tokens))
881
882 tokens = ttokenize(dat, w, "der<x y=\"alte \"> <x x> alte</x> etc. et. Mann.")
883 assert.Equal("der", tokens[0])
884 assert.Equal("<x y=\"alte \">", tokens[1])
885 assert.Equal("<x x>", tokens[2])
886 assert.Equal("alte", tokens[3])
887 assert.Equal("</x>", tokens[4])
888 assert.Equal("etc.", tokens[5])
889 assert.Equal("et", tokens[6])
890 assert.Equal(".", tokens[7])
891 assert.Equal("Mann", tokens[8])
892 assert.Equal(".", tokens[9])
893 assert.Equal(10, len(tokens))
894}
895
Akronbd406802021-08-11 18:39:13 +0200896func BenchmarkTransduce(b *testing.B) {
897 bu := make([]byte, 0, 2048)
898 w := bytes.NewBuffer(bu)
899
900 s := `Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. Ich bin unter korap@ids-mannheim.de erreichbar.
901 Unsere Website ist https://korap.ids-mannheim.de/?q=Baum. Unser Server ist 10.0.10.51. Zu 50.4% ist es sicher.
902 Der Termin ist am 5.9.2018.
903 Ich habe die readme.txt heruntergeladen.
904 Ausschalten!!! Hast Du nicht gehört???
905 Ich wohne in der Weststr. und Du? Kupietz und Schmidt [2018]: Korpuslinguistik. Dieses verf***** Kleid! Ich habe die readme.txt heruntergeladen.
906 Er sagte: \"Es geht mir gut!\", daraufhin ging er. &quot;Das ist von C&A!&quot; Früher bzw. später ... Sie erreichte den 1. Platz!
907 Archive: Ich bin kein zip. D'dorf Ku'damm Lu'hafen M'gladbach W'schaft.
908 Mach's macht's was'n ist's haste willste kannste biste kriegste.`
909 r := strings.NewReader(s)
910
911 dat := LoadDatokFile("testdata/tokenizer.datok")
912
Akrondf37a552021-09-02 12:16:08 +0200913 b.ResetTimer()
914
Akronbd406802021-08-11 18:39:13 +0200915 for i := 0; i < b.N; i++ {
916 w.Reset()
917 r.Reset(s)
918 ok := dat.Transduce(r, w)
919 if !ok {
920 fmt.Println("Fail!")
921 fmt.Println(w.String())
922 os.Exit(1)
923 }
924 }
Akronbd406802021-08-11 18:39:13 +0200925}
Akronbb4aac52021-08-13 00:52:27 +0200926
Akron6f1c16c2021-08-17 10:45:42 +0200927// This test is deprecated as the datok file changes over time
928func XBenchmarkLoadDatokFile(b *testing.B) {
Akronbb4aac52021-08-13 00:52:27 +0200929 for i := 0; i < b.N; i++ {
930 dat := LoadDatokFile("testdata/tokenizer.datok")
931 if dat == nil {
932 fmt.Println("Fail!")
933 os.Exit(1)
934 }
935 }
936}
937
Akron6f1c16c2021-08-17 10:45:42 +0200938func BenchmarkToDoubleArray(b *testing.B) {
939 tok := LoadFomaFile("testdata/simple_bench.fst")
Akrondf37a552021-09-02 12:16:08 +0200940 b.ResetTimer()
Akron6f1c16c2021-08-17 10:45:42 +0200941 for i := 0; i < b.N; i++ {
942 dat := tok.ToDoubleArray()
943 if dat == nil {
944 fmt.Println("Fail!")
945 os.Exit(1)
946 }
947 }
948}
949
Akron7b1faa62021-09-02 16:10:21 +0200950func BenchmarkToDoubleArrayLarger(b *testing.B) {
951 tok := LoadFomaFile("testdata/abbr_bench.fst")
952 b.ResetTimer()
953 for i := 0; i < b.N; i++ {
954 dat := tok.ToDoubleArray()
955 if dat == nil {
956 fmt.Println("Fail!")
957 os.Exit(1)
958 }
959 }
960}
961
Akronbb4aac52021-08-13 00:52:27 +0200962// 2021-08-11 (go 1.16)
963// go test -bench=. -test.benchmem
964// BenchmarkTransduce-4 19069 60609 ns/op 11048 B/op 137 allocs/op
Akronf1a16502021-08-16 15:24:38 +0200965// 2021-08-12 (go 1.16)
Akronbb4aac52021-08-13 00:52:27 +0200966// BenchmarkTransduce-4 20833 55241 ns/op 9676 B/op 3 allocs/op
967// BenchmarkLoadDatokFile-4 4 258418169 ns/op 29916470 B/op 5697 allocs/op
968// BenchmarkTransduce-4 19430 58133 ns/op 18696 B/op 3 allocs/op
969// BenchmarkLoadDatokFile-4 8 139071939 ns/op 203158377 B/op 5742 allocs/op
Akronf1a16502021-08-16 15:24:38 +0200970// 2021-08-16
971// BenchmarkTransduce-4 22251 49989 ns/op 17370 B/op 3 allocs/op
972// BenchmarkLoadDatokFile-4 8 138937532 ns/op 203158327 B/op 5742 allocs/op
973// BenchmarkTransduce-4 22005 48665 ns/op 17472 B/op 3 allocs/op
974// BenchmarkLoadDatokFile-4 7 143143934 ns/op 203158450 B/op 5743 allocs/op
Akronea46e8a2021-08-17 00:36:31 +0200975// BenchmarkTransduce-4 34939 34363 ns/op 14056 B/op 3 allocs/op
976// BenchmarkLoadDatokFile-4 7 149511609 ns/op 203217193 B/op 5915 allocs/op
Akron6f1c16c2021-08-17 10:45:42 +0200977// 2021-08-17
978// BenchmarkTransduce-4 31204 32678 ns/op 14752 B/op 3 allocs/op
979// BenchmarkToDoubleArray-4 44138 26850 ns/op 10704 B/op 29 allocs/op
Akronde18e902021-08-27 09:34:12 +0200980// BenchmarkTransduce-4 29376 34562 ns/op 15157 B/op 3 allocs/op
981// BenchmarkToDoubleArray-4 54441 21355 ns/op 10704 B/op 29 allocs/op
Akrondf37a552021-09-02 12:16:08 +0200982// 2021-09-02 - New tokenizer - fixed loading
Akron7b1faa62021-09-02 16:10:21 +0200983// BenchmarkTransduce-4 40149 31515 ns/op 8240 B/op 3 allocs/op
984// BenchmarkToDoubleArray-4 51043 22586 ns/op 10702 B/op 29 allocs/op
985// BenchmarkToDoubleArrayLarger-4 3 396009639 ns/op 6352293 B/op 2575 allocs/op
986// BenchmarkTransduce-4 38698 31900 ns/op 8240 B/op 3 allocs/op
987// BenchmarkToDoubleArray-4 50644 21569 ns/op 11151 B/op 14 allocs/op
988// BenchmarkToDoubleArrayLarger-4 3 441260766 ns/op 6942336 B/op 30 allocs/op
989// BenchmarkTransduce-4 39966 30835 ns/op 8240 B/op 3 allocs/op
990// BenchmarkToDoubleArray-4 50720 24863 ns/op 11091 B/op 46 allocs/op
991// BenchmarkToDoubleArrayLarger-4 3 432523828 ns/op 6413381 B/op 5122 allocs/op
Akron679b4862021-09-02 16:59:26 +0200992// 2021-09-02 - xCheckSkip() with .9
993// BenchmarkTransduce-4 36325 38501 ns/op 8240 B/op 3 allocs/op
994// BenchmarkToDoubleArray-4 66858 19286 ns/op 10607 B/op 29 allocs/op
995// BenchmarkToDoubleArrayLarger-4 18 67428011 ns/op 6360604 B/op 2578 allocs/op
Akron29e306f2021-09-02 18:29:56 +0200996// 2021-09-02 - xCheckSkipNiu() with .9 and >= 3
997// BenchmarkTransduce-4 37105 27714 ns/op 8240 B/op 3 allocs/op
998// BenchmarkToDoubleArray-4 76600 15973 ns/op 10703 B/op 29 allocs/op
999// BenchmarkToDoubleArrayLarger-4 21 55161934 ns/op 6357889 B/op 2578 allocs/op