package datokenizer

import (
	"bytes"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

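// tmatch reports whether the double array tokenizer accepts the string s.
// The transducer output is written to a throwaway buffer.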
func tmatch(dat *DaTokenizer, s string) bool {
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	return dat.Transduce(strings.NewReader(s), w)
}

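// ttokenize transduces str with dat and returns the resulting tokens,
// splitting the output on newlines. On a failed transduction it returns
// an empty slice.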
func ttokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := dat.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}

func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bauamt"))
	assert.True(tmatch(dat, "wahlamt"))
	assert.True(tmatch(dat, "bauen"))
	assert.True(tmatch(dat, "wahlen"))
	assert.False(tmatch(dat, "baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

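// TestReadWriteTokenizer round-trips the double array through its binary
// serialization: WriteTo followed by ParseDatok must yield an equivalent
// tokenizer.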
func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(224), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(tmatch(dat2, "bau"))
	assert.True(tmatch(dat2, "bad"))
	assert.True(tmatch(dat2, "wald gehen"))
}

func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 137)
	assert.Equal(len(dat.sigma), 132)
	assert.True(len(dat.array) > 3800000)
	assert.True(dat.maxSize > 3800000)

	assert.True(tmatch(dat, "bau"))
	assert.True(tmatch(dat, "bad"))
	assert.True(tmatch(dat, "wald gehen"))
}

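// Prefixed with X so that "go test" skips it (only Test* functions are run);
// rename to TestFullTokenizerBuild to regenerate testdata/tokenizer.datok.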
func XTestFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer.fst")
	dat := tok.ToDoubleArray()
	n, err := dat.Save("testdata/tokenizer.datok")
	assert.Nil(err)
	assert.True(n > 500)
}

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

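// TestFullTokenizerSentenceSplitter checks sentence boundary detection:
// the transducer terminates each sentence with an empty line, so splitting
// the output on "\n\n" yields the sentences plus a trailing empty rest.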
func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

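// TestFullTokenizerTokenSplitter follows the test cases of the
// KorAP-Tokenizer; deviations from its behaviour are marked inline.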
func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = ttokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = ttokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = ttokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = ttokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = ttokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = ttokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = ttokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = ttokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = ttokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = ttokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = ttokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = ttokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = ttokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = ttokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = ttokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = ttokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = ttokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = ttokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = ttokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = ttokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = ttokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = ttokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = ttokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = ttokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = ttokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = ttokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = ttokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = ttokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = ttokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = ttokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = ttokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	tokens = ttokenize(dat, w, "&quot;Das ist von C&A!&quot;")
	assert.Equal(tokens[0], "&quot;")
	assert.Equal(tokens[1], "Das")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "von")
	assert.Equal(tokens[4], "C&A")
	assert.Equal(tokens[5], "!")
	assert.Equal(tokens[6], "&quot;")
	assert.Equal(len(tokens), 7)

	// testTokenizerLongEnd
	tokens = ttokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = ttokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = ttokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = ttokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = ttokenize(dat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = ttokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = ttokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = ttokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = ttokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*
		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}
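
// BenchmarkTransduce is a minimal benchmark sketch for measuring the
// throughput of the full tokenizer. It assumes testdata/tokenizer.datok
// is present, as in the tests above; the sample sentence is taken from
// the sentence splitter test.
func BenchmarkTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := "Der Vorsitzende der Abk. hat gewählt."

	dat := LoadDatokFile("testdata/tokenizer.datok")
	if dat == nil {
		b.Fatal("unable to load testdata/tokenizer.datok")
	}

	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		w.Reset()
		if !dat.Transduce(strings.NewReader(s), w) {
			b.Fatal("transduction failed")
		}
	}
}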