package datokenizer

import (
	"bytes"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

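// TestSimpleString checks matching against a minimal automaton
// that accepts "bau" and "bauamt" but not "baum".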
func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.False(dat.Match("baum"))
}

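// TestSimpleBranches checks matching against an automaton
// with branching paths: (bau | wahl) (amt | en).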
func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.True(dat.Match("wahlamt"))
	assert.True(dat.Match("bauen"))
	assert.True(dat.Match("wahlen"))
	assert.False(dat.Match("baum"))
}

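// TestSimpleTokenizer checks matching against a small tokenizer
// transducer, including a multi-word input.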
func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

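// TestSimpleTokenizerTransduce transduces a few input streams with the
// simple tokenizer and checks the newline-separated token output,
// including the trailing empty lines at the end of the stream.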
func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

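// TestReadWriteTokenizer serializes a double array tokenizer via WriteTo
// and checks that ParseDatok reconstructs an equivalent automaton.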
func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(224), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(dat2.Match("bau"))
	assert.True(dat2.Match("bad"))
	assert.True(dat2.Match("wald gehen"))
}

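// TestFullTokenizer loads the precompiled full tokenizer and
// sanity-checks its special symbols, alphabet size, and array size.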
func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 135)
	assert.Equal(len(dat.sigma), 130)
	assert.True(len(dat.array) > 3800000)
	assert.True(dat.maxSize > 3800000)

	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

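// XTestFullTokenizerBuild regenerates testdata/tokenizer.datok from the
// foma source. The X prefix keeps `go test` from running it by default.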
func XTestFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer.fst")
	dat := tok.ToDoubleArray()
	n, err := dat.Save("testdata/tokenizer.datok")
	assert.Nil(err)
	assert.True(n > 500)
}

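// TestFullTokenizerTransduce checks the full tokenizer's raw output
// format: one token per line, with an empty line marking a sentence
// boundary.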
func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

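// TestFullTokenizerSentenceSplitter checks that empty lines in the output
// mark sentence boundaries and that abbreviations, URLs, email addresses,
// IP addresses, dates, and file names do not trigger false breaks.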
func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

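// tokenize is a test helper that transduces str through dat and returns
// the tokens, treating any run of newlines as a single separator and
// dropping the final empty element.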
func tokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := dat.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}

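// TestFullTokenizerTokenSplitter runs token splitting cases adapted from
// the KorAP-Tokenizer test suite; comments mark where behavior differs
// from or is ignored by KorAP-Tokenizer.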
func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = tokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = tokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = tokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = tokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = tokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = tokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = tokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = tokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = tokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = tokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = tokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = tokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = tokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = tokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = tokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = tokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = tokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = tokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = tokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = tokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = tokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = tokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = tokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = tokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = tokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = tokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = tokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = tokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile2
	tokens = tokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
	assert.Equal(tokens[0], "Gehe")
	assert.Equal(tokens[1], "zu")
	assert.Equal(tokens[2], "/Dokumente/profile.docx")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile3
	tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
	assert.Equal(len(tokens), 3)
	// Ignored in KorAP-Tokenizer

	// testTokenizerPunct
	tokens = tokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "sagte")
	assert.Equal(tokens[2], ":")
	assert.Equal(tokens[3], "\"")
	assert.Equal(tokens[4], "Es")
	assert.Equal(tokens[5], "geht")
	assert.Equal(tokens[6], "mir")
	assert.Equal(tokens[7], "gut")
	assert.Equal(tokens[8], "!")
	assert.Equal(tokens[9], "\"")
	assert.Equal(tokens[10], ",")
	assert.Equal(tokens[11], "daraufhin")
	assert.Equal(tokens[12], "ging")
	assert.Equal(tokens[13], "er")
	assert.Equal(tokens[14], ".")
	assert.Equal(len(tokens), 15)

	// testTokenizerPlusAmpersand
	/*
		tokens = tokenize(dat, w, "\"Das ist von C&A!\"")
		assert.Equal(tokens[0], "\"")
		assert.Equal(tokens[1], "Das")
		assert.Equal(tokens[2], "ist")
		assert.Equal(tokens[3], "von")
		assert.Equal(tokens[4], "C&A")
		assert.Equal(tokens[5], "!")
		assert.Equal(tokens[6], "\"")
		assert.Equal(len(tokens), 7)
	*/

	// testTokenizerLongEnd
	tokens = tokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = tokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = tokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = tokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = tokenize(dat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = tokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = tokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = tokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = tokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {
			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*
		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
					.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
					.printOffsets(true)
					.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}