package datokenizer

import (
	"bytes"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)
11
12func TestSimpleString(t *testing.T) {
13 assert := assert.New(t)
14
15 // bau | bauamt
Akron64ffd9a2021-08-03 19:55:21 +020016 tok := LoadFomaFile("testdata/bauamt.fst")
Akronf2120ca2021-08-03 16:26:41 +020017 dat := tok.ToDoubleArray()
18 assert.True(dat.Match("bau"))
19 assert.True(dat.Match("bauamt"))
20 assert.False(dat.Match("baum"))
Akron8ef408b2021-08-02 22:11:04 +020021}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.True(dat.Match("wahlamt"))
	assert.True(dat.Match("bauen"))
	assert.True(dat.Match("wahlen"))
	assert.False(dat.Match("baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}
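
// A minimal benchmark sketch (hypothetical, not part of the original
// suite), reusing the testdata and the Transduce API exercised above;
// useful for spotting regressions in transduction speed.
func BenchmarkSimpleTransduce(b *testing.B) {
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	buf := make([]byte, 0, 2048)
	w := bytes.NewBuffer(buf)
	s := " In den Wald gehen? -- Da kann\t man was \"erleben\"!"
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		w.Reset()
		dat.Transduce(strings.NewReader(s), w)
	}
}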

func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(224), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(dat2.Match("bau"))
	assert.True(dat2.Match("bad"))
	assert.True(dat2.Match("wald gehen"))
}

func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 136)
	assert.Equal(len(dat.sigma), 131)
	assert.True(len(dat.array) > 3800000)
	assert.True(dat.maxSize > 3800000)

	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

// The X prefix keeps this out of regular `go test` runs, as only
// functions named Test* are executed; rename it to
// TestFullTokenizerBuild to rebuild testdata/tokenizer.datok from the
// foma source.
func XTestFullTokenizerBuild(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/tokenizer.fst")
	dat := tok.ToDoubleArray()
	n, err := dat.Save("testdata/tokenizer.datok")
	assert.Nil(err)
	assert.True(n > 500)
}

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	// The output protocol: one token per line, with sentence
	// boundaries marked by an empty line.
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

// tokenize transduces str with dat and splits the output at newline
// runs, so sentence boundaries ("\n\n") collapse into single token
// separators; the trailing empty element is dropped.
func tokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := dat.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")
	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}
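
// sentenize is a hypothetical counterpart to tokenize (a sketch, not
// used by the tests above): it splits the raw transducer output at
// empty lines, mirroring the repeated
// strings.Split(w.String(), "\n\n") calls in
// TestFullTokenizerSentenceSplitter.
func sentenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	if !dat.Transduce(strings.NewReader(str), w) {
		return []string{}
	}
	return strings.Split(w.String(), "\n\n")
}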

func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = tokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = tokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = tokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = tokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = tokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = tokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = tokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = tokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = tokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = tokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = tokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = tokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = tokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = tokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = tokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = tokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = tokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = tokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = tokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = tokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = tokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = tokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = tokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = tokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = tokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = tokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = tokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = tokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	/*
		tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
		assert.Equal(tokens[0], "Zeig")
		assert.Equal(tokens[1], "mir")
		assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
		assert.Equal(len(tokens), 3)

		// testTokenizerFile2
		tokens = tokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
		assert.Equal(tokens[0], "Gehe")
		assert.Equal(tokens[1], "zu")
		assert.Equal(tokens[2], "/Dokumente/profile.docx")
		assert.Equal(len(tokens), 3)

		// testTokenizerFile3
		tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
		assert.Equal(tokens[0], "Zeig")
		assert.Equal(tokens[1], "mir")
		assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
		assert.Equal(len(tokens), 3)
		// Ignored in KorAP-Tokenizer
	*/

	/*
		// testTokenizerPunct
		tokens = tokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
		assert.Equal(tokens[0], "Er")
		assert.Equal(tokens[1], "sagte")
		assert.Equal(tokens[2], ":")
		assert.Equal(tokens[3], "\"")
		assert.Equal(tokens[4], "Es")
		assert.Equal(tokens[5], "geht")
		assert.Equal(tokens[6], "mir")
		assert.Equal(tokens[7], "gut")
		assert.Equal(tokens[8], "!")
		assert.Equal(tokens[9], "\"")
		assert.Equal(tokens[10], ",")
		assert.Equal(tokens[11], "daraufhin")
		assert.Equal(tokens[12], "ging")
		assert.Equal(tokens[13], "er")
		assert.Equal(tokens[14], ".")
		assert.Equal(len(tokens), 15)
	*/

	// testTokenizerPlusAmpersand
	// (The quotes below appear double-decoded in the blame view;
	// restored here as &quot; entities, matching the test's focus on
	// entity handling.)
	/*
		tokens = tokenize(dat, w, "&quot;Das ist von C&A!&quot;")
		assert.Equal(tokens[0], "&quot;")
		assert.Equal(tokens[1], "Das")
		assert.Equal(tokens[2], "ist")
		assert.Equal(tokens[3], "von")
		assert.Equal(tokens[4], "C&A")
		assert.Equal(tokens[5], "!")
		assert.Equal(tokens[6], "&quot;")
		assert.Equal(len(tokens), 7)
	*/

	// testTokenizerLongEnd
	tokens = tokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = tokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = tokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = tokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	tokens = tokenize(dat, w, "Sie erreichte den 1. Platz!")
	assert.Equal(tokens[0], "Sie")
	assert.Equal(tokens[1], "erreichte")
	assert.Equal(tokens[2], "den")
	assert.Equal(tokens[3], "1.")
	assert.Equal(tokens[4], "Platz")
	assert.Equal(tokens[5], "!")
	assert.Equal(len(tokens), 6)

	// testNoZipOuputArchive
	tokens = tokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = tokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = tokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = tokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*
		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}
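
// A hypothetical end-to-end benchmark sketch (not part of the original
// suite), assuming testdata/tokenizer.datok exists as in
// TestFullTokenizer; it measures full transduction including sentence
// splitting on a short abbreviation-bearing sentence from the tests
// above.
func BenchmarkFullTokenizerTransduce(b *testing.B) {
	dat := LoadDatokFile("testdata/tokenizer.datok")
	if dat == nil {
		b.Fatal("could not load testdata/tokenizer.datok")
	}
	buf := make([]byte, 0, 2048)
	w := bytes.NewBuffer(buf)
	s := "Der Vorsitzende der Abk. hat gewählt."
	b.ResetTimer()
	for i := 0; i < b.N; i++ {
		w.Reset()
		dat.Transduce(strings.NewReader(s), w)
	}
}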