package datokenizer

import (
	"bytes"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.False(dat.Match("baum"))
}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.True(dat.Match("wahlamt"))
	assert.True(dat.Match("bauen"))
	assert.True(dat.Match("wahlen"))
	assert.False(dat.Match("baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(224), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(dat2.Match("bau"))
	assert.True(dat2.Match("bad"))
	assert.True(dat2.Match("wald gehen"))
}

func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
	/*
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat := tok.ToDoubleArray()
		dat.Save("testdata/tokenizer.datok")
	*/
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 136)
	assert.Equal(len(dat.sigma), 131)
	assert.Equal(len(dat.array), 3806280)
	assert.Equal(dat.maxSize, 3806279)

	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	var dat *DaTokenizer

	if false {
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat = tok.ToDoubleArray()
		// dat.Save("testdata/tokenizer.datok")
	} else {
		dat = LoadDatokFile("testdata/tokenizer.datok")
	}
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	/*
		w.Reset()
		assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
		sentences = strings.Split(w.String(), "\n\n")
		assert.Equal(len(sentences), 1)
	*/

	/*
		Test:
		"\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

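// tokenize is a small test helper: it transduces str through the
// double array tokenizer, splits the output on runs of newlines
// (so sentence boundaries collapse), and drops the trailing empty
// element. On a failed transduction it returns an empty slice.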
func tokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := dat.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}

func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	// testTokenizerSimple
	tokens = tokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = tokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = tokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = tokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = tokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = tokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = tokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = tokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = tokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = tokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = tokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = tokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = tokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = tokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = tokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = tokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = tokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = tokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = tokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = tokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = tokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = tokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = tokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = tokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = tokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = tokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = tokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = tokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	/*
		tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
		assert.Equal(tokens[0], "Zeig")
		assert.Equal(tokens[1], "mir")
		assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
		assert.Equal(len(tokens), 3)


		// testTokenizerFile2
		tokens = tokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
		assert.Equal(tokens[0], "Gehe")
		assert.Equal(tokens[1], "zu")
		assert.Equal(tokens[2], "/Dokumente/profile.docx")
		assert.Equal(len(tokens), 3)

		// testTokenizerFile3
		tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
		assert.Equal(tokens[0], "Zeig")
		assert.Equal(tokens[1], "mir")
		assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
		assert.Equal(len(tokens), 3)
		// Ignored in KorAP-Tokenizer
	*/

	/*
		// testTokenizerPunct
		tokens = tokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
		assert.Equal(tokens[0], "Er")
		assert.Equal(tokens[1], "sagte")
		assert.Equal(tokens[2], ":")
		assert.Equal(tokens[3], "\"")
		assert.Equal(tokens[4], "Es")
		assert.Equal(tokens[5], "geht")
		assert.Equal(tokens[6], "mir")
		assert.Equal(tokens[7], "gut")
		assert.Equal(tokens[8], "!")
		assert.Equal(tokens[9], "\"")
		assert.Equal(tokens[10], ",")
		assert.Equal(tokens[11], "daraufhin")
		assert.Equal(tokens[12], "ging")
		assert.Equal(tokens[13], "er")
		assert.Equal(tokens[14], ".")
		assert.Equal(len(tokens), 15)
	*/

	// testTokenizerPlusAmpersand
	/*
		tokens = tokenize(dat, w, "&quot;Das ist von C&A!&quot;")
		assert.Equal(tokens[0], "&quot;")
		assert.Equal(tokens[1], "Das")
		assert.Equal(tokens[2], "ist")
		assert.Equal(tokens[3], "von")
		assert.Equal(tokens[4], "C&A")
		assert.Equal(tokens[5], "!")
		assert.Equal(tokens[6], "&quot;")
		assert.Equal(len(tokens), 7)
	*/

	// testTokenizerLongEnd
	tokens = tokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = tokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = tokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = tokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	/*
		tokens = tokenize(dat, w, "Sie erreichte den 1. Platz!")
		assert.Equal(tokens[0], "Sie")
		assert.Equal(tokens[1], "erreichte")
		assert.Equal(tokens[2], "den")
		assert.Equal(tokens[3], "1.")
		assert.Equal(tokens[4], "Platz")
		assert.Equal(tokens[5], "!")
		assert.Equal(len(tokens), 6)
	*/

	// testNoZipOuputArchive
	tokens = tokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	/*
		tokens = tokenize(dat, w, "Ich wohne in der Weststr. und Du?")
		assert.Equal(tokens[4], "Weststr.")
		assert.Equal(8, len(tokens))
	*/

	// germanTokenizerKnowsGermanOmissionWords
	tokens = tokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = tokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {

			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*

		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9 );
		}
	*/
}
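
// BenchmarkTransduce is a minimal benchmark sketch for the transducer.
// It assumes testdata/tokenizer.datok is present and reuses LoadDatokFile
// and Transduce exactly as the tests above do; the sample text is arbitrary.
func BenchmarkTransduce(b *testing.B) {
	bu := make([]byte, 0, 2048)
	w := bytes.NewBuffer(bu)

	s := "Der Vorsitzende der Abk. hat gewählt. Gefunden auf wikipedia.org. " +
		"Ich bin unter korap@ids-mannheim.de erreichbar."

	r := strings.NewReader(s)

	dat := LoadDatokFile("testdata/tokenizer.datok")
	b.ResetTimer()

	for i := 0; i < b.N; i++ {
		// Reuse the reader and the output buffer on every iteration,
		// so only the transduction itself is measured.
		w.Reset()
		r.Reset(s)
		if ok := dat.Transduce(r, w); !ok {
			b.Fatal("Transduce failed")
		}
	}
}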