package datokenizer

import (
	"bytes"
	"regexp"
	"strings"
	"testing"

	"github.com/stretchr/testify/assert"
)

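// The tests below build double array tokenizers from foma transducer
// sources in testdata/ (or load precompiled .datok files) and check
// matching and transduction. In the output format, every token ends
// with a newline and an empty line marks a sentence boundary.
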
func TestSimpleString(t *testing.T) {
	assert := assert.New(t)

	// bau | bauamt
	tok := LoadFomaFile("testdata/bauamt.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.False(dat.Match("baum"))
}

func TestSimpleBranches(t *testing.T) {
	assert := assert.New(t)

	// (bau | wahl) (amt | en)
	tok := LoadFomaFile("testdata/wahlamt.fst")
	dat := tok.ToDoubleArray()
	assert.False(dat.Match("bau"))
	assert.True(dat.Match("bauamt"))
	assert.True(dat.Match("wahlamt"))
	assert.True(dat.Match("bauen"))
	assert.True(dat.Match("wahlen"))
	assert.False(dat.Match("baum"))
}

func TestSimpleTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

func TestSimpleTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()

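	// Transduce reads from an io.Reader and writes the tokenized
	// stream to a writer: one token per line, with an empty line
	// marking a sentence or text boundary.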
	r := strings.NewReader(" wald gehen Da kann\t man was \"erleben\"!")
	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("wald", tokens[0])
	assert.Equal("gehen", tokens[1])
	assert.Equal("Da", tokens[2])
	assert.Equal("kann", tokens[3])
	assert.Equal("man", tokens[4])
	assert.Equal("was", tokens[5])
	assert.Equal("\"erleben\"", tokens[6])

	r = strings.NewReader(" In den Wald gehen? -- Da kann\t man was \"erleben\"!")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("In", tokens[0])
	assert.Equal("den", tokens[1])
	assert.Equal("Wald", tokens[2])
	assert.Equal("gehen", tokens[3])
	assert.Equal("?", tokens[4])
	assert.Equal("--", tokens[5])

	r = strings.NewReader(" g? -- D")
	w.Reset()
	dat.Transduce(r, w)
	tokens = strings.Split(w.String(), "\n")
	assert.Equal("g", tokens[0])
	assert.Equal("?", tokens[1])
	assert.Equal("--", tokens[2])
	assert.Equal("D", tokens[3])
	assert.Equal("", tokens[4])
	assert.Equal("", tokens[5])
	assert.Equal(6, len(tokens))
}

func TestReadWriteTokenizer(t *testing.T) {
	assert := assert.New(t)
	tok := LoadFomaFile("testdata/simpletok.fst")
	dat := tok.ToDoubleArray()
	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))

	assert.True(dat.LoadFactor() >= 70)

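	// Serialize the double array to a buffer and parse it back;
	// the roundtripped tokenizer has to behave identically.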
	b := make([]byte, 0, 1024)
	buf := bytes.NewBuffer(b)
	n, err := dat.WriteTo(buf)
	assert.Nil(err)
	assert.Equal(int64(224), n)

	dat2 := ParseDatok(buf)
	assert.NotNil(dat2)
	assert.Equal(dat.array, dat2.array)
	assert.Equal(dat.sigma, dat2.sigma)
	assert.Equal(dat.epsilon, dat2.epsilon)
	assert.Equal(dat.unknown, dat2.unknown)
	assert.Equal(dat.identity, dat2.identity)
	assert.Equal(dat.final, dat2.final)
	assert.Equal(dat.LoadFactor(), dat2.LoadFactor())
	assert.True(dat2.Match("bau"))
	assert.True(dat2.Match("bad"))
	assert.True(dat2.Match("wald gehen"))
}

func TestFullTokenizer(t *testing.T) {
	assert := assert.New(t)
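	// Enable the block below to regenerate testdata/tokenizer.datok
	// from the foma source.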
	/*
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat := tok.ToDoubleArray()
		dat.Save("testdata/tokenizer.datok")
	*/
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)
	assert.True(dat.LoadFactor() >= 70)
	assert.Equal(dat.epsilon, 1)
	assert.Equal(dat.unknown, 2)
	assert.Equal(dat.identity, 3)
	assert.Equal(dat.final, 136)
	assert.Equal(len(dat.sigma), 131)
	assert.True(len(dat.array) > 3800000)
	assert.True(dat.maxSize > 3800000)

	assert.True(dat.Match("bau"))
	assert.True(dat.Match("bad"))
	assert.True(dat.Match("wald gehen"))
}

func TestFullTokenizerTransduce(t *testing.T) {
	assert := assert.New(t)

	var dat *DaTokenizer

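	// Flip the condition to true to rebuild testdata/tokenizer.datok
	// from the foma source before loading.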
	if false {
		tok := LoadFomaFile("testdata/tokenizer.fst")
		dat = tok.ToDoubleArray()
		dat.Save("testdata/tokenizer.datok")
	} else {
		dat = LoadDatokFile("testdata/tokenizer.datok")
	}
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

	assert.True(dat.Transduce(strings.NewReader("tra. u Du?"), w))

	tokens = strings.Split(w.String(), "\n")
	assert.Equal("tra\n.\n\nu\nDu\n?\n\n", w.String())
	assert.Equal("tra", tokens[0])
	assert.Equal(".", tokens[1])
	assert.Equal("", tokens[2])
	assert.Equal("u", tokens[3])
	assert.Equal("Du", tokens[4])
	assert.Equal("?", tokens[5])
	assert.Equal("", tokens[6])
	assert.Equal("", tokens[7])
	assert.Equal(8, len(tokens))

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("\"John Doe\"@xx.com"), w))
	assert.Equal("\"\nJohn\nDoe\n\"\n@xx\n.\n\ncom\n\n", w.String())
}

func TestFullTokenizerSentenceSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var sentences []string

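	// Sentence boundaries are marked by a double newline, so
	// splitting on "\n\n" yields a trailing empty string for the
	// final boundary (e.g. "Der\nalte\nMann\n.\n\n").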
	// testSentSplitterSimple
	assert.True(dat.Transduce(strings.NewReader("Der alte Mann."), w))
	sentences = strings.Split(w.String(), "\n\n")

	assert.Equal("Der\nalte\nMann\n.\n\n", w.String())
	assert.Equal("Der\nalte\nMann\n.", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Vorsitzende der Abk. hat gewählt."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Der\nVorsitzende\nder\nAbk.\nhat\ngewählt\n.", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader(""), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 1)
	assert.Equal("\n", sentences[0])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Gefunden auf wikipedia.org."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich bin unter korap@ids-mannheim.de erreichbar."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("Unsere\nWebsite\nist\nhttps://korap.ids-mannheim.de/?q=Baum", sentences[0])
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Unser Server ist 10.0.10.51."), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal("", sentences[1])
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Zu 50.4% ist es sicher"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Der Termin ist am 5.9.2018"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich habe die readme.txt heruntergeladen"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)
	assert.Equal("Ich\nhabe\ndie\nreadme.txt\nheruntergeladen", sentences[0])
	assert.Equal("", sentences[1])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ausschalten!!! Hast Du nicht gehört???"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 3)
	assert.Equal("Ausschalten\n!!!", sentences[0])
	assert.Equal("Hast\nDu\nnicht\ngehört\n???", sentences[1])
	assert.Equal("", sentences[2])

	w.Reset()
	assert.True(dat.Transduce(strings.NewReader("Ich wohne in der Weststr. und Du?"), w))
	sentences = strings.Split(w.String(), "\n\n")
	assert.Equal(len(sentences), 2)

	/*
		Test:
		assert.True(dat.Transduce(strings.NewReader("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\""), w))
	*/
}

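// tokenize transduces str with dat and returns the tokens as a
// slice, collapsing boundary newlines and dropping the trailing
// empty element; a failed transduction yields an empty slice.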
func tokenize(dat *DaTokenizer, w *bytes.Buffer, str string) []string {
	w.Reset()
	ok := dat.Transduce(strings.NewReader(str), w)
	if !ok {
		return []string{}
	}
	obj := regexp.MustCompile("\n+")

	tokens := obj.Split(w.String(), -1)
	return tokens[:len(tokens)-1]
}

func TestFullTokenizerTokenSplitter(t *testing.T) {
	assert := assert.New(t)
	dat := LoadDatokFile("testdata/tokenizer.datok")
	assert.NotNil(dat)

	b := make([]byte, 0, 2048)
	w := bytes.NewBuffer(b)
	var tokens []string

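	// The subtests below mirror test cases from the Java
	// KorAP-Tokenizer; known divergences are flagged in comments.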
	// testTokenizerSimple
	tokens = tokenize(dat, w, "Der alte Mann")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(len(tokens), 3)

	tokens = tokenize(dat, w, "Der alte Mann.")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "alte")
	assert.Equal(tokens[2], "Mann")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerAbbr
	tokens = tokenize(dat, w, "Der Vorsitzende der F.D.P. hat gewählt")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Vorsitzende")
	assert.Equal(tokens[2], "der")
	assert.Equal(tokens[3], "F.D.P.")
	assert.Equal(tokens[4], "hat")
	assert.Equal(tokens[5], "gewählt")
	assert.Equal(len(tokens), 6)
	// Ignored in KorAP-Tokenizer

	// testTokenizerHost1
	tokens = tokenize(dat, w, "Gefunden auf wikipedia.org")
	assert.Equal(tokens[0], "Gefunden")
	assert.Equal(tokens[1], "auf")
	assert.Equal(tokens[2], "wikipedia.org")
	assert.Equal(len(tokens), 3)

	// testTokenizerWwwHost
	tokens = tokenize(dat, w, "Gefunden auf www.wikipedia.org")
	assert.Equal("Gefunden", tokens[0])
	assert.Equal("auf", tokens[1])
	assert.Equal("www.wikipedia.org", tokens[2])
	assert.Equal(3, len(tokens))

	// testTokenizerWwwUrl
	tokens = tokenize(dat, w, "Weitere Infos unter www.info.biz/info")
	assert.Equal("www.info.biz/info", tokens[3])

	// testTokenizerFtpHost
	/*
		tokens = tokenize(dat, w, "Kann von ftp.download.org heruntergeladen werden")
		assert.Equal("Kann", tokens[0])
		assert.Equal("von", tokens[1])
		assert.Equal("ftp.download.org", tokens[2])
		assert.Equal(5, len(tokens))
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerDash
	tokens = tokenize(dat, w, "Das war -- spitze")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "--")
	assert.Equal(tokens[3], "spitze")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail1
	tokens = tokenize(dat, w, "Ich bin unter korap@ids-mannheim.de erreichbar.")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "bin")
	assert.Equal(tokens[2], "unter")
	assert.Equal(tokens[3], "korap@ids-mannheim.de")
	assert.Equal(tokens[4], "erreichbar")
	assert.Equal(tokens[5], ".")
	assert.Equal(len(tokens), 6)

	// testTokenizerEmail2
	tokens = tokenize(dat, w, "Oder unter korap[at]ids-mannheim[dot]de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap[at]ids-mannheim[dot]de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)

	// testTokenizerEmail3
	tokens = tokenize(dat, w, "Oder unter korap(at)ids-mannheim(dot)de.")
	assert.Equal(tokens[0], "Oder")
	assert.Equal(tokens[1], "unter")
	assert.Equal(tokens[2], "korap(at)ids-mannheim(dot)de")
	assert.Equal(tokens[3], ".")
	assert.Equal(len(tokens), 4)
	// Ignored in KorAP-Tokenizer

	// testTokenizerDoNotAcceptQuotedEmailNames
	tokens = tokenize(dat, w, "\"John Doe\"@xx.com")
	assert.Equal("\"", tokens[0])
	assert.Equal("John", tokens[1])
	assert.Equal("Doe", tokens[2])
	assert.Equal("\"", tokens[3])
	assert.Equal("@xx", tokens[4])
	assert.Equal(".", tokens[5]) // Differs - as the sentence splitter splits here!
	assert.Equal("com", tokens[6])
	assert.Equal(7, len(tokens))

	// testTokenizerTwitter
	tokens = tokenize(dat, w, "Folgt @korap und #korap")
	assert.Equal(tokens[0], "Folgt")
	assert.Equal(tokens[1], "@korap")
	assert.Equal(tokens[2], "und")
	assert.Equal(tokens[3], "#korap")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb1
	tokens = tokenize(dat, w, "Unsere Website ist https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[0], "Unsere")
	assert.Equal(tokens[1], "Website")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(len(tokens), 4)

	// testTokenizerWeb2
	tokens = tokenize(dat, w, "Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)")
	assert.Equal(tokens[0], "Wir")
	assert.Equal(tokens[1], "sind")
	assert.Equal(tokens[2], "auch")
	assert.Equal(tokens[3], "im")
	assert.Equal(tokens[4], "Internet")
	assert.Equal(tokens[5], "(")
	assert.Equal(tokens[6], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[7], ")")
	assert.Equal(len(tokens), 8)
	// Ignored in KorAP-Tokenizer

	// testTokenizerWeb3
	tokens = tokenize(dat, w, "Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.")
	assert.Equal(tokens[0], "Die")
	assert.Equal(tokens[1], "Adresse")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "https://korap.ids-mannheim.de/?q=Baum")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)
	// Ignored in KorAP-Tokenizer

	// testTokenizerServer
	tokens = tokenize(dat, w, "Unser Server ist 10.0.10.51.")
	assert.Equal(tokens[0], "Unser")
	assert.Equal(tokens[1], "Server")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "10.0.10.51")
	assert.Equal(tokens[4], ".")
	assert.Equal(len(tokens), 5)

	// testTokenizerNum
	tokens = tokenize(dat, w, "Zu 50,4% ist es sicher")
	assert.Equal(tokens[0], "Zu")
	assert.Equal(tokens[1], "50,4%")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "es")
	assert.Equal(tokens[4], "sicher")
	assert.Equal(len(tokens), 5)
	// Differs from KorAP-Tokenizer

	// testTokenizerDate
	tokens = tokenize(dat, w, "Der Termin ist am 5.9.2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5.9.2018")
	assert.Equal(len(tokens), 5)

	tokens = tokenize(dat, w, "Der Termin ist am 5/9/2018")
	assert.Equal(tokens[0], "Der")
	assert.Equal(tokens[1], "Termin")
	assert.Equal(tokens[2], "ist")
	assert.Equal(tokens[3], "am")
	assert.Equal(tokens[4], "5/9/2018")
	assert.Equal(len(tokens), 5)

	// testTokenizerDateRange
	/*
		tokens = tokenize(dat, w, "Der Termin war vom 4.-5.9.2018")
		assert.Equal(tokens[0], "Der")
		assert.Equal(tokens[1], "Termin")
		assert.Equal(tokens[2], "war")
		assert.Equal(tokens[3], "vom")
		assert.Equal(tokens[4], "4.")
		assert.Equal(tokens[5], "-")
		assert.Equal(tokens[6], "5.9.2018")
		assert.Equal(len(tokens), 7)
		// Ignored in KorAP-Tokenizer
	*/

	// testTokenizerEmoji1
	tokens = tokenize(dat, w, "Das ist toll! ;)")
	assert.Equal(tokens[0], "Das")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "toll")
	assert.Equal(tokens[3], "!")
	assert.Equal(tokens[4], ";)")
	assert.Equal(len(tokens), 5)

	// testTokenizerRef1
	tokens = tokenize(dat, w, "Kupietz und Schmidt (2018): Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "(2018)")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerRef2
	tokens = tokenize(dat, w, "Kupietz und Schmidt [2018]: Korpuslinguistik")
	assert.Equal(tokens[0], "Kupietz")
	assert.Equal(tokens[1], "und")
	assert.Equal(tokens[2], "Schmidt")
	assert.Equal(tokens[3], "[2018]")
	assert.Equal(tokens[4], ":")
	assert.Equal(tokens[5], "Korpuslinguistik")
	assert.Equal(len(tokens), 6)
	// Differs from KorAP-Tokenizer!

	// testTokenizerOmission1
	tokens = tokenize(dat, w, "Er ist ein A****loch!")
	assert.Equal(tokens[0], "Er")
	assert.Equal(tokens[1], "ist")
	assert.Equal(tokens[2], "ein")
	assert.Equal(tokens[3], "A****loch")
	assert.Equal(tokens[4], "!")
	assert.Equal(len(tokens), 5)

	// testTokenizerOmission2
	tokens = tokenize(dat, w, "F*ck!")
	assert.Equal(tokens[0], "F*ck")
	assert.Equal(tokens[1], "!")
	assert.Equal(len(tokens), 2)

	// testTokenizerOmission3
	tokens = tokenize(dat, w, "Dieses verf***** Kleid!")
	assert.Equal(tokens[0], "Dieses")
	assert.Equal(tokens[1], "verf*****")
	assert.Equal(tokens[2], "Kleid")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension1
	tokens = tokenize(dat, w, "Ich habe die readme.txt heruntergeladen")
	assert.Equal(tokens[0], "Ich")
	assert.Equal(tokens[1], "habe")
	assert.Equal(tokens[2], "die")
	assert.Equal(tokens[3], "readme.txt")
	assert.Equal(tokens[4], "heruntergeladen")
	assert.Equal(len(tokens), 5)

	// Probably interpreted as HOST
	// testTokenizerFileExtension2
	tokens = tokenize(dat, w, "Nimm die README.TXT!")
	assert.Equal(tokens[0], "Nimm")
	assert.Equal(tokens[1], "die")
	assert.Equal(tokens[2], "README.TXT")
	assert.Equal(tokens[3], "!")
	assert.Equal(len(tokens), 4)

	// Probably interpreted as HOST
	// testTokenizerFileExtension3
	tokens = tokenize(dat, w, "Zeig mir profile.jpeg")
	assert.Equal(tokens[0], "Zeig")
	assert.Equal(tokens[1], "mir")
	assert.Equal(tokens[2], "profile.jpeg")
	assert.Equal(len(tokens), 3)

	// testTokenizerFile1
	/*
		tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.docx")
		assert.Equal(tokens[0], "Zeig")
		assert.Equal(tokens[1], "mir")
		assert.Equal(tokens[2], "c:\\Dokumente\\profile.docx")
		assert.Equal(len(tokens), 3)

		// testTokenizerFile2
		tokens = tokenize(dat, w, "Gehe zu /Dokumente/profile.docx")
		assert.Equal(tokens[0], "Gehe")
		assert.Equal(tokens[1], "zu")
		assert.Equal(tokens[2], "/Dokumente/profile.docx")
		assert.Equal(len(tokens), 3)

		// testTokenizerFile3
		tokens = tokenize(dat, w, "Zeig mir c:\\Dokumente\\profile.jpeg")
		assert.Equal(tokens[0], "Zeig")
		assert.Equal(tokens[1], "mir")
		assert.Equal(tokens[2], "c:\\Dokumente\\profile.jpeg")
		assert.Equal(len(tokens), 3)
		// Ignored in KorAP-Tokenizer
	*/

	/*
		// testTokenizerPunct
		tokens = tokenize(dat, w, "Er sagte: \"Es geht mir gut!\", daraufhin ging er.")
		assert.Equal(tokens[0], "Er")
		assert.Equal(tokens[1], "sagte")
		assert.Equal(tokens[2], ":")
		assert.Equal(tokens[3], "\"")
		assert.Equal(tokens[4], "Es")
		assert.Equal(tokens[5], "geht")
		assert.Equal(tokens[6], "mir")
		assert.Equal(tokens[7], "gut")
		assert.Equal(tokens[8], "!")
		assert.Equal(tokens[9], "\"")
		assert.Equal(tokens[10], ",")
		assert.Equal(tokens[11], "daraufhin")
		assert.Equal(tokens[12], "ging")
		assert.Equal(tokens[13], "er")
		assert.Equal(tokens[14], ".")
		assert.Equal(len(tokens), 15)
	*/

	// testTokenizerPlusAmpersand
	/*
		tokens = tokenize(dat, w, "\"Das ist von C&A!\"")
		assert.Equal(tokens[0], "\"")
		assert.Equal(tokens[1], "Das")
		assert.Equal(tokens[2], "ist")
		assert.Equal(tokens[3], "von")
		assert.Equal(tokens[4], "C&A")
		assert.Equal(tokens[5], "!")
		assert.Equal(tokens[6], "\"")
		assert.Equal(len(tokens), 7)
	*/

	// testTokenizerLongEnd
	tokens = tokenize(dat, w, "Siehst Du?!!?")
	assert.Equal(tokens[0], "Siehst")
	assert.Equal(tokens[1], "Du")
	assert.Equal(tokens[2], "?!!?")
	assert.Equal(len(tokens), 3)

	// testTokenizerIrishO
	tokens = tokenize(dat, w, "Peter O'Toole")
	assert.Equal(tokens[0], "Peter")
	assert.Equal(tokens[1], "O'Toole")
	assert.Equal(len(tokens), 2)

	// testTokenizerAbr
	tokens = tokenize(dat, w, "Früher bzw. später ...")
	assert.Equal(tokens[0], "Früher")
	assert.Equal(tokens[1], "bzw.")
	assert.Equal(tokens[2], "später")
	assert.Equal(tokens[3], "...")
	assert.Equal(len(tokens), 4)

	// testTokenizerUppercaseRule
	tokens = tokenize(dat, w, "Es war spät.Morgen ist es früh.")
	assert.Equal(tokens[0], "Es")
	assert.Equal(tokens[1], "war")
	assert.Equal(tokens[2], "spät")
	assert.Equal(tokens[3], ".")
	assert.Equal(tokens[4], "Morgen")
	assert.Equal(tokens[5], "ist")
	assert.Equal(tokens[6], "es")
	assert.Equal(tokens[7], "früh")
	assert.Equal(tokens[8], ".")
	assert.Equal(len(tokens), 9)
	// Ignored in KorAP-Tokenizer

	// testTokenizerOrd
	/*
		tokens = tokenize(dat, w, "Sie erreichte den 1. Platz!")
		assert.Equal(tokens[0], "Sie")
		assert.Equal(tokens[1], "erreichte")
		assert.Equal(tokens[2], "den")
		assert.Equal(tokens[3], "1.")
		assert.Equal(tokens[4], "Platz")
		assert.Equal(tokens[5], "!")
		assert.Equal(len(tokens), 6)
	*/

	// testNoZipOuputArchive
	tokens = tokenize(dat, w, "Archive: Ich bin kein zip\n")
	assert.Equal(tokens[0], "Archive")
	assert.Equal(tokens[1], ":")
	assert.Equal(tokens[2], "Ich")
	assert.Equal(tokens[3], "bin")
	assert.Equal(tokens[4], "kein")
	assert.Equal(tokens[5], "zip")
	assert.Equal(6, len(tokens))

	// testTokenizerStrasse
	tokens = tokenize(dat, w, "Ich wohne in der Weststr. und Du?")
	assert.Equal(tokens[4], "Weststr.")
	assert.Equal(8, len(tokens))

	// germanTokenizerKnowsGermanOmissionWords
	tokens = tokenize(dat, w, "D'dorf Ku'damm Lu'hafen M'gladbach W'schaft")
	assert.Equal("D'dorf", tokens[0])
	assert.Equal("Ku'damm", tokens[1])
	assert.Equal("Lu'hafen", tokens[2])
	assert.Equal("M'gladbach", tokens[3])
	assert.Equal("W'schaft", tokens[4])
	assert.Equal(5, len(tokens))

	// germanTokenizerDoesNOTSeparateGermanContractions
	tokens = tokenize(dat, w, "mach's macht's was'n ist's haste willste kannste biste kriegste")
	assert.Equal("mach's", tokens[0])
	assert.Equal("macht's", tokens[1])
	assert.Equal("was'n", tokens[2])
	assert.Equal("ist's", tokens[3])
	assert.Equal("haste", tokens[4])
	assert.Equal("willste", tokens[5])
	assert.Equal("kannste", tokens[6])
	assert.Equal("biste", tokens[7])
	assert.Equal("kriegste", tokens[8])
	assert.Equal(9, len(tokens))

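	// The following test cases are carried over from the Java
	// KorAP-Tokenizer test suite and are not yet ported to datok.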
	/*
		@Test
		public void englishTokenizerSeparatesEnglishContractionsAndClitics () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "I've we'll you'd I'm we're Peter's isn't")
			assert.Equal("'ve", tokens[1]);
			assert.Equal("'ll", tokens[3]);
			assert.Equal("'d", tokens[5]);
			assert.Equal("'m", tokens[7]);
			assert.Equal("'re", tokens[9]);
			assert.Equal("'s", tokens[11]);
			assert.Equal("is", tokens[12]);
			assert.Equal("n't", tokens[13]);
			assert.Equal(14, len(tokens));
		}

		@Test
		public void frenchTokenizerKnowsFrenchAbbreviations () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "Approx. en juill. 2004 mon prof. M. Foux m'a dit qu'il faut faire exerc. no. 4, et lire pp. 27-30.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("juill.", tokens[2]);
			assert.Equal("prof.", tokens[5]);
			assert.Equal("exerc.", tokens[15]);
			assert.Equal("no.", tokens[16]);
			assert.Equal("pp.", tokens[21]);
		}

		@Test
		public void frenchTokenizerKnowsFrenchContractions () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île")
			assert.Equal("J'", tokens[0]);
			assert.Equal("j'", tokens[2]);
			assert.Equal("qu'", tokens[4]);
			assert.Equal("d'", tokens[6]);
			assert.Equal("jusqu'", tokens[8]);
			assert.Equal("Aujourd'hui", tokens[10]);
			assert.Equal("D'", tokens[11]); // ’
			assert.Equal("Quelqu'un", tokens[13]); // ’
			assert.Equal("Presqu'île", tokens[14]); // ’
		}

		@Test
		public void frenchTokenizerKnowsFrenchClitics () {
			DerekoDfaTokenizer_fr tok = new DerekoDfaTokenizer_fr();
			tokens = tokenize(dat, w, "suis-je sont-elles ")
			assert.Equal("suis", tokens[0]);
			assert.Equal("-je", tokens[1]);
			assert.Equal("sont", tokens[2]);
			assert.Equal("-elles", tokens[3]);
		}

		@Test
		public void testEnglishTokenizerScienceAbbreviations () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "Approx. in Sept. 1954, Assoc. Prof. Dr. R. J. Ewing reviewed articles on Enzymol. Bacteriol. effects later published in Nutr. Rheumatol. No. 12 and Nº. 13., pp. 17-18.")
			assert.Equal("Approx.", tokens[0]);
			assert.Equal("in", tokens[1]);
			assert.Equal("Sept.", tokens[2]);
			assert.Equal("1954", tokens[3]);
			assert.Equal(",", tokens[4]);
			assert.Equal("Assoc.", tokens[5]);
			assert.Equal("Prof.", tokens[6]);
			assert.Equal("Dr.", tokens[7]);
			assert.Equal("R.", tokens[8]);
			assert.Equal("J.", tokens[9]);
			assert.Equal("Ewing", tokens[10]);
			assert.Equal("reviewed", tokens[11]);
			assert.Equal("articles", tokens[12]);
			assert.Equal("on", tokens[13]);
			assert.Equal("Enzymol.", tokens[14]);
			assert.Equal("Bacteriol.", tokens[15]);
			assert.Equal("effects", tokens[16]);
			assert.Equal("later", tokens[17]);
			assert.Equal("published", tokens[18]);
			assert.Equal("in", tokens[19]);
			assert.Equal("Nutr.", tokens[20]);
			assert.Equal("Rheumatol.", tokens[21]);
			assert.Equal("No.", tokens[22]);
			assert.Equal("12", tokens[23]);
			assert.Equal("and", tokens[24]);
			assert.Equal("Nº.", tokens[25]);
			assert.Equal("13.", tokens[26]);
			assert.Equal(",", tokens[27]);
			assert.Equal("pp.", tokens[28]);
			assert.Equal("17-18", tokens[29]);
			assert.Equal(".", tokens[30]);
		}

		@Test
		public void englishTokenizerCanGuessWhetherIIsAbbrev () {
			DerekoDfaTokenizer_en tok = new DerekoDfaTokenizer_en();
			tokens = tokenize(dat, w, "M. I. Baxter was born during World War I. So was I. He went to the Peter I. Hardy school. So did I.")
			assert.Equal("I.", tokens[1]);
			assert.Equal("I", tokens[8]);
			assert.Equal(".", tokens[9]);
			assert.Equal("I", tokens[12]);
			assert.Equal(".", tokens[13]);
		}

		@Test
		public void testZipOuputArchive () {
			final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
			System.setOut(new PrintStream(clearOut));
			tokens = tokenize(dat, w, "Archive: ich/bin/ein.zip\n")
			assert.Equal(0, len(tokens));
		}
	*/
	/*
		@Test
		public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
			DerekoDfaTokenizer_de tok = (DerekoDfaTokenizer_de) new KorapTokenizer.Builder()
				.tokenizerClassName(DerekoDfaTokenizer_de.class.getName())
				.printOffsets(true)
				.build();
			Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
			assert.Equal("Text1", tokens[0].getType());
			assert.Equal(len(tokens), 9);
		}
	*/
}