blob: 28cae6fb7620957587456bb587bf2cd2d156564c [file] [log] [blame]
Marc Kupietz33677732020-09-04 22:07:39 +02001package de.ids_mannheim.korap.tokenizer;
2
3import static org.junit.Assert.*;
4import org.junit.Test;
5import org.junit.Ignore;
6import org.junit.runner.RunWith;
7import org.junit.runners.JUnit4;
8
9@RunWith(JUnit4.class)
10public class TokenizerTest {
11
12 @Test
13 public void testTokenizerSimple () {
14 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
15 String[] tokens = tok.tokenize("Der alte Mann");
16 assertEquals(tokens[0], "Der");
17 assertEquals(tokens[1], "alte");
18 assertEquals(tokens[2], "Mann");
19 assertEquals(tokens.length, 3);
20
21 tokens = tok.tokenize("Der alte Mann.");
22 assertEquals(tokens[0], "Der");
23 assertEquals(tokens[1], "alte");
24 assertEquals(tokens[2], "Mann");
25 assertEquals(tokens[3], ".");
26 assertEquals(tokens.length, 4);
27 }
28
29 @Test
30 @Ignore
31 public void testTokenizerAbbr () {
32 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
33 String[] tokens = tok.tokenize("Der Vorsitzende der F.D.P. hat gewählt");
34 assertEquals(tokens[0], "Der");
35 assertEquals(tokens[1], "Vorsitzende");
36 assertEquals(tokens[2], "der");
37 assertEquals(tokens[3], "F.D.P.");
38 assertEquals(tokens[4], "hat");
39 assertEquals(tokens[5], "gewählt");
40 assertEquals(tokens.length, 6);
41 }
42
43 @Test
44 public void testTokenizerHost1 () {
45 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
46 String[] tokens = tok.tokenize("Gefunden auf wikipedia.org");
47 assertEquals(tokens[0], "Gefunden");
48 assertEquals(tokens[1], "auf");
49 assertEquals(tokens[2], "wikipedia.org");
50 assertEquals(tokens.length, 3);
51 }
52
53 @Test
54 @Ignore
55 public void testTokenizerHost2 () {
56 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
57 String[] tokens = tok.tokenize("Gefunden auf www.wikipedia.org");
58 assertEquals(tokens[0], "Gefunden");
59 assertEquals(tokens[1], "auf");
60 assertEquals(tokens[2], "www.wikipedia.org");
61 assertEquals(tokens.length, 3);
62 }
63
64 @Test
65 public void testTokenizerDash () {
66 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
67 String[] tokens = tok.tokenize("Das war -- spitze");
68 assertEquals(tokens[0], "Das");
69 assertEquals(tokens[1], "war");
70 assertEquals(tokens[2], "--");
71 assertEquals(tokens[3], "spitze");
72 assertEquals(tokens.length, 4);
73 }
74
75 @Test
76 public void testTokenizerEmail1 () {
77 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
78 String[] tokens = tok.tokenize("Ich bin unter korap@ids-mannheim.de erreichbar.");
79 assertEquals(tokens[0], "Ich");
80 assertEquals(tokens[1], "bin");
81 assertEquals(tokens[2], "unter");
82 assertEquals(tokens[3], "korap@ids-mannheim.de");
83 assertEquals(tokens[4], "erreichbar");
84 assertEquals(tokens[5], ".");
85 assertEquals(tokens.length, 6);
86 }
87
88 @Test
89 public void testTokenizerEmail2 () {
90 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
91 String[] tokens = tok.tokenize("Oder unter korap[at]ids-mannheim[dot]de.");
92 assertEquals(tokens[0], "Oder");
93 assertEquals(tokens[1], "unter");
94 assertEquals(tokens[2], "korap[at]ids-mannheim[dot]de");
95 assertEquals(tokens[3], ".");
96 assertEquals(tokens.length, 4);
97 }
98
99 @Test
100 @Ignore
101 public void testTokenizerEmail3 () {
102 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
103 String[] tokens = tok.tokenize("Oder unter korap(at)ids-mannheim(dot)de.");
104 assertEquals(tokens[0], "Oder");
105 assertEquals(tokens[1], "unter");
106 assertEquals(tokens[2], "korap(at)ids-mannheim(dot)de");
107 assertEquals(tokens[3], ".");
108 assertEquals(tokens.length, 4);
109 }
110
111 @Test
112 public void testTokenizerTwitter () {
113 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
114 String[] tokens = tok.tokenize("Folgt @korap und #korap");
115 assertEquals(tokens[0], "Folgt");
116 assertEquals(tokens[1], "@korap");
117 assertEquals(tokens[2], "und");
118 assertEquals(tokens[3], "#korap");
119 assertEquals(tokens.length, 4);
120 }
121
122 @Test
123 public void testTokenizerWeb1 () {
124 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
125 String[] tokens = tok.tokenize("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum");
126 assertEquals(tokens[0], "Unsere");
127 assertEquals(tokens[1], "Website");
128 assertEquals(tokens[2], "ist");
129 assertEquals(tokens[3], "https://korap.ids-mannheim.de/?q=Baum");
130 assertEquals(tokens.length, 4);
131 }
132
133 @Test
134 @Ignore
135 public void testTokenizerWeb2 () {
136 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
137 String[] tokens = tok.tokenize("Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)");
138 assertEquals(tokens[0], "Wir");
139 assertEquals(tokens[1], "sind");
140 assertEquals(tokens[2], "auch");
141 assertEquals(tokens[3], "im");
142 assertEquals(tokens[4], "Internet");
143 assertEquals(tokens[5], "(");
144 assertEquals(tokens[6], "https://korap.ids-mannheim.de/?q=Baum");
145 assertEquals(tokens[7], ")");
146 assertEquals(tokens.length, 8);
147 }
148
149 @Test
150 @Ignore
151 public void testTokenizerWeb3 () {
152 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
153 String[] tokens = tok.tokenize("Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.");
154 assertEquals(tokens[0], "Die");
155 assertEquals(tokens[1], "Adresse");
156 assertEquals(tokens[2], "ist");
157 assertEquals(tokens[3], "https://korap.ids-mannheim.de/?q=Baum");
158 assertEquals(tokens[4], ".");
159 assertEquals(tokens.length, 8);
160 }
161
162 @Test
163 public void testTokenizerServer () {
164 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
165 String[] tokens = tok.tokenize("Unser Server ist 10.0.10.51.");
166 assertEquals(tokens[0], "Unser");
167 assertEquals(tokens[1], "Server");
168 assertEquals(tokens[2], "ist");
169 assertEquals(tokens[3], "10.0.10.51");
170 assertEquals(tokens[4], ".");
171 assertEquals(tokens.length, 5);
172 }
173
174 @Test
175 public void testTokenizerNum () {
176 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
177 String[] tokens = tok.tokenize("Zu 50,4% ist es sicher");
178 assertEquals(tokens[0], "Zu");
179 assertEquals(tokens[1], "50,4");
180 assertEquals(tokens[2], "%"); // Arguable
181 assertEquals(tokens[3], "ist");
182 assertEquals(tokens[4], "es");
183 assertEquals(tokens[5], "sicher");
184 assertEquals(tokens.length, 6);
185 }
186
187 @Test
188 public void testTokenizerDate () {
189 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
190 String[] tokens = tok.tokenize("Der Termin ist am 5.9.2018");
191 assertEquals(tokens[0], "Der");
192 assertEquals(tokens[1], "Termin");
193 assertEquals(tokens[2], "ist");
194 assertEquals(tokens[3], "am");
195 assertEquals(tokens[4], "5.9.2018");
196 assertEquals(tokens.length, 5);
197
198 tokens = tok.tokenize("Der Termin ist am 5/9/2018");
199 assertEquals(tokens[0], "Der");
200 assertEquals(tokens[1], "Termin");
201 assertEquals(tokens[2], "ist");
202 assertEquals(tokens[3], "am");
203 assertEquals(tokens[4], "5/9/2018");
204 assertEquals(tokens.length, 5);
205 }
206
207 @Test
208 @Ignore
209 public void testTokenizerDateRange () {
210 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
211 String[] tokens = tok.tokenize("Der Termin war vom 4.-5.9.2018");
212 assertEquals(tokens[0], "Der");
213 assertEquals(tokens[1], "Termin");
214 assertEquals(tokens[2], "war");
215 assertEquals(tokens[3], "vom");
216 assertEquals(tokens[4], "4.");
217 assertEquals(tokens[5], "-");
218 assertEquals(tokens[6], "5.9.2018");
219 assertEquals(tokens.length, 7);
220 }
221
222 @Test
223 public void testTokenizerEmoji1 () {
224 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
225 String[] tokens = tok.tokenize("Das ist toll! ;)");
226 assertEquals(tokens[0], "Das");
227 assertEquals(tokens[1], "ist");
228 assertEquals(tokens[2], "toll");
229 assertEquals(tokens[3], "!");
230 assertEquals(tokens[4], ";)");
231 assertEquals(tokens.length, 5);
232 }
233
234 @Test
235 public void testTokenizerRef1 () {
236 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
237 String[] tokens = tok.tokenize("Kupietz und Schmidt (2018): Korpuslinguistik");
238 assertEquals(tokens[0], "Kupietz");
239 assertEquals(tokens[1], "und");
240 assertEquals(tokens[2], "Schmidt");
241 assertEquals(tokens[3], "(");
242 assertEquals(tokens[4], "2018");
243 assertEquals(tokens[5], ")");
244 assertEquals(tokens[6], ":");
245 assertEquals(tokens[7], "Korpuslinguistik");
246 assertEquals(tokens.length, 8);
247 }
248
249 @Test
250 public void testTokenizerRef2 () {
251 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
252 String[] tokens = tok.tokenize("Kupietz und Schmidt [2018]: Korpuslinguistik");
253 assertEquals(tokens[0], "Kupietz");
254 assertEquals(tokens[1], "und");
255 assertEquals(tokens[2], "Schmidt");
256 assertEquals(tokens[3], "[");
257 assertEquals(tokens[4], "2018");
258 assertEquals(tokens[5], "]");
259 assertEquals(tokens[6], ":");
260 assertEquals(tokens[7], "Korpuslinguistik");
261 assertEquals(tokens.length, 8);
262 }
263
264 @Test
265 public void testTokenizerOmission1 () {
266 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
267 String[] tokens = tok.tokenize("Er ist ein A****loch!");
268 assertEquals(tokens[0], "Er");
269 assertEquals(tokens[1], "ist");
270 assertEquals(tokens[2], "ein");
271 assertEquals(tokens[3], "A****loch");
272 assertEquals(tokens[4], "!");
273 assertEquals(tokens.length, 5);
274 }
275
276 @Test
277 public void testTokenizerOmission2 () {
278 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
279 String[] tokens = tok.tokenize("F*ck!");
280 assertEquals(tokens[0], "F*ck");
281 assertEquals(tokens[1], "!");
282 assertEquals(tokens.length, 2);
283 }
284
285 @Test
286 public void testTokenizerOmission3 () {
287 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
288 String[] tokens = tok.tokenize("Dieses verf***** Kleid!");
289 assertEquals(tokens[0], "Dieses");
290 assertEquals(tokens[1], "verf*****");
291 assertEquals(tokens[2], "Kleid");
292 assertEquals(tokens[3], "!");
293 assertEquals(tokens.length, 4);
294 }
295
296 @Test
297 // Probably interpreted as HOST
298 public void testTokenizerFileExtension1 () {
299 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
300 String[] tokens = tok.tokenize("Ich habe die readme.txt heruntergeladen");
301 assertEquals(tokens[0], "Ich");
302 assertEquals(tokens[1], "habe");
303 assertEquals(tokens[2], "die");
304 assertEquals(tokens[3], "readme.txt");
305 assertEquals(tokens[4], "heruntergeladen");
306 assertEquals(tokens.length, 5);
307 }
308
309 @Test
310 // Probably interpreted as HOST
311 public void testTokenizerFileExtension2 () {
312 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
313 String[] tokens = tok.tokenize("Nimm die README.TXT!");
314 assertEquals(tokens[0], "Nimm");
315 assertEquals(tokens[1], "die");
316 assertEquals(tokens[2], "README.TXT");
317 assertEquals(tokens[3], "!");
318 assertEquals(tokens.length, 4);
319 }
320
321 @Test
322 // Probably interpreted as HOST
323 public void testTokenizerFileExtension3 () {
324 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
325 String[] tokens = tok.tokenize("Zeig mir profile.jpeg");
326 assertEquals(tokens[0], "Zeig");
327 assertEquals(tokens[1], "mir");
328 assertEquals(tokens[2], "profile.jpeg");
329 assertEquals(tokens.length, 3);
330 }
331
332 @Test
333 public void testTokenizerFile1 () {
334 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
335 String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.docx");
336 assertEquals(tokens[0], "Zeig");
337 assertEquals(tokens[1], "mir");
338 assertEquals(tokens[2], "c:\\Dokumente\\profile.docx");
339 assertEquals(tokens.length, 3);
340 }
341
342 @Test
343 public void testTokenizerFile2 () {
344 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
345 String[] tokens = tok.tokenize("Gehe zu /Dokumente/profile.docx");
346 assertEquals(tokens[0], "Gehe");
347 assertEquals(tokens[1], "zu");
348 assertEquals(tokens[2], "/Dokumente/profile.docx");
349 assertEquals(tokens.length, 3);
350 }
351
352 @Test
353 @Ignore
354 public void testTokenizerFile3 () {
355 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
356 String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.jpeg");
357 assertEquals(tokens[0], "Zeig");
358 assertEquals(tokens[1], "mir");
359 assertEquals(tokens[2], "c:\\Dokumente\\profile.jpeg");
360 assertEquals(tokens.length, 3);
361 }
362
363 @Test
364 public void testTokenizerPunct () {
365 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
366 String[] tokens = tok.tokenize("Er sagte: \"Es geht mir gut!\", daraufhin ging er.");
367 assertEquals(tokens[0], "Er");
368 assertEquals(tokens[1], "sagte");
369 assertEquals(tokens[2], ":");
370 assertEquals(tokens[3], "\"");
371 assertEquals(tokens[4], "Es");
372 assertEquals(tokens[5], "geht");
373 assertEquals(tokens[6], "mir");
374 assertEquals(tokens[7], "gut");
375 assertEquals(tokens[8], "!");
376 assertEquals(tokens[9], "\"");
377 assertEquals(tokens[10], ",");
378 assertEquals(tokens[11], "daraufhin");
379 assertEquals(tokens[12], "ging");
380 assertEquals(tokens[13], "er");
381 assertEquals(tokens[14], ".");
382 assertEquals(tokens.length, 15);
383 }
384
385 @Test
386 public void testTokenizerPlusAmpersand () {
387 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
388 String[] tokens = tok.tokenize(""Das ist von C&A!"");
389 assertEquals(tokens[0], """);
390 assertEquals(tokens[1], "Das");
391 assertEquals(tokens[2], "ist");
392 assertEquals(tokens[3], "von");
393 assertEquals(tokens[4], "C&A");
394 assertEquals(tokens[5], "!");
395 assertEquals(tokens[6], """);
396 assertEquals(tokens.length, 7);
397 }
398
399 @Test
400 public void testTokenizerLongEnd () {
401 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
402 String[] tokens = tok.tokenize("Siehst Du?!!?");
403 assertEquals(tokens[0], "Siehst");
404 assertEquals(tokens[1], "Du");
405 assertEquals(tokens[2], "?!!?");
406 assertEquals(tokens.length, 3);
407 }
408
409 @Test
410 public void testTokenizerIrishO () {
411 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
412 String[] tokens = tok.tokenize("Peter O'Toole");
413 assertEquals(tokens[0], "Peter");
414 assertEquals(tokens[1], "O'Toole");
415 assertEquals(tokens.length, 2);
416 }
417
418 @Test
419 public void testTokenizerAbr () {
420 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
421 String[] tokens = tok.tokenize("Früher bzw. später ...");
422 assertEquals(tokens[0], "Früher");
423 assertEquals(tokens[1], "bzw.");
424 assertEquals(tokens[2], "später");
425 assertEquals(tokens[3], "...");
426 assertEquals(tokens.length, 4);
427 }
428
429 @Test
430 @Ignore
431 public void testTokenizerUppercaseRule () {
432 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
433 String[] tokens = tok.tokenize("Es war spät.Morgen ist es früh.");
434 assertEquals(tokens[0], "Es");
435 assertEquals(tokens[1], "war");
436 assertEquals(tokens[2], "spät");
437 assertEquals(tokens[3], ".");
438 assertEquals(tokens[4], "Morgen");
439 assertEquals(tokens[5], "ist");
440 assertEquals(tokens[6], "es");
441 assertEquals(tokens[7], "früh");
442 assertEquals(tokens[8], ".");
443 assertEquals(tokens.length, 9);
444 }
445
446 @Test
447 public void testTokenizerOrd () {
448 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
449 String[] tokens = tok.tokenize("Sie erreichte den 1. Platz!");
450 assertEquals(tokens[0], "Sie");
451 assertEquals(tokens[1], "erreichte");
452 assertEquals(tokens[2], "den");
453 assertEquals(tokens[3], "1.");
454 assertEquals(tokens[4], "Platz");
455 assertEquals(tokens[5], "!");
456 assertEquals(tokens.length, 6);
457 }
458
459 @Test
460 public void testNoZipOuputArchive () {
461 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
462 String[] tokens = tok.tokenize("Archive: Ich bin kein zip\n");
463 assertEquals(tokens[0], "Archive");
464 assertEquals(tokens[1], ":");
465 assertEquals(tokens[2], "Ich");
466 assertEquals(tokens[3], "bin");
467 assertEquals(tokens[4], "kein");
468 assertEquals(tokens[5], "zip");
469 assertEquals(6, tokens.length);
470 }
471
472 @Test
473 public void testZipOuputArchive () {
474 KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
475 String[] tokens = tok.tokenize("Archive: ich/bin/ein.zip\n");
476 assertEquals(0, tokens.length);
477 }
478}