package de.ids_mannheim.korap.tokenizer;

import static org.junit.Assert.*;
import org.junit.Test;
import org.junit.Ignore;
import org.junit.runner.RunWith;
import org.junit.runners.JUnit4;

| 9 | @RunWith(JUnit4.class) |
| 10 | public class TokenizerTest { |
| 11 | |
| 12 | @Test |
| 13 | public void testTokenizerSimple () { |
| 14 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 15 | String[] tokens = tok.tokenize("Der alte Mann"); |
| 16 | assertEquals(tokens[0], "Der"); |
| 17 | assertEquals(tokens[1], "alte"); |
| 18 | assertEquals(tokens[2], "Mann"); |
| 19 | assertEquals(tokens.length, 3); |
| 20 | |
| 21 | tokens = tok.tokenize("Der alte Mann."); |
| 22 | assertEquals(tokens[0], "Der"); |
| 23 | assertEquals(tokens[1], "alte"); |
| 24 | assertEquals(tokens[2], "Mann"); |
| 25 | assertEquals(tokens[3], "."); |
| 26 | assertEquals(tokens.length, 4); |
| 27 | } |
| 28 | |
| 29 | @Test |
| 30 | @Ignore |
| 31 | public void testTokenizerAbbr () { |
| 32 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 33 | String[] tokens = tok.tokenize("Der Vorsitzende der F.D.P. hat gewählt"); |
| 34 | assertEquals(tokens[0], "Der"); |
| 35 | assertEquals(tokens[1], "Vorsitzende"); |
| 36 | assertEquals(tokens[2], "der"); |
| 37 | assertEquals(tokens[3], "F.D.P."); |
| 38 | assertEquals(tokens[4], "hat"); |
| 39 | assertEquals(tokens[5], "gewählt"); |
| 40 | assertEquals(tokens.length, 6); |
| 41 | } |
| 42 | |
| 43 | @Test |
| 44 | public void testTokenizerHost1 () { |
| 45 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 46 | String[] tokens = tok.tokenize("Gefunden auf wikipedia.org"); |
| 47 | assertEquals(tokens[0], "Gefunden"); |
| 48 | assertEquals(tokens[1], "auf"); |
| 49 | assertEquals(tokens[2], "wikipedia.org"); |
| 50 | assertEquals(tokens.length, 3); |
| 51 | } |
| 52 | |
| 53 | @Test |
| 54 | @Ignore |
| 55 | public void testTokenizerHost2 () { |
| 56 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 57 | String[] tokens = tok.tokenize("Gefunden auf www.wikipedia.org"); |
| 58 | assertEquals(tokens[0], "Gefunden"); |
| 59 | assertEquals(tokens[1], "auf"); |
| 60 | assertEquals(tokens[2], "www.wikipedia.org"); |
| 61 | assertEquals(tokens.length, 3); |
| 62 | } |
| 63 | |
| 64 | @Test |
| 65 | public void testTokenizerDash () { |
| 66 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 67 | String[] tokens = tok.tokenize("Das war -- spitze"); |
| 68 | assertEquals(tokens[0], "Das"); |
| 69 | assertEquals(tokens[1], "war"); |
| 70 | assertEquals(tokens[2], "--"); |
| 71 | assertEquals(tokens[3], "spitze"); |
| 72 | assertEquals(tokens.length, 4); |
| 73 | } |
| 74 | |
| 75 | @Test |
| 76 | public void testTokenizerEmail1 () { |
| 77 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 78 | String[] tokens = tok.tokenize("Ich bin unter korap@ids-mannheim.de erreichbar."); |
| 79 | assertEquals(tokens[0], "Ich"); |
| 80 | assertEquals(tokens[1], "bin"); |
| 81 | assertEquals(tokens[2], "unter"); |
| 82 | assertEquals(tokens[3], "korap@ids-mannheim.de"); |
| 83 | assertEquals(tokens[4], "erreichbar"); |
| 84 | assertEquals(tokens[5], "."); |
| 85 | assertEquals(tokens.length, 6); |
| 86 | } |
| 87 | |
| 88 | @Test |
| 89 | public void testTokenizerEmail2 () { |
| 90 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 91 | String[] tokens = tok.tokenize("Oder unter korap[at]ids-mannheim[dot]de."); |
| 92 | assertEquals(tokens[0], "Oder"); |
| 93 | assertEquals(tokens[1], "unter"); |
| 94 | assertEquals(tokens[2], "korap[at]ids-mannheim[dot]de"); |
| 95 | assertEquals(tokens[3], "."); |
| 96 | assertEquals(tokens.length, 4); |
| 97 | } |
| 98 | |
| 99 | @Test |
| 100 | @Ignore |
| 101 | public void testTokenizerEmail3 () { |
| 102 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 103 | String[] tokens = tok.tokenize("Oder unter korap(at)ids-mannheim(dot)de."); |
| 104 | assertEquals(tokens[0], "Oder"); |
| 105 | assertEquals(tokens[1], "unter"); |
| 106 | assertEquals(tokens[2], "korap(at)ids-mannheim(dot)de"); |
| 107 | assertEquals(tokens[3], "."); |
| 108 | assertEquals(tokens.length, 4); |
| 109 | } |
| 110 | |
| 111 | @Test |
| 112 | public void testTokenizerTwitter () { |
| 113 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 114 | String[] tokens = tok.tokenize("Folgt @korap und #korap"); |
| 115 | assertEquals(tokens[0], "Folgt"); |
| 116 | assertEquals(tokens[1], "@korap"); |
| 117 | assertEquals(tokens[2], "und"); |
| 118 | assertEquals(tokens[3], "#korap"); |
| 119 | assertEquals(tokens.length, 4); |
| 120 | } |
| 121 | |
| 122 | @Test |
| 123 | public void testTokenizerWeb1 () { |
| 124 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 125 | String[] tokens = tok.tokenize("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum"); |
| 126 | assertEquals(tokens[0], "Unsere"); |
| 127 | assertEquals(tokens[1], "Website"); |
| 128 | assertEquals(tokens[2], "ist"); |
| 129 | assertEquals(tokens[3], "https://korap.ids-mannheim.de/?q=Baum"); |
| 130 | assertEquals(tokens.length, 4); |
| 131 | } |
| 132 | |
| 133 | @Test |
| 134 | @Ignore |
| 135 | public void testTokenizerWeb2 () { |
| 136 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 137 | String[] tokens = tok.tokenize("Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)"); |
| 138 | assertEquals(tokens[0], "Wir"); |
| 139 | assertEquals(tokens[1], "sind"); |
| 140 | assertEquals(tokens[2], "auch"); |
| 141 | assertEquals(tokens[3], "im"); |
| 142 | assertEquals(tokens[4], "Internet"); |
| 143 | assertEquals(tokens[5], "("); |
| 144 | assertEquals(tokens[6], "https://korap.ids-mannheim.de/?q=Baum"); |
| 145 | assertEquals(tokens[7], ")"); |
| 146 | assertEquals(tokens.length, 8); |
| 147 | } |
| 148 | |
| 149 | @Test |
| 150 | @Ignore |
| 151 | public void testTokenizerWeb3 () { |
| 152 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 153 | String[] tokens = tok.tokenize("Die Adresse ist https://korap.ids-mannheim.de/?q=Baum."); |
| 154 | assertEquals(tokens[0], "Die"); |
| 155 | assertEquals(tokens[1], "Adresse"); |
| 156 | assertEquals(tokens[2], "ist"); |
| 157 | assertEquals(tokens[3], "https://korap.ids-mannheim.de/?q=Baum"); |
| 158 | assertEquals(tokens[4], "."); |
| 159 | assertEquals(tokens.length, 8); |
| 160 | } |
| 161 | |
| 162 | @Test |
| 163 | public void testTokenizerServer () { |
| 164 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 165 | String[] tokens = tok.tokenize("Unser Server ist 10.0.10.51."); |
| 166 | assertEquals(tokens[0], "Unser"); |
| 167 | assertEquals(tokens[1], "Server"); |
| 168 | assertEquals(tokens[2], "ist"); |
| 169 | assertEquals(tokens[3], "10.0.10.51"); |
| 170 | assertEquals(tokens[4], "."); |
| 171 | assertEquals(tokens.length, 5); |
| 172 | } |
| 173 | |
| 174 | @Test |
| 175 | public void testTokenizerNum () { |
| 176 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 177 | String[] tokens = tok.tokenize("Zu 50,4% ist es sicher"); |
| 178 | assertEquals(tokens[0], "Zu"); |
| 179 | assertEquals(tokens[1], "50,4"); |
| 180 | assertEquals(tokens[2], "%"); // Arguable |
| 181 | assertEquals(tokens[3], "ist"); |
| 182 | assertEquals(tokens[4], "es"); |
| 183 | assertEquals(tokens[5], "sicher"); |
| 184 | assertEquals(tokens.length, 6); |
| 185 | } |
| 186 | |
| 187 | @Test |
| 188 | public void testTokenizerDate () { |
| 189 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 190 | String[] tokens = tok.tokenize("Der Termin ist am 5.9.2018"); |
| 191 | assertEquals(tokens[0], "Der"); |
| 192 | assertEquals(tokens[1], "Termin"); |
| 193 | assertEquals(tokens[2], "ist"); |
| 194 | assertEquals(tokens[3], "am"); |
| 195 | assertEquals(tokens[4], "5.9.2018"); |
| 196 | assertEquals(tokens.length, 5); |
| 197 | |
| 198 | tokens = tok.tokenize("Der Termin ist am 5/9/2018"); |
| 199 | assertEquals(tokens[0], "Der"); |
| 200 | assertEquals(tokens[1], "Termin"); |
| 201 | assertEquals(tokens[2], "ist"); |
| 202 | assertEquals(tokens[3], "am"); |
| 203 | assertEquals(tokens[4], "5/9/2018"); |
| 204 | assertEquals(tokens.length, 5); |
| 205 | } |
| 206 | |
| 207 | @Test |
| 208 | @Ignore |
| 209 | public void testTokenizerDateRange () { |
| 210 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 211 | String[] tokens = tok.tokenize("Der Termin war vom 4.-5.9.2018"); |
| 212 | assertEquals(tokens[0], "Der"); |
| 213 | assertEquals(tokens[1], "Termin"); |
| 214 | assertEquals(tokens[2], "war"); |
| 215 | assertEquals(tokens[3], "vom"); |
| 216 | assertEquals(tokens[4], "4."); |
| 217 | assertEquals(tokens[5], "-"); |
| 218 | assertEquals(tokens[6], "5.9.2018"); |
| 219 | assertEquals(tokens.length, 7); |
| 220 | } |
| 221 | |
| 222 | @Test |
| 223 | public void testTokenizerEmoji1 () { |
| 224 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 225 | String[] tokens = tok.tokenize("Das ist toll! ;)"); |
| 226 | assertEquals(tokens[0], "Das"); |
| 227 | assertEquals(tokens[1], "ist"); |
| 228 | assertEquals(tokens[2], "toll"); |
| 229 | assertEquals(tokens[3], "!"); |
| 230 | assertEquals(tokens[4], ";)"); |
| 231 | assertEquals(tokens.length, 5); |
| 232 | } |
| 233 | |
| 234 | @Test |
| 235 | public void testTokenizerRef1 () { |
| 236 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 237 | String[] tokens = tok.tokenize("Kupietz und Schmidt (2018): Korpuslinguistik"); |
| 238 | assertEquals(tokens[0], "Kupietz"); |
| 239 | assertEquals(tokens[1], "und"); |
| 240 | assertEquals(tokens[2], "Schmidt"); |
| 241 | assertEquals(tokens[3], "("); |
| 242 | assertEquals(tokens[4], "2018"); |
| 243 | assertEquals(tokens[5], ")"); |
| 244 | assertEquals(tokens[6], ":"); |
| 245 | assertEquals(tokens[7], "Korpuslinguistik"); |
| 246 | assertEquals(tokens.length, 8); |
| 247 | } |
| 248 | |
| 249 | @Test |
| 250 | public void testTokenizerRef2 () { |
| 251 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 252 | String[] tokens = tok.tokenize("Kupietz und Schmidt [2018]: Korpuslinguistik"); |
| 253 | assertEquals(tokens[0], "Kupietz"); |
| 254 | assertEquals(tokens[1], "und"); |
| 255 | assertEquals(tokens[2], "Schmidt"); |
| 256 | assertEquals(tokens[3], "["); |
| 257 | assertEquals(tokens[4], "2018"); |
| 258 | assertEquals(tokens[5], "]"); |
| 259 | assertEquals(tokens[6], ":"); |
| 260 | assertEquals(tokens[7], "Korpuslinguistik"); |
| 261 | assertEquals(tokens.length, 8); |
| 262 | } |
| 263 | |
| 264 | @Test |
| 265 | public void testTokenizerOmission1 () { |
| 266 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 267 | String[] tokens = tok.tokenize("Er ist ein A****loch!"); |
| 268 | assertEquals(tokens[0], "Er"); |
| 269 | assertEquals(tokens[1], "ist"); |
| 270 | assertEquals(tokens[2], "ein"); |
| 271 | assertEquals(tokens[3], "A****loch"); |
| 272 | assertEquals(tokens[4], "!"); |
| 273 | assertEquals(tokens.length, 5); |
| 274 | } |
| 275 | |
| 276 | @Test |
| 277 | public void testTokenizerOmission2 () { |
| 278 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 279 | String[] tokens = tok.tokenize("F*ck!"); |
| 280 | assertEquals(tokens[0], "F*ck"); |
| 281 | assertEquals(tokens[1], "!"); |
| 282 | assertEquals(tokens.length, 2); |
| 283 | } |
| 284 | |
| 285 | @Test |
| 286 | public void testTokenizerOmission3 () { |
| 287 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 288 | String[] tokens = tok.tokenize("Dieses verf***** Kleid!"); |
| 289 | assertEquals(tokens[0], "Dieses"); |
| 290 | assertEquals(tokens[1], "verf*****"); |
| 291 | assertEquals(tokens[2], "Kleid"); |
| 292 | assertEquals(tokens[3], "!"); |
| 293 | assertEquals(tokens.length, 4); |
| 294 | } |
| 295 | |
| 296 | @Test |
| 297 | // Probably interpreted as HOST |
| 298 | public void testTokenizerFileExtension1 () { |
| 299 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 300 | String[] tokens = tok.tokenize("Ich habe die readme.txt heruntergeladen"); |
| 301 | assertEquals(tokens[0], "Ich"); |
| 302 | assertEquals(tokens[1], "habe"); |
| 303 | assertEquals(tokens[2], "die"); |
| 304 | assertEquals(tokens[3], "readme.txt"); |
| 305 | assertEquals(tokens[4], "heruntergeladen"); |
| 306 | assertEquals(tokens.length, 5); |
| 307 | } |
| 308 | |
| 309 | @Test |
| 310 | // Probably interpreted as HOST |
| 311 | public void testTokenizerFileExtension2 () { |
| 312 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 313 | String[] tokens = tok.tokenize("Nimm die README.TXT!"); |
| 314 | assertEquals(tokens[0], "Nimm"); |
| 315 | assertEquals(tokens[1], "die"); |
| 316 | assertEquals(tokens[2], "README.TXT"); |
| 317 | assertEquals(tokens[3], "!"); |
| 318 | assertEquals(tokens.length, 4); |
| 319 | } |
| 320 | |
| 321 | @Test |
| 322 | // Probably interpreted as HOST |
| 323 | public void testTokenizerFileExtension3 () { |
| 324 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 325 | String[] tokens = tok.tokenize("Zeig mir profile.jpeg"); |
| 326 | assertEquals(tokens[0], "Zeig"); |
| 327 | assertEquals(tokens[1], "mir"); |
| 328 | assertEquals(tokens[2], "profile.jpeg"); |
| 329 | assertEquals(tokens.length, 3); |
| 330 | } |
| 331 | |
| 332 | @Test |
| 333 | public void testTokenizerFile1 () { |
| 334 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 335 | String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.docx"); |
| 336 | assertEquals(tokens[0], "Zeig"); |
| 337 | assertEquals(tokens[1], "mir"); |
| 338 | assertEquals(tokens[2], "c:\\Dokumente\\profile.docx"); |
| 339 | assertEquals(tokens.length, 3); |
| 340 | } |
| 341 | |
| 342 | @Test |
| 343 | public void testTokenizerFile2 () { |
| 344 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 345 | String[] tokens = tok.tokenize("Gehe zu /Dokumente/profile.docx"); |
| 346 | assertEquals(tokens[0], "Gehe"); |
| 347 | assertEquals(tokens[1], "zu"); |
| 348 | assertEquals(tokens[2], "/Dokumente/profile.docx"); |
| 349 | assertEquals(tokens.length, 3); |
| 350 | } |
| 351 | |
| 352 | @Test |
| 353 | @Ignore |
| 354 | public void testTokenizerFile3 () { |
| 355 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 356 | String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.jpeg"); |
| 357 | assertEquals(tokens[0], "Zeig"); |
| 358 | assertEquals(tokens[1], "mir"); |
| 359 | assertEquals(tokens[2], "c:\\Dokumente\\profile.jpeg"); |
| 360 | assertEquals(tokens.length, 3); |
| 361 | } |
| 362 | |
| 363 | @Test |
| 364 | public void testTokenizerPunct () { |
| 365 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 366 | String[] tokens = tok.tokenize("Er sagte: \"Es geht mir gut!\", daraufhin ging er."); |
| 367 | assertEquals(tokens[0], "Er"); |
| 368 | assertEquals(tokens[1], "sagte"); |
| 369 | assertEquals(tokens[2], ":"); |
| 370 | assertEquals(tokens[3], "\""); |
| 371 | assertEquals(tokens[4], "Es"); |
| 372 | assertEquals(tokens[5], "geht"); |
| 373 | assertEquals(tokens[6], "mir"); |
| 374 | assertEquals(tokens[7], "gut"); |
| 375 | assertEquals(tokens[8], "!"); |
| 376 | assertEquals(tokens[9], "\""); |
| 377 | assertEquals(tokens[10], ","); |
| 378 | assertEquals(tokens[11], "daraufhin"); |
| 379 | assertEquals(tokens[12], "ging"); |
| 380 | assertEquals(tokens[13], "er"); |
| 381 | assertEquals(tokens[14], "."); |
| 382 | assertEquals(tokens.length, 15); |
| 383 | } |
| 384 | |
| 385 | @Test |
| 386 | public void testTokenizerPlusAmpersand () { |
| 387 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 388 | String[] tokens = tok.tokenize(""Das ist von C&A!""); |
| 389 | assertEquals(tokens[0], """); |
| 390 | assertEquals(tokens[1], "Das"); |
| 391 | assertEquals(tokens[2], "ist"); |
| 392 | assertEquals(tokens[3], "von"); |
| 393 | assertEquals(tokens[4], "C&A"); |
| 394 | assertEquals(tokens[5], "!"); |
| 395 | assertEquals(tokens[6], """); |
| 396 | assertEquals(tokens.length, 7); |
| 397 | } |
| 398 | |
| 399 | @Test |
| 400 | public void testTokenizerLongEnd () { |
| 401 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 402 | String[] tokens = tok.tokenize("Siehst Du?!!?"); |
| 403 | assertEquals(tokens[0], "Siehst"); |
| 404 | assertEquals(tokens[1], "Du"); |
| 405 | assertEquals(tokens[2], "?!!?"); |
| 406 | assertEquals(tokens.length, 3); |
| 407 | } |
| 408 | |
| 409 | @Test |
| 410 | public void testTokenizerIrishO () { |
| 411 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 412 | String[] tokens = tok.tokenize("Peter O'Toole"); |
| 413 | assertEquals(tokens[0], "Peter"); |
| 414 | assertEquals(tokens[1], "O'Toole"); |
| 415 | assertEquals(tokens.length, 2); |
| 416 | } |
| 417 | |
| 418 | @Test |
| 419 | public void testTokenizerAbr () { |
| 420 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 421 | String[] tokens = tok.tokenize("Früher bzw. später ..."); |
| 422 | assertEquals(tokens[0], "Früher"); |
| 423 | assertEquals(tokens[1], "bzw."); |
| 424 | assertEquals(tokens[2], "später"); |
| 425 | assertEquals(tokens[3], "..."); |
| 426 | assertEquals(tokens.length, 4); |
| 427 | } |
| 428 | |
| 429 | @Test |
| 430 | @Ignore |
| 431 | public void testTokenizerUppercaseRule () { |
| 432 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 433 | String[] tokens = tok.tokenize("Es war spät.Morgen ist es früh."); |
| 434 | assertEquals(tokens[0], "Es"); |
| 435 | assertEquals(tokens[1], "war"); |
| 436 | assertEquals(tokens[2], "spät"); |
| 437 | assertEquals(tokens[3], "."); |
| 438 | assertEquals(tokens[4], "Morgen"); |
| 439 | assertEquals(tokens[5], "ist"); |
| 440 | assertEquals(tokens[6], "es"); |
| 441 | assertEquals(tokens[7], "früh"); |
| 442 | assertEquals(tokens[8], "."); |
| 443 | assertEquals(tokens.length, 9); |
| 444 | } |
| 445 | |
| 446 | @Test |
| 447 | public void testTokenizerOrd () { |
| 448 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 449 | String[] tokens = tok.tokenize("Sie erreichte den 1. Platz!"); |
| 450 | assertEquals(tokens[0], "Sie"); |
| 451 | assertEquals(tokens[1], "erreichte"); |
| 452 | assertEquals(tokens[2], "den"); |
| 453 | assertEquals(tokens[3], "1."); |
| 454 | assertEquals(tokens[4], "Platz"); |
| 455 | assertEquals(tokens[5], "!"); |
| 456 | assertEquals(tokens.length, 6); |
| 457 | } |
| 458 | |
| 459 | @Test |
| 460 | public void testNoZipOuputArchive () { |
| 461 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 462 | String[] tokens = tok.tokenize("Archive: Ich bin kein zip\n"); |
| 463 | assertEquals(tokens[0], "Archive"); |
| 464 | assertEquals(tokens[1], ":"); |
| 465 | assertEquals(tokens[2], "Ich"); |
| 466 | assertEquals(tokens[3], "bin"); |
| 467 | assertEquals(tokens[4], "kein"); |
| 468 | assertEquals(tokens[5], "zip"); |
| 469 | assertEquals(6, tokens.length); |
| 470 | } |
| 471 | |
| 472 | @Test |
| 473 | public void testZipOuputArchive () { |
| 474 | KorAPTokenizerImpl tok = new KorAPTokenizerImpl(); |
| 475 | String[] tokens = tok.tokenize("Archive: ich/bin/ein.zip\n"); |
| 476 | assertEquals(0, tokens.length); |
| 477 | } |
| 478 | } |