Initial import from private/Ingestion

Change-Id: I96f428c440ef301384de6d7ef2e348df7d729816
diff --git a/src/test/java/TokenizerCoverTest.java b/src/test/java/TokenizerCoverTest.java
new file mode 100644
index 0000000..2d717de
--- /dev/null
+++ b/src/test/java/TokenizerCoverTest.java
@@ -0,0 +1,245 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import static org.junit.Assert.*;
+import java.util.*;
+import java.io.*;
+import java.net.URLDecoder;
+import org.junit.Test;
+import org.junit.Ignore;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class TokenizerCoverTest {
+
+    /**
+     * This test suite checks for the tokenization coverage of our
+     * tokenizer implementation based on the EmpiriST 2015
+     * Gold Standard Suite, published under Creative Commons license
+     * BY-SA 3.0.
+     *
+     * Michael Beißwenger, Sabine Bartsch, Stefan Evert and
+     * Kay-Michael Würzner (2016). EmpiriST 2015: A shared task
+     * on the automatic linguistic annotation of computer-mediated
+     * communication and web corpora. In Proceedings of the 10th
+     * Web as Corpus Workshop (WAC-X) and the EmpiriST Shared Task,
+     * pages 78–90. Berlin, Germany.
+     *
+     * https://sites.google.com/site/empirist2015/home/gold
+     */
+
+    /**
+     * Read a classpath resource as UTF-8 text with every line ending
+     * normalized to "\n"; fails the test if it is missing or unreadable.
+     */
+    private String getFile (String file) {
+        // Stream the resource rather than URL-decoding its path by hand:
+        // URLDecoder maps '+' to ' ' and cannot open entries inside a JAR.
+        InputStream resource = getClass().getResourceAsStream(file);
+        assertNotNull("Resource not found: " + file, resource);
+
+        StringBuilder content = new StringBuilder();
+        try (BufferedReader in = new BufferedReader(  // closed on every path
+                 new InputStreamReader(resource, "UTF-8"))) {
+            String str;
+            while ((str = in.readLine()) != null) {
+                content.append(str).append('\n');
+            }
+        }
+        catch (IOException e) {
+            fail(e.getMessage());
+        }
+        return content.toString();
+    }
+
+
+    /**
+     * Iterate over the texts between the posting/article tags of an EmpiriST file.
+     */
+    private class EmpiristScanner implements Iterator<String> {
+        private final Scanner sc;
+
+        public EmpiristScanner (String file) {
+            sc = new Scanner(getFile(file));
+            sc.useDelimiter("<(?:posting|article)[^>]+?/>");
+        }
+
+        @Override  // Return the next posting/article, trimmed
+        public String next () {
+            return sc.next().trim();
+        }
+
+        @Override  // Check if another posting/article exists
+        public boolean hasNext () {
+            return sc.hasNext();
+        }
+    }
+
+    /**
+     * Token-level edit distance between two tokenizations.
+     * Counts the minimal number of token insertions, deletions and
+     * replacements that turn one list into the other. Merges and splits
+     * get no special treatment (a merge is a replacement plus a deletion,
+     * a split is a replacement plus an insertion), so the value is only a
+     * rough measure of disagreement. Note that EmpiriST itself compares
+     * the end boundaries of all tokens instead.
+     *
+     * @throws IllegalArgumentException if either array is null
+     */
+    public static int levenshteinForStringArrays (String[] s, String[] t) {
+      if (s == null || t == null) {
+          throw new IllegalArgumentException("Lists must not be null");
+      }
+
+      // Single-row dynamic programming, based on Rosettacode.org
+      int[] row = new int[t.length + 1];
+      for (int col = 0; col <= t.length; col++) {
+          row[col] = col;
+      }
+
+      for (int i = 0; i < s.length; i++) {
+          int diagonal = row[0];   // cell above-left of the current one
+          row[0] = i + 1;
+          for (int j = 0; j < t.length; j++) {
+              int above = row[j + 1];
+              int replace = s[i].equals(t[j]) ? diagonal : diagonal + 1;
+              int indel = 1 + Math.min(above, row[j]);
+              row[j + 1] = Math.min(indel, replace);
+              diagonal = above;
+          }
+      }
+
+      return row[t.length];
+    }
+
+    /**
+     * Tokenize every raw posting of one example file and compare the
+     * result with the matching gold standard posting.
+     *
+     * Postings are paired by position; iteration stops as soon as either
+     * the raw or the tokenized file runs out of postings.
+     *
+     * @param tok the tokenizer under test
+     * @param suite the gold standard suite directory, e.g. "test_cmc"
+     * @param postings the example file name without the ".txt" extension
+     * @return the sum of the levenshtein distances of all posting pairs
+     */
+    public int distanceToGoldStandard (KorAPTokenizerImpl tok, String suite, String postings) {
+        // Both variants of a file share everything but the directory
+        String suiteDir = "/empirist_gold_standard/" + suite;
+        String fileName = postings + ".txt";
+
+        // Raw postings are the tokenizer input ...
+        EmpiristScanner rawPostings =
+            new EmpiristScanner(suiteDir + "/raw/" + fileName);
+
+        // ... tokenized postings are the expected output
+        EmpiristScanner goldPostings =
+            new EmpiristScanner(suiteDir + "/tokenized/" + fileName);
+
+        int total = 0;
+
+        // Walk through both files in lockstep
+        while (rawPostings.hasNext() && goldPostings.hasNext()) {
+
+            // Split the gold standard posting on newlines into its tokens
+            String[] expected = goldPostings.next().split("\n+");
+
+            // Run the tokenizer on the raw text
+            String[] actual = tok.tokenize(rawPostings.next());
+
+            // Add up the edit distance of both token lists
+            total += levenshteinForStringArrays(expected, actual);
+        }
+
+        return total;
+    }
+    
+
+    @Test
+    public void testTokenizerCoverEmpiristCmc () {
+        // Each bound is the maximum tolerated edit distance to the gold standard
+        final String suite = "test_cmc";
+        KorAPTokenizerImpl tokenizer = new KorAPTokenizerImpl();
+
+        String name = "cmc_test_blog_comment";
+        int distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance == 0);
+
+        name = "cmc_test_professional_chat";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 20);
+
+        name = "cmc_test_social_chat";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 23);
+
+        name = "cmc_test_twitter";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 153);
+
+        name = "cmc_test_whatsapp";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 0);
+
+        name = "cmc_test_wiki_discussion";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 24);
+
+    }
+
+    @Test
+    public void testTokenizerCoverEmpiristWeb () {
+        // Each bound is the maximum tolerated edit distance to the gold standard
+        final String suite = "test_web";
+        KorAPTokenizerImpl tokenizer = new KorAPTokenizerImpl();
+
+        String name = "web_test_001";
+        int distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 21);
+
+        name = "web_test_002";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 5);
+
+        name = "web_test_003";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 17);
+
+        name = "web_test_004";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 20);
+
+        name = "web_test_005";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 8);
+
+        name = "web_test_006";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 8);
+
+        name = "web_test_007";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 12);
+
+        name = "web_test_008";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 2);
+
+        name = "web_test_009";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 8);
+
+        name = "web_test_010";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance == 0);
+
+        name = "web_test_011";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 28);
+
+        name = "web_test_012";
+        distance = distanceToGoldStandard(tokenizer, suite, name);
+        assertTrue(name + " = " + distance, distance <= 7);
+    }
+}
diff --git a/src/test/java/TokenizerTest.java b/src/test/java/TokenizerTest.java
new file mode 100644
index 0000000..28cae6f
--- /dev/null
+++ b/src/test/java/TokenizerTest.java
@@ -0,0 +1,478 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import static org.junit.Assert.*;
+import org.junit.Test;
+import org.junit.Ignore;
+import org.junit.runner.RunWith;
+import org.junit.runners.JUnit4;
+
+@RunWith(JUnit4.class)
+public class TokenizerTest {
+    // All tests below pin exact token sequences; assertEquals takes (expected, actual).
+    @Test
+    public void testTokenizerSimple () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Der alte Mann");
+        assertEquals("Der", tokens[0]);
+        assertEquals("alte", tokens[1]);
+        assertEquals("Mann", tokens[2]);
+        assertEquals(3, tokens.length);
+
+        tokens = tok.tokenize("Der alte Mann.");
+        assertEquals("Der", tokens[0]);
+        assertEquals("alte", tokens[1]);
+        assertEquals("Mann", tokens[2]);
+        assertEquals(".", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
+    @Test
+    @Ignore
+    public void testTokenizerAbbr () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Der Vorsitzende der F.D.P. hat gewählt");
+        assertEquals("Der", tokens[0]);
+        assertEquals("Vorsitzende", tokens[1]);
+        assertEquals("der", tokens[2]);
+        assertEquals("F.D.P.", tokens[3]);
+        assertEquals("hat", tokens[4]);
+        assertEquals("gewählt", tokens[5]);
+        assertEquals(6, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerHost1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Gefunden auf wikipedia.org");
+        assertEquals("Gefunden", tokens[0]);
+        assertEquals("auf", tokens[1]);
+        assertEquals("wikipedia.org", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
+
+    @Test
+    @Ignore
+    public void testTokenizerHost2 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Gefunden auf www.wikipedia.org");
+        assertEquals("Gefunden", tokens[0]);
+        assertEquals("auf", tokens[1]);
+        assertEquals("www.wikipedia.org", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerDash () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Das war -- spitze");
+        assertEquals("Das", tokens[0]);
+        assertEquals("war", tokens[1]);
+        assertEquals("--", tokens[2]);
+        assertEquals("spitze", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerEmail1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Ich bin unter korap@ids-mannheim.de erreichbar.");
+        assertEquals("Ich", tokens[0]);
+        assertEquals("bin", tokens[1]);
+        assertEquals("unter", tokens[2]);
+        assertEquals("korap@ids-mannheim.de", tokens[3]);
+        assertEquals("erreichbar", tokens[4]);
+        assertEquals(".", tokens[5]);
+        assertEquals(6, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerEmail2 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Oder unter korap[at]ids-mannheim[dot]de.");
+        assertEquals("Oder", tokens[0]);
+        assertEquals("unter", tokens[1]);
+        assertEquals("korap[at]ids-mannheim[dot]de", tokens[2]);
+        assertEquals(".", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
+    @Test
+    @Ignore
+    public void testTokenizerEmail3 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Oder unter korap(at)ids-mannheim(dot)de.");
+        assertEquals("Oder", tokens[0]);
+        assertEquals("unter", tokens[1]);
+        assertEquals("korap(at)ids-mannheim(dot)de", tokens[2]);
+        assertEquals(".", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerTwitter () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Folgt @korap und #korap");
+        assertEquals("Folgt", tokens[0]);
+        assertEquals("@korap", tokens[1]);
+        assertEquals("und", tokens[2]);
+        assertEquals("#korap", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerWeb1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum");
+        assertEquals("Unsere", tokens[0]);
+        assertEquals("Website", tokens[1]);
+        assertEquals("ist", tokens[2]);
+        assertEquals("https://korap.ids-mannheim.de/?q=Baum", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
+    @Test
+    @Ignore
+    public void testTokenizerWeb2 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)");
+        assertEquals("Wir", tokens[0]);
+        assertEquals("sind", tokens[1]);
+        assertEquals("auch", tokens[2]);
+        assertEquals("im", tokens[3]);
+        assertEquals("Internet", tokens[4]);
+        assertEquals("(", tokens[5]);
+        assertEquals("https://korap.ids-mannheim.de/?q=Baum", tokens[6]);
+        assertEquals(")", tokens[7]);
+        assertEquals(8, tokens.length);
+    }
+
+    @Test
+    @Ignore
+    public void testTokenizerWeb3 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.");
+        assertEquals("Die", tokens[0]);
+        assertEquals("Adresse", tokens[1]);
+        assertEquals("ist", tokens[2]);
+        assertEquals("https://korap.ids-mannheim.de/?q=Baum", tokens[3]);
+        assertEquals(".", tokens[4]);
+        assertEquals(5, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerServer () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Unser Server ist 10.0.10.51.");
+        assertEquals("Unser", tokens[0]);
+        assertEquals("Server", tokens[1]);
+        assertEquals("ist", tokens[2]);
+        assertEquals("10.0.10.51", tokens[3]);
+        assertEquals(".", tokens[4]);
+        assertEquals(5, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerNum () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Zu 50,4% ist es sicher");
+        assertEquals("Zu", tokens[0]);
+        assertEquals("50,4", tokens[1]);
+        assertEquals("%", tokens[2]);  // Arguable
+        assertEquals("ist", tokens[3]);
+        assertEquals("es", tokens[4]);
+        assertEquals("sicher", tokens[5]);
+        assertEquals(6, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerDate () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Der Termin ist am 5.9.2018");
+        assertEquals("Der", tokens[0]);
+        assertEquals("Termin", tokens[1]);
+        assertEquals("ist", tokens[2]);
+        assertEquals("am", tokens[3]);
+        assertEquals("5.9.2018", tokens[4]);
+        assertEquals(5, tokens.length);
+
+        tokens = tok.tokenize("Der Termin ist am 5/9/2018");
+        assertEquals("Der", tokens[0]);
+        assertEquals("Termin", tokens[1]);
+        assertEquals("ist", tokens[2]);
+        assertEquals("am", tokens[3]);
+        assertEquals("5/9/2018", tokens[4]);
+        assertEquals(5, tokens.length);
+    }
+
+    @Test
+    @Ignore
+    public void testTokenizerDateRange () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Der Termin war vom 4.-5.9.2018");
+        assertEquals("Der", tokens[0]);
+        assertEquals("Termin", tokens[1]);
+        assertEquals("war", tokens[2]);
+        assertEquals("vom", tokens[3]);
+        assertEquals("4.", tokens[4]);
+        assertEquals("-", tokens[5]);
+        assertEquals("5.9.2018", tokens[6]);
+        assertEquals(7, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerEmoji1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Das ist toll! ;)");
+        assertEquals("Das", tokens[0]);
+        assertEquals("ist", tokens[1]);
+        assertEquals("toll", tokens[2]);
+        assertEquals("!", tokens[3]);
+        assertEquals(";)", tokens[4]);
+        assertEquals(5, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerRef1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Kupietz und Schmidt (2018): Korpuslinguistik");
+        assertEquals("Kupietz", tokens[0]);
+        assertEquals("und", tokens[1]);
+        assertEquals("Schmidt", tokens[2]);
+        assertEquals("(", tokens[3]);
+        assertEquals("2018", tokens[4]);
+        assertEquals(")", tokens[5]);
+        assertEquals(":", tokens[6]);
+        assertEquals("Korpuslinguistik", tokens[7]);
+        assertEquals(8, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerRef2 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Kupietz und Schmidt [2018]: Korpuslinguistik");
+        assertEquals("Kupietz", tokens[0]);
+        assertEquals("und", tokens[1]);
+        assertEquals("Schmidt", tokens[2]);
+        assertEquals("[", tokens[3]);
+        assertEquals("2018", tokens[4]);
+        assertEquals("]", tokens[5]);
+        assertEquals(":", tokens[6]);
+        assertEquals("Korpuslinguistik", tokens[7]);
+        assertEquals(8, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerOmission1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Er ist ein A****loch!");
+        assertEquals("Er", tokens[0]);
+        assertEquals("ist", tokens[1]);
+        assertEquals("ein", tokens[2]);
+        assertEquals("A****loch", tokens[3]);
+        assertEquals("!", tokens[4]);
+        assertEquals(5, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerOmission2 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("F*ck!");
+        assertEquals("F*ck", tokens[0]);
+        assertEquals("!", tokens[1]);
+        assertEquals(2, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerOmission3 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Dieses verf***** Kleid!");
+        assertEquals("Dieses", tokens[0]);
+        assertEquals("verf*****", tokens[1]);
+        assertEquals("Kleid", tokens[2]);
+        assertEquals("!", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
+    @Test
+    // Probably interpreted as HOST
+    public void testTokenizerFileExtension1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Ich habe die readme.txt heruntergeladen");
+        assertEquals("Ich", tokens[0]);
+        assertEquals("habe", tokens[1]);
+        assertEquals("die", tokens[2]);
+        assertEquals("readme.txt", tokens[3]);
+        assertEquals("heruntergeladen", tokens[4]);
+        assertEquals(5, tokens.length);
+    }
+
+    @Test
+    // Probably interpreted as HOST
+    public void testTokenizerFileExtension2 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Nimm die README.TXT!");
+        assertEquals("Nimm", tokens[0]);
+        assertEquals("die", tokens[1]);
+        assertEquals("README.TXT", tokens[2]);
+        assertEquals("!", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
+    @Test
+    // Probably interpreted as HOST
+    public void testTokenizerFileExtension3 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Zeig mir profile.jpeg");
+        assertEquals("Zeig", tokens[0]);
+        assertEquals("mir", tokens[1]);
+        assertEquals("profile.jpeg", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerFile1 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.docx");
+        assertEquals("Zeig", tokens[0]);
+        assertEquals("mir", tokens[1]);
+        assertEquals("c:\\Dokumente\\profile.docx", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerFile2 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Gehe zu /Dokumente/profile.docx");
+        assertEquals("Gehe", tokens[0]);
+        assertEquals("zu", tokens[1]);
+        assertEquals("/Dokumente/profile.docx", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
+
+    @Test
+    @Ignore
+    public void testTokenizerFile3 () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.jpeg");
+        assertEquals("Zeig", tokens[0]);
+        assertEquals("mir", tokens[1]);
+        assertEquals("c:\\Dokumente\\profile.jpeg", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerPunct () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Er sagte: \"Es geht mir gut!\", daraufhin ging er.");
+        assertEquals("Er", tokens[0]);
+        assertEquals("sagte", tokens[1]);
+        assertEquals(":", tokens[2]);
+        assertEquals("\"", tokens[3]);
+        assertEquals("Es", tokens[4]);
+        assertEquals("geht", tokens[5]);
+        assertEquals("mir", tokens[6]);
+        assertEquals("gut", tokens[7]);
+        assertEquals("!", tokens[8]);
+        assertEquals("\"", tokens[9]);
+        assertEquals(",", tokens[10]);
+        assertEquals("daraufhin", tokens[11]);
+        assertEquals("ging", tokens[12]);
+        assertEquals("er", tokens[13]);
+        assertEquals(".", tokens[14]);
+        assertEquals(15, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerPlusAmpersand () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("&quot;Das ist von C&A!&quot;");
+        assertEquals("&quot;", tokens[0]);
+        assertEquals("Das", tokens[1]);
+        assertEquals("ist", tokens[2]);
+        assertEquals("von", tokens[3]);
+        assertEquals("C&A", tokens[4]);
+        assertEquals("!", tokens[5]);
+        assertEquals("&quot;", tokens[6]);
+        assertEquals(7, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerLongEnd () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Siehst Du?!!?");
+        assertEquals("Siehst", tokens[0]);
+        assertEquals("Du", tokens[1]);
+        assertEquals("?!!?", tokens[2]);
+        assertEquals(3, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerIrishO () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Peter O'Toole");
+        assertEquals("Peter", tokens[0]);
+        assertEquals("O'Toole", tokens[1]);
+        assertEquals(2, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerAbr () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Früher bzw. später ...");
+        assertEquals("Früher", tokens[0]);
+        assertEquals("bzw.", tokens[1]);
+        assertEquals("später", tokens[2]);
+        assertEquals("...", tokens[3]);
+        assertEquals(4, tokens.length);
+    }
+
+    @Test
+    @Ignore
+    public void testTokenizerUppercaseRule () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Es war spät.Morgen ist es früh.");
+        assertEquals("Es", tokens[0]);
+        assertEquals("war", tokens[1]);
+        assertEquals("spät", tokens[2]);
+        assertEquals(".", tokens[3]);
+        assertEquals("Morgen", tokens[4]);
+        assertEquals("ist", tokens[5]);
+        assertEquals("es", tokens[6]);
+        assertEquals("früh", tokens[7]);
+        assertEquals(".", tokens[8]);
+        assertEquals(9, tokens.length);
+    }
+
+    @Test
+    public void testTokenizerOrd () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Sie erreichte den 1. Platz!");
+        assertEquals("Sie", tokens[0]);
+        assertEquals("erreichte", tokens[1]);
+        assertEquals("den", tokens[2]);
+        assertEquals("1.", tokens[3]);
+        assertEquals("Platz", tokens[4]);
+        assertEquals("!", tokens[5]);
+        assertEquals(6, tokens.length);
+    }
+
+    @Test
+    public void testNoZipOuputArchive () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Archive:  Ich bin kein zip\n");
+        assertEquals("Archive", tokens[0]);
+        assertEquals(":", tokens[1]);
+        assertEquals("Ich", tokens[2]);
+        assertEquals("bin", tokens[3]);
+        assertEquals("kein", tokens[4]);
+        assertEquals("zip", tokens[5]);
+        assertEquals(6, tokens.length);
+    }
+
+    @Test
+    public void testZipOuputArchive () {
+        KorAPTokenizerImpl tok = new KorAPTokenizerImpl();
+        String[] tokens = tok.tokenize("Archive:  ich/bin/ein.zip\n");
+        assertEquals(0, tokens.length);
+    }
+}