Make tokenizer implementation exchangeable

Allows for easy adaptation and integration of other language
specializations and other tokenizers, also from the command line via
the new -T option that allows specifying the tokenizer class name.

TODO: add documentation to the KorapTokenizer interface

Change-Id: Ib95793f47887a3b0829d68ebdbf8bc40815a0605
diff --git a/.gitignore b/.gitignore
index b109df0..b5ddfd2 100644
--- a/.gitignore
+++ b/.gitignore
@@ -1,7 +1,7 @@
 .*
 !/.gitignore
 target
-!/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
+!/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.java
 tmp
 logs
 cache_store
diff --git a/Readme.md b/Readme.md
index 285866b..31a1a63 100644
--- a/Readme.md
+++ b/Readme.md
@@ -1,13 +1,18 @@
 # KorAP Tokenizer
-Efficient, [OpenNLP tools](https://opennlp.apache.org) compatible DFA tokenizer and sentence splitter with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
+Interface and implementation of a tokenizer and sentence splitter that can be used
 
-## Description
-The KorAP tokenizer is used for the German Reference Corpus DeReKo. Being based on a finite state automaton, 
+* as standalone tokenizer and/or sentence splitter
+* within the KorAP ingestion pipeline
+* within the [OpenNLP tools](https://opennlp.apache.org) framework
+
+## DeReKo Tokenizer (included default implementation)
+The included default implementation (`DerekoDfaTokenizer`) is a highly efficient DFA tokenizer and sentence splitter with character offset output based on [JFlex](https://www.jflex.de/), suitable for German and other European languages.
+It is used for the German Reference Corpus DeReKo. Being based on a finite state automaton, 
 it is not accurate as language model based tokenizers, but with ~5 billion words per hour typically more efficient.
 An important feature in the DeReKo/KorAP context is also, that it reliably reports the character offsets of the tokens 
 so that this information can be used for applying standoff annotations.
  
-The main class `KorAPTokenizerImpl` implements the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
+`DerekoDfaTokenizer` and any implementation of the `KorapTokenizer` interface also implement the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
 and [`opennlp.tools.sentdetect.SentenceDetector`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/sentdetect/SentenceDetector.html)
 interfaces and can thus be used as a drop-in replacement in OpenNLP applications.
 
@@ -16,6 +21,10 @@
 Our changes mainly concern a good coverage of German abbreviations, 
 and some updates for handling computer mediated communication, optimized and tested against the gold data from the [EmpiriST 2015](https://sites.google.com/site/empirist2015/) shared task (Beißwenger et al. 2016).
 
+### Adaptations for other Languages
+To adapt the included implementation to another language, you will probably want to start by replacing the abbreviations
+pattern defined in `SEABBR` in the jflex source.
+
 ## Installation
 ```shell script
 $ MAVEN_OPTS="-Xss50m" mvn clean install
diff --git a/pom.xml b/pom.xml
index 9aca3ed..f133064 100644
--- a/pom.xml
+++ b/pom.xml
@@ -117,7 +117,7 @@
                         <manifest>
                             <addClasspath>true</addClasspath>
                             <classpathPrefix>lib/</classpathPrefix>
-                            <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
+                            <mainClass>de.ids_mannheim.korap.tokenizer.Main</mainClass>
                         </manifest>
                     </archive>
                 </configuration>
@@ -148,7 +148,7 @@
                     </descriptors>
                     <archive>
                         <manifest>
-                            <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
+                            <mainClass>de.ids_mannheim.korap.tokenizer.Main</mainClass>
                         </manifest>
                     </archive>
                 </configuration>
@@ -199,7 +199,7 @@
                     </binFileExtensions>
                     <programs>
                         <program>
-                            <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
+                            <mainClass>de.ids_mannheim.korap.tokenizer.Main</mainClass>
                             <id>koraptokenizer</id>
                         </program>
                     </programs>
@@ -244,5 +244,11 @@
             <artifactId>appassembler-maven-plugin</artifactId>
             <version>2.1.0</version>
         </dependency>
+        <dependency>
+            <groupId>org.jetbrains</groupId>
+            <artifactId>annotations</artifactId>
+            <version>RELEASE</version>
+            <scope>compile</scope>
+        </dependency>
     </dependencies>
 </project>
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/KorapTokenizer.java b/src/main/java/de/ids_mannheim/korap/tokenizer/KorapTokenizer.java
new file mode 100644
index 0000000..cb8f280
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/KorapTokenizer.java
@@ -0,0 +1,103 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import opennlp.tools.util.Span;
+
+import java.io.IOException;
+import java.io.PrintStream;
+import java.io.Reader;
+
+public interface KorapTokenizer extends opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector {
+    void scan() throws IOException;
+
+    /**
+     * Mainly targeted language(s)
+     * @return list of ISO 639 alpha-2 or alpha-3 language codes
+     * @apiNote will later be used to find appropriate implementations via reflection
+     */
+    CharSequence[] getTargetLanguages();
+    
+    void setInputReader(Reader inputReader);
+
+    void setSplitSentences(boolean splitSentences);
+
+    void setEcho(boolean echo);
+
+    void setPrintOffsets(boolean printOffsets);
+
+    void setPrintTokens(boolean tokenize);
+
+    void setOutputStream(PrintStream outputStream);
+
+    void setNormalize(boolean normalize);
+
+    String[] tokenize(String s);
+
+    Span[] tokenizePos(String s);
+
+    String[] sentDetect(String s);
+
+    Span[] sentPosDetect(String s);
+
+    class Builder {
+        private boolean splitSentences;
+        private boolean echo;
+        private boolean printOffsets;
+        private boolean printTokens;
+        private PrintStream outputStream = System.out;
+        private boolean normalize;
+        private Class tokenizerClass;
+        private Reader inputReader;
+
+        public Builder tokenizerClassName(String tokenizerClassName) throws ClassNotFoundException {
+            this.tokenizerClass = Class.forName(tokenizerClassName);
+            return this;
+        }
+
+        public Builder splitSentences(boolean splitSentences) {
+            this.splitSentences = splitSentences;
+            return this;
+        }
+
+        public Builder setEcho(boolean echo) {
+            this.echo = echo;
+            return this;
+        }
+
+        public Builder printOffsets(boolean printOffsets) {
+            this.printOffsets = printOffsets;
+            return this;
+        }
+
+        public Builder printTokens(boolean printTokens) {
+            this.printTokens = printTokens;
+            return this;
+        }
+
+        public Builder inputReader(Reader inputReader) {
+            this.inputReader = inputReader;
+            return this;
+        }
+
+        public Builder normalize(boolean normalize) {
+            this.normalize = normalize;
+            return this;
+        }
+
+        public Builder outputStream(PrintStream outputStream) {
+            this.outputStream = outputStream;
+            return this;
+        }
+
+        public KorapTokenizer build() throws IllegalAccessException, InstantiationException {
+            KorapTokenizer korapTokenizer = (KorapTokenizer) tokenizerClass.newInstance();
+            korapTokenizer.setEcho(echo);
+            korapTokenizer.setInputReader(inputReader);
+            korapTokenizer.setOutputStream(outputStream);
+            korapTokenizer.setNormalize(normalize);
+            korapTokenizer.setPrintOffsets(printOffsets);
+            korapTokenizer.setSplitSentences(splitSentences);
+            korapTokenizer.setPrintTokens(printTokens);
+            return korapTokenizer;
+        }
+    }
+}
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
similarity index 78%
rename from src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java
rename to src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
index c754917..bf3c2ff 100644
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
@@ -8,7 +8,10 @@
 
 @CommandLine.Command(mixinStandardHelpOptions = true,
         name = "koraptokenizer", version = "{}", description = "Tokenizes (and sentence splits) text input.")
-public class KorAPTokenizer implements Callable<Integer> {
+public class Main implements Callable<Integer> {
+
+    @CommandLine.Option(names = {"-T",  "--tokenizer-class"}, description = "Class name of the actual tokenizer that will be used (default: ${DEFAULT-VALUE})")
+    String tokenizerClassName = DerekoDfaTokenizer.class.getName();
 
     @CommandLine.Option(names = {"--no-tokens"}, negatable = true, description = "Print tokens (default: ${DEFAULT-VALUE})")
     boolean tokens = true;
@@ -38,12 +41,12 @@
     @CommandLine.Parameters(arity = "0..*", paramLabel = "FILES", description = "input files")
     private final ArrayList<String> inputFiles = new ArrayList<>();
 
-    public KorAPTokenizer() {
+    public Main() {
 
     }
 
     public static void main(String[] args) {
-        new CommandLine(new KorAPTokenizer()).execute(args);
+        new CommandLine(new Main()).execute(args);
     }
 
     @Override
@@ -57,13 +60,21 @@
         }
 
         for (int i = 0; i < inputFiles.size() || (i == 0 && inputFiles.size() == 0); i++) {
-            KorAPDFATokenizer scanner = null;
             String fn = (inputFiles.size() > 0 ? inputFiles.get(i) : "-");
             try {
                 BufferedReader br = "-".equals(fn) ? new BufferedReader(new InputStreamReader(System.in)) :
                         new BufferedReader(new FileReader(fn));
-                scanner = new KorAPDFATokenizer(br, output_stream, true, tokens, sentencize, positions,  ktt, normalize);
-                scanner.scanThrough();
+                new KorapTokenizer.Builder()
+                        .tokenizerClassName(tokenizerClassName)
+                        .inputReader(br)
+                        .outputStream(output_stream)
+                        .printTokens(tokens)
+                        .printOffsets(positions)
+                        .normalize(normalize)
+                        .splitSentences(sentencize)
+                        .setEcho(true)
+                        .build()
+                        .scan();
             } catch (FileNotFoundException e) {
                 System.err.println("File not found : \"" + fn + "\"");
             } catch (IOException e) {
diff --git a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.jflex b/src/main/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
similarity index 97%
rename from src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.jflex
rename to src/main/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index a79d44b..6c3db85 100644
--- a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.jflex
+++ b/src/main/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -61,15 +61,16 @@
 * ... which is ...
 * Based on Lucene's StandardTokenizerImpl, but heavily modified.
 */
-%class KorAPDFATokenizer
+%class DerekoDfaTokenizer
 %unicode
 %public
-%implements opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector
+%implements KorapTokenizer, opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector
 %type Span
 %function getNextToken
 %char
 
 %{
+   private static final CharSequence[] targetLanguages = {"de"};
     private boolean xmlEcho = false;
     private boolean normalize = false;
     private boolean debug = false;
@@ -78,36 +79,65 @@
     private long previousFileEndOffset = -1;
     private int tokenId = 0;
     private boolean atEOT = false;
-    private boolean sentencize = false;
+    private boolean splitSentences = false;
     private boolean echo = false;
-    private boolean positions = false;
-    private boolean tokens = false;
+    private boolean printOffsets = false;
+    private boolean printTokens = false;
     private PrintStream outputStream = System.out;
 
-    public KorAPDFATokenizer() {
+    @Override
+    public CharSequence[] getTargetLanguages() {
+        return targetLanguages;
+    }
+
+    public DerekoDfaTokenizer() {
         this.zzReader = null;
     }
 
-    public KorAPDFATokenizer(java.io.Reader in, PrintStream outputStream, boolean echo, boolean tokens, boolean sentencize, boolean positions, boolean xmlEcho, boolean normalize) {
-        this.zzReader = in;
-        if (outputStream != null)
-            this.outputStream = outputStream;
-        this.tokens = tokens;
-        this.sentencize = sentencize;
-        this.positions = positions;
+    @Override
+    public void setInputReader(Reader inputReader) {
+        this.zzReader = inputReader;
+    }
+
+    @Override
+    public void setSplitSentences(boolean splitSentences) {
+        this.splitSentences = splitSentences;
+    }
+
+    @Override
+    public void setEcho(boolean echo) {
         this.echo = echo;
-        this.xmlEcho = xmlEcho;
+    }
+
+    @Override
+    public void setPrintOffsets(boolean printOffsets) {
+        this.printOffsets = printOffsets;
+    }
+
+    @Override
+    public void setPrintTokens(boolean printTokens) {
+        this.printTokens = printTokens;
+    }
+
+    @Override
+    public void setOutputStream(PrintStream outputStream) {
+        this.outputStream = outputStream;
+    }
+
+    @Override
+    public void setNormalize(boolean normalize) {
         this.normalize = normalize;
     }
 
-    public void scanThrough() throws IOException {
+    @Override
+    public void scan() throws IOException {
         List<Span> list = new ArrayList<Span>();
         Span token;
         while (!zzAtEOF) {
             token = this.getNextToken();
             if (atEOT) {
                 if (echo) {
-                    printTokenPositions(list, sentencize);
+                    printTokenPositions(list, splitSentences);
                     list.clear();
                 }
                 atEOT = false;
@@ -118,6 +148,7 @@
         }
     }
 
+    @Override
     public String[] tokenize(String s) {
         Span[] spans;
         int i;
@@ -131,7 +162,7 @@
         return tokens;
     }
 
-    public void printTokenPositions(List<Span> spanList, boolean sentencize) {
+    void printTokenPositions(List<Span> spanList, boolean sentencize) {
         int sentenceStart = -1;
         StringBuilder tokenStringBuffer = new StringBuilder();
         StringBuilder sentenceStringBuffer = new StringBuilder();
@@ -139,7 +170,7 @@
             Span s = spanList.get(i);
             if (sentenceStart == -1)
                 sentenceStart = s.getStart();
-            if (positions) {
+            if (printOffsets) {
                 tokenStringBuffer.append(s.getStart())
                         .append(" ")
                         .append(s.getEnd());
@@ -160,6 +191,7 @@
             outputStream.println(sentenceStringBuffer.toString());
     }
 
+    @Override
     public Span[] tokenizePos(String s) {
         Span token;
         int i = 0;
@@ -171,7 +203,7 @@
                 token = this.getNextToken();
                 if (atEOT) {
                     if (echo) {
-                        printTokenPositions(list, sentencize);
+                        printTokenPositions(list, splitSentences);
                         list.clear();
                     }
                     atEOT = false;
@@ -187,6 +219,7 @@
         return (list.toArray(new Span[list.size()]));
     }
 
+    @Override
     public String[] sentDetect(String s) {
         Span[] spans;
         int i;
@@ -200,6 +233,7 @@
         return sentences;
     }
 
+    @Override
     public Span[] sentPosDetect(String s) {
         final Span tokens[] = tokenizePos(s);
         ArrayList<Span> sentences = new ArrayList<Span>();
@@ -247,7 +281,7 @@
                 to = (yychar - startOffset + yylength() - lengthDiff);
         if (xmlEcho) {
             outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
-        } else if (echo && tokens) {
+        } else if (echo && printTokens) {
             outputStream.println(value);
         }
         startOffset += lengthDiff;
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java b/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
index bf5743c..611549f 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
@@ -47,7 +47,7 @@
     public void testMainWithOffsetsAndSentencesOnDifferentInputFiles() throws IOException {
         File tempFile = File.createTempFile("tokenoutput", ".txt");
         String[] args = {"--no-tokens", "--positions", "--sentence-boundaries", "--force", "-o", tempFile.getAbsolutePath(), input};
-        KorAPTokenizer.main(args);
+        Main.main(args);
         String actualResult = readFile(tempFile.getAbsolutePath());
         String goldData = readFile(gold);
         assertEquals(goldData, actualResult);
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
index 59a181a..bed83cb 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/SentenceSplitterTest.java
@@ -12,21 +12,21 @@
 
     @Test
     public void testSentSplitterSimple () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Der alte Mann.");
         assertEquals(sentences.length, 1);
     }
 
     @Test
     public void testSentSplitterAbbr () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Der Vorsitzende der Abk. hat gewählt.");
         assertEquals(sentences.length, 1);
     }
 
     @Test
     public void testSentSplitterHost1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Gefunden auf wikipedia.org.");
         assertEquals(sentences.length, 1);
     }
@@ -34,14 +34,14 @@
     @Test
     @Ignore
     public void testSentSplitterHost2 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Gefunden auf www.wikipedia.org");
         assertEquals(sentences.length, 1);
     }
 
     @Test
     public void testSentSplitterEmail1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Ich bin unter korap@ids-mannheim.de erreichbar.");
         assertEquals(sentences.length, 1);
     }
@@ -49,28 +49,28 @@
 
     @Test
     public void testSentSplitterWeb1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum");
         assertEquals(sentences.length, 1);
     }
 
    @Test
     public void testSentSplitterServer () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Unser Server ist 10.0.10.51.");
         assertEquals(sentences.length, 1);
     }
 
     @Test
     public void testSentSplitterNum () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Zu 50.4% ist es sicher");
         assertEquals(sentences.length, 1);
     }
 
     @Test
     public void testSentSplitterDate () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Der Termin ist am 5.9.2018");
         assertEquals(sentences.length, 1);
     }
@@ -78,14 +78,14 @@
     @Test
     // Probably interpreted as HOST
     public void testSentSplitterFileExtension1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Ich habe die readme.txt heruntergeladen");
         assertEquals(sentences.length, 1);
     }
 
     @Test
     public void testSentMultiMarker () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("Ausschalten!!! Hast Du nicht gehört???");
         assertEquals("Ausschalten!!!", sentences[0]);
         assertEquals("Hast Du nicht gehört???", sentences[1]);
@@ -95,7 +95,7 @@
     @Test
     @Ignore
     public void testSentSplitterQuote () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] sentences = tok.sentDetect("\"Ausschalten!!!\", sagte er. \"Hast Du nicht gehört???\"");
         assertEquals("\"Ausschalten!!!\", sagte er.", sentences[0]);
         assertEquals("\"Hast Du nicht gehört???\"", sentences[1]);
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerCoverTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerCoverTest.java
index a2da7fb..fd60d6f 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerCoverTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerCoverTest.java
@@ -116,7 +116,7 @@
      * with the gold standard and return the sum of
      * levenshtein distances.
      */
-    public int distanceToGoldStandard (KorAPDFATokenizer tok, String suite, String postings) {
+    public int distanceToGoldStandard (DerekoDfaTokenizer tok, String suite, String postings) {
 
         // Load raw postings
         EmpiristScanner esRaw = new EmpiristScanner(
@@ -159,7 +159,7 @@
     public void testTokenizerCoverEmpiristCmc () {
 
         // Create tokenizer object
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
 
         String test = "cmc_test_blog_comment";
         int dist = distanceToGoldStandard(tok, "test_cmc", test);
@@ -191,7 +191,7 @@
     public void testTokenizerCoverEmpiristWeb () {
 
         // Create tokenizer object
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
 
         String test = "web_test_001";
         int dist = distanceToGoldStandard(tok, "test_web", test);
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index bbe8080..697c65c 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -16,7 +16,7 @@
 
     @Test
     public void testTokenizerSimple () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Der alte Mann");
         assertEquals(tokens[0], "Der");
         assertEquals(tokens[1], "alte");
@@ -34,7 +34,7 @@
     @Test
     @Ignore
     public void testTokenizerAbbr () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Der Vorsitzende der F.D.P. hat gewählt");
         assertEquals(tokens[0], "Der");
         assertEquals(tokens[1], "Vorsitzende");
@@ -47,7 +47,7 @@
 
     @Test
     public void testTokenizerHost1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Gefunden auf wikipedia.org");
         assertEquals(tokens[0], "Gefunden");
         assertEquals(tokens[1], "auf");
@@ -58,7 +58,7 @@
     @Test
     @Ignore
     public void testTokenizerHost2 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Gefunden auf www.wikipedia.org");
         assertEquals(tokens[0], "Gefunden");
         assertEquals(tokens[1], "auf");
@@ -68,7 +68,7 @@
     
     @Test
     public void testTokenizerDash () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Das war -- spitze");
         assertEquals(tokens[0], "Das");
         assertEquals(tokens[1], "war");
@@ -79,7 +79,7 @@
 
     @Test
     public void testTokenizerEmail1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Ich bin unter korap@ids-mannheim.de erreichbar.");
         assertEquals(tokens[0], "Ich");
         assertEquals(tokens[1], "bin");
@@ -92,7 +92,7 @@
 
     @Test
     public void testTokenizerEmail2 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Oder unter korap[at]ids-mannheim[dot]de.");
         assertEquals(tokens[0], "Oder");
         assertEquals(tokens[1], "unter");
@@ -104,7 +104,7 @@
     @Test
     @Ignore
     public void testTokenizerEmail3 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Oder unter korap(at)ids-mannheim(dot)de.");
         assertEquals(tokens[0], "Oder");
         assertEquals(tokens[1], "unter");
@@ -115,7 +115,7 @@
 
     @Test
     public void testTokenizerDoNotAcceptQuotedEmailNames () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("\"John Doe\"@xx.com");
         assertEquals("\"", tokens[0]);
         assertEquals("John", tokens[1]);
@@ -129,7 +129,7 @@
 
     @Test
     public void testTokenizerTwitter () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Folgt @korap und #korap");
         assertEquals(tokens[0], "Folgt");
         assertEquals(tokens[1], "@korap");
@@ -140,7 +140,7 @@
 
     @Test
     public void testTokenizerWeb1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Unsere Website ist https://korap.ids-mannheim.de/?q=Baum");
         assertEquals(tokens[0], "Unsere");
         assertEquals(tokens[1], "Website");
@@ -152,7 +152,7 @@
     @Test
     @Ignore
     public void testTokenizerWeb2 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Wir sind auch im Internet (https://korap.ids-mannheim.de/?q=Baum)");
         assertEquals(tokens[0], "Wir");
         assertEquals(tokens[1], "sind");
@@ -168,7 +168,7 @@
     @Test
     @Ignore
     public void testTokenizerWeb3 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Die Adresse ist https://korap.ids-mannheim.de/?q=Baum.");
         assertEquals(tokens[0], "Die");
         assertEquals(tokens[1], "Adresse");
@@ -180,7 +180,7 @@
 
     @Test
     public void testTokenizerServer () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Unser Server ist 10.0.10.51.");
         assertEquals(tokens[0], "Unser");
         assertEquals(tokens[1], "Server");
@@ -192,7 +192,7 @@
 
     @Test
     public void testTokenizerNum () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Zu 50,4% ist es sicher");
         assertEquals(tokens[0], "Zu");
         assertEquals(tokens[1], "50,4");
@@ -205,7 +205,7 @@
     
     @Test
     public void testTokenizerDate () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Der Termin ist am 5.9.2018");
         assertEquals(tokens[0], "Der");
         assertEquals(tokens[1], "Termin");
@@ -226,7 +226,7 @@
     @Test
     @Ignore
     public void testTokenizerDateRange () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Der Termin war vom 4.-5.9.2018");
         assertEquals(tokens[0], "Der");
         assertEquals(tokens[1], "Termin");
@@ -240,7 +240,7 @@
 
     @Test
     public void testTokenizerEmoji1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Das ist toll! ;)");
         assertEquals(tokens[0], "Das");
         assertEquals(tokens[1], "ist");
@@ -252,7 +252,7 @@
 
     @Test
     public void testTokenizerRef1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Kupietz und Schmidt (2018): Korpuslinguistik");
         assertEquals(tokens[0], "Kupietz");
         assertEquals(tokens[1], "und");
@@ -267,7 +267,7 @@
 
     @Test
     public void testTokenizerRef2 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Kupietz und Schmidt [2018]: Korpuslinguistik");
         assertEquals(tokens[0], "Kupietz");
         assertEquals(tokens[1], "und");
@@ -282,7 +282,7 @@
 
     @Test
     public void testTokenizerOmission1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Er ist ein A****loch!");
         assertEquals(tokens[0], "Er");
         assertEquals(tokens[1], "ist");
@@ -294,7 +294,7 @@
 
     @Test
     public void testTokenizerOmission2 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("F*ck!");
         assertEquals(tokens[0], "F*ck");
         assertEquals(tokens[1], "!");
@@ -303,7 +303,7 @@
 
     @Test
     public void testTokenizerOmission3 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Dieses verf***** Kleid!");
         assertEquals(tokens[0], "Dieses");
         assertEquals(tokens[1], "verf*****");
@@ -315,7 +315,7 @@
     @Test
     // Probably interpreted as HOST
     public void testTokenizerFileExtension1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Ich habe die readme.txt heruntergeladen");
         assertEquals(tokens[0], "Ich");
         assertEquals(tokens[1], "habe");
@@ -328,7 +328,7 @@
     @Test
     // Probably interpreted as HOST
     public void testTokenizerFileExtension2 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Nimm die README.TXT!");
         assertEquals(tokens[0], "Nimm");
         assertEquals(tokens[1], "die");
@@ -340,7 +340,7 @@
     @Test
     // Probably interpreted as HOST
     public void testTokenizerFileExtension3 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Zeig mir profile.jpeg");
         assertEquals(tokens[0], "Zeig");
         assertEquals(tokens[1], "mir");
@@ -350,7 +350,7 @@
 
     @Test
     public void testTokenizerFile1 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.docx");
         assertEquals(tokens[0], "Zeig");
         assertEquals(tokens[1], "mir");
@@ -360,7 +360,7 @@
 
     @Test
     public void testTokenizerFile2 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Gehe zu /Dokumente/profile.docx");
         assertEquals(tokens[0], "Gehe");
         assertEquals(tokens[1], "zu");
@@ -371,7 +371,7 @@
     @Test
     @Ignore
     public void testTokenizerFile3 () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Zeig mir c:\\Dokumente\\profile.jpeg");
         assertEquals(tokens[0], "Zeig");
         assertEquals(tokens[1], "mir");
@@ -381,7 +381,7 @@
 
     @Test
     public void testTokenizerPunct () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Er sagte: \"Es geht mir gut!\", daraufhin ging er.");
         assertEquals(tokens[0], "Er");
         assertEquals(tokens[1], "sagte");
@@ -403,7 +403,7 @@
 
     @Test
     public void testTokenizerPlusAmpersand () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("&quot;Das ist von C&A!&quot;");
         assertEquals(tokens[0], "&quot;");
         assertEquals(tokens[1], "Das");
@@ -417,7 +417,7 @@
 
     @Test
     public void testTokenizerLongEnd () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Siehst Du?!!?");
         assertEquals(tokens[0], "Siehst");
         assertEquals(tokens[1], "Du");
@@ -427,7 +427,7 @@
 
     @Test
     public void testTokenizerIrishO () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Peter O'Toole");
         assertEquals(tokens[0], "Peter");
         assertEquals(tokens[1], "O'Toole");
@@ -436,7 +436,7 @@
 
     @Test
     public void testTokenizerAbr () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Früher bzw. später ...");
         assertEquals(tokens[0], "Früher");
         assertEquals(tokens[1], "bzw.");
@@ -448,7 +448,7 @@
     @Test
     @Ignore
     public void testTokenizerUppercaseRule () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Es war spät.Morgen ist es früh.");
         assertEquals(tokens[0], "Es");
         assertEquals(tokens[1], "war");
@@ -464,7 +464,7 @@
 
     @Test
     public void testTokenizerOrd () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Sie erreichte den 1. Platz!");
         assertEquals(tokens[0], "Sie");
         assertEquals(tokens[1], "erreichte");
@@ -477,7 +477,7 @@
 
     @Test
     public void testNoZipOuputArchive () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         String[] tokens = tok.tokenize("Archive:  Ich bin kein zip\n");
         assertEquals(tokens[0], "Archive");
         assertEquals(tokens[1], ":");
@@ -490,7 +490,7 @@
 
     @Test
     public void testZipOuputArchive () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer();
+        DerekoDfaTokenizer tok = new DerekoDfaTokenizer();
         final ByteArrayOutputStream clearOut = new ByteArrayOutputStream();
         System.setOut(new PrintStream(clearOut));
         String[] tokens = tok.tokenize("Archive:  ich/bin/ein.zip\n");
@@ -498,8 +498,11 @@
     }
 
     @Test
-    public void testTextBreakOutputArchive () {
-        KorAPDFATokenizer tok = new KorAPDFATokenizer(null, null, false, false, false, true, false, false);
+    public void testTextBreakOutputArchive () throws InstantiationException, IllegalAccessException, ClassNotFoundException {
+        DerekoDfaTokenizer tok = (DerekoDfaTokenizer) new KorapTokenizer.Builder()
+                .tokenizerClassName(DerekoDfaTokenizer.class.getName())
+                .printOffsets(true)
+                .build();
         Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
         assertEquals("Text1", tokens[0].getType());
         assertEquals(tokens.length, 9 );
diff --git a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.java b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.java
similarity index 99%
rename from target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.java
rename to target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.java
index 7b45aa1..ab96a79 100644
--- a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.java
+++ b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.java
@@ -1,6 +1,6 @@
 // DO NOT EDIT
 // Generated by JFlex 1.8.2 http://jflex.de/
-// source: src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPDFATokenizer.jflex
+// source: src/main/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
 
 package de.ids_mannheim.korap.tokenizer;
 /**
@@ -61,7 +61,7 @@
 
 // See https://github.com/jflex-de/jflex/issues/222
 @SuppressWarnings("FallThrough")
-public class KorAPDFATokenizer implements opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector {
+public class DerekoDfaTokenizer implements KorapTokenizer, opennlp.tools.tokenize.Tokenizer, opennlp.tools.sentdetect.SentenceDetector {
 
   /** This character denotes the end of file. */
   public static final int YYEOF = -1;
@@ -34137,6 +34137,7 @@
   private boolean zzEOFDone;
 
   /* user code: */
+   private static final CharSequence[] targetLanguages = {"de"};
     private boolean xmlEcho = false;
     private boolean normalize = false;
     private boolean debug = false;
@@ -34145,36 +34146,64 @@
     private long previousFileEndOffset = -1;
     private int tokenId = 0;
     private boolean atEOT = false;
-    private boolean sentencize = false;
+    private boolean splitSentences = false;
     private boolean echo = false;
-    private boolean positions = false;
-    private boolean tokens = false;
+    private boolean printOffsets = false;
+    private boolean printTokens = false;
     private PrintStream outputStream = System.out;
 
-    public KorAPDFATokenizer() {
+    public CharSequence[] getTargetLanguages() {
+        return targetLanguages;
+    }
+
+    public DerekoDfaTokenizer() {
         this.zzReader = null;
     }
 
-    public KorAPDFATokenizer(java.io.Reader in, PrintStream outputStream, boolean echo, boolean tokens, boolean sentencize, boolean positions, boolean xmlEcho, boolean normalize) {
-        this.zzReader = in;
-        if (outputStream != null)
-            this.outputStream = outputStream;
-        this.tokens = tokens;
-        this.sentencize = sentencize;
-        this.positions = positions;
+    @Override
+    public void setInputReader(Reader inputReader) {
+        this.zzReader = inputReader;
+    }
+
+    @Override
+    public void setSplitSentences(boolean splitSentences) {
+        this.splitSentences = splitSentences;
+    }
+
+    @Override
+    public void setEcho(boolean echo) {
         this.echo = echo;
-        this.xmlEcho = xmlEcho;
+    }
+
+    @Override
+    public void setPrintOffsets(boolean printOffsets) {
+        this.printOffsets = printOffsets;
+    }
+
+    @Override
+    public void setPrintTokens(boolean printTokens) {
+        this.printTokens = printTokens;
+    }
+
+    @Override
+    public void setOutputStream(PrintStream outputStream) {
+        this.outputStream = outputStream;
+    }
+
+    @Override
+    public void setNormalize(boolean normalize) {
         this.normalize = normalize;
     }
 
-    public void scanThrough() throws IOException {
+    @Override
+    public void scan() throws IOException {
         List<Span> list = new ArrayList<Span>();
         Span token;
         while (!zzAtEOF) {
             token = this.getNextToken();
             if (atEOT) {
                 if (echo) {
-                    printTokenPositions(list, sentencize);
+                    printTokenPositions(list, splitSentences);
                     list.clear();
                 }
                 atEOT = false;
@@ -34185,6 +34214,7 @@
         }
     }
 
+    @Override
     public String[] tokenize(String s) {
         Span[] spans;
         int i;
@@ -34198,7 +34228,7 @@
         return tokens;
     }
 
-    public void printTokenPositions(List<Span> spanList, boolean sentencize) {
+    void printTokenPositions(List<Span> spanList, boolean sentencize) {
         int sentenceStart = -1;
         StringBuilder tokenStringBuffer = new StringBuilder();
         StringBuilder sentenceStringBuffer = new StringBuilder();
@@ -34206,7 +34236,7 @@
             Span s = spanList.get(i);
             if (sentenceStart == -1)
                 sentenceStart = s.getStart();
-            if (positions) {
+            if (printOffsets) {
                 tokenStringBuffer.append(s.getStart())
                         .append(" ")
                         .append(s.getEnd());
@@ -34227,6 +34257,7 @@
             outputStream.println(sentenceStringBuffer.toString());
     }
 
+    @Override
     public Span[] tokenizePos(String s) {
         Span token;
         int i = 0;
@@ -34238,7 +34269,7 @@
                 token = this.getNextToken();
                 if (atEOT) {
                     if (echo) {
-                        printTokenPositions(list, sentencize);
+                        printTokenPositions(list, splitSentences);
                         list.clear();
                     }
                     atEOT = false;
@@ -34254,6 +34285,7 @@
         return (list.toArray(new Span[list.size()]));
     }
 
+    @Override
     public String[] sentDetect(String s) {
         Span[] spans;
         int i;
@@ -34267,6 +34299,7 @@
         return sentences;
     }
 
+    @Override
     public Span[] sentPosDetect(String s) {
         final Span tokens[] = tokenizePos(s);
         ArrayList<Span> sentences = new ArrayList<Span>();
@@ -34314,7 +34347,7 @@
                 to = (yychar - startOffset + yylength() - lengthDiff);
         if (xmlEcho) {
             outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
-        } else if (echo && tokens) {
+        } else if (echo && printTokens) {
             outputStream.println(value);
         }
         startOffset += lengthDiff;
@@ -34368,7 +34401,7 @@
    *
    * @param   in  the java.io.Reader to read input from.
    */
-  public KorAPDFATokenizer(java.io.Reader in) {
+  public DerekoDfaTokenizer(java.io.Reader in) {
     this.zzReader = in;
   }