Add new command line options using picocli and sanitize code
Usage: koraptokenizer [-hnpsV] [--force] [-ktt] [--[no-]tokens]
[-o=<output_filename>] [<inputFiles>...]
Tokenizes (and sentence splits) text input.
[<inputFiles>...] input files
--force Force overwrite (default: false)
-h, --help Show this help message and exit.
-ktt Deprecated. For internal use only. (default: false)
-n, --normalize Normalize tokens (default: false)
--[no-]tokens Print tokens (default: true)
-o, --output-file=<output_filename>
Output file (default: -)
-p, --positions Print token start and end positions as character
offsets (default: false)
-s, --sentence-boundaries
Print sentence boundary positions (default: false)
-V, --version Print version information and exit.
Change-Id: Ib92678c832a2d95799a8f503c3e86dd4da2b4d73
diff --git a/Readme.md b/Readme.md
index 84d6051..285866b 100644
--- a/Readme.md
+++ b/Readme.md
@@ -32,14 +32,14 @@
you will need at least 5 GB of free RAM.
## Documentation
-The KorAP tokenizer reads from standard input and writes to standard output. It currently supports two modes.
+The KorAP tokenizer reads from standard input and writes to standard output. It supports multiple modes of operation.
-In the default mode, the tokenizer prints all offsets of the first character of a token and the first character after a token.
+With the `--positions` option, for example, the tokenizer prints all offsets of the first character of a token and the first character after a token.
In order to end a text, flush the output and reset the character position, an EOT character (0x04) can be used.
#### Command Line Invocation
```
$ echo -n -e 'This is a text.\x0a\x03\x0aAnd this is another text.\n\x03\n' |\
- java -jar target/KorAP-Tokenizer-1.2-SNAPSHOT.jar
+ java -jar target/KorAP-Tokenizer-1.3-SNAPSHOT.jar --positions
0 4 5 7 8 9 10 15
0 3 4 8 9 11 12 19 20 25
@@ -47,7 +47,7 @@
#### Invocation with Sentence Splitting
```
echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x03\x0aAnd this is another text.\n\x03\nAnd this a sentence without marker\n' |\
- java -jar target/KorAP-Tokenizer-1.2-SNAPSHOT.jar -s
+ java -jar target/KorAP-Tokenizer-1.3-SNAPSHOT.jar --positions --sentence-boundaries
1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28 29 32 33 37 38 40 41 42 43 51 51 54 55 58 59 63 64 67 68 72 72 76
1 28 29 54 55 76
0 3 4 8 9 11 12 19 20 24 24 25
diff --git a/pom.xml b/pom.xml
index 4933cf9..9aca3ed 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
<groupId>groupId</groupId>
<artifactId>KorAP-Tokenizer</artifactId>
- <version>1.3-${git.commit.id.abbrev}</version>
+ <version>1.3-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
@@ -117,7 +117,7 @@
<manifest>
<addClasspath>true</addClasspath>
<classpathPrefix>lib/</classpathPrefix>
- <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl</mainClass>
+ <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
</manifest>
</archive>
</configuration>
@@ -148,7 +148,7 @@
</descriptors>
<archive>
<manifest>
- <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl</mainClass>
+ <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
</manifest>
</archive>
</configuration>
@@ -180,10 +180,42 @@
<generateGitPropertiesFile>false</generateGitPropertiesFile><!-- somehow necessary. otherwise the variables are not available in the pom -->
</configuration>
</plugin>
+ <plugin>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>appassembler-maven-plugin</artifactId>
+ <version>1.10</version>
+ <executions>
+ <execution>
+ <phase>package</phase>
+ <goals>
+ <goal>assemble</goal>
+ </goals>
+ </execution>
+ </executions>
+ <configuration>
+ <binFolder>bin</binFolder>
+ <binFileExtensions>
+ <unix></unix>
+ </binFileExtensions>
+ <programs>
+ <program>
+ <mainClass>de.ids_mannheim.korap.tokenizer.KorAPTokenizer</mainClass>
+ <id>koraptokenizer</id>
+ </program>
+ </programs>
+ </configuration>
+ </plugin>
+
</plugins>
</build>
<dependencies>
+ <dependency>
+ <groupId>info.picocli</groupId>
+ <artifactId>picocli</artifactId>
+ <version>4.2.0</version>
+ </dependency>
+
<!-- https://mvnrepository.com/artifact/org.apache.opennlp/opennlp-tools -->
<dependency>
<groupId>org.apache.opennlp</groupId>
@@ -206,5 +238,11 @@
<version>1.0-1</version>
<scope>test</scope>
</dependency>
+ <!-- https://mvnrepository.com/artifact/org.codehaus.mojo/appassembler-maven-plugin -->
+ <dependency>
+ <groupId>org.codehaus.mojo</groupId>
+ <artifactId>appassembler-maven-plugin</artifactId>
+ <version>2.1.0</version>
+ </dependency>
</dependencies>
</project>
diff --git a/src/assembly/bin-distribution.xml b/src/assembly/bin-distribution.xml
index c830519..ffddc02 100644
--- a/src/assembly/bin-distribution.xml
+++ b/src/assembly/bin-distribution.xml
@@ -2,7 +2,7 @@
<assembly xmlns="http://maven.apache.org/ASSEMBLY/2.1.0"
xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
xsi:schemaLocation="http://maven.apache.org/ASSEMBLY/2.1.0 http://maven.apache.org/xsd/assembly-2.1.0.xsd">
- <id>standalone</id>
+ <id>${git.commit.id.abbrev}-standalone</id>
<formats>
<format>jar</format>
</formats>
@@ -17,6 +17,7 @@
<include>opennlp/tools/util/Span.class</include>
<include>opennlp/tools/tokenize/Tokenizer.class</include>
<include>opennlp/tools/sentdetect/SentenceDetector.class</include>
+ <include>picocli/CommandLine*.class</include>
</includes>
</unpackOptions>
<scope>runtime</scope>
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java b/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java
new file mode 100644
index 0000000..f5fa9cb
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/KorAPTokenizer.java
@@ -0,0 +1,83 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import picocli.CommandLine;
+
+import java.io.*;
+import java.util.ArrayList;
+import java.util.concurrent.Callable;
+
+@CommandLine.Command(mixinStandardHelpOptions = true,
+ name = "koraptokenizer", version = "{}", description = "Tokenizes (and sentence splits) text input.")
+public class KorAPTokenizer implements Callable<Integer> {
+
+ @CommandLine.Option(names = {"--no-tokens"}, negatable = true, description = "Print tokens (default: ${DEFAULT-VALUE})")
+ boolean tokens = true;
+
+ @CommandLine.Option(names = {"-p", "--positions"}, description = "Print token start and end positions as character offsets (default: ${DEFAULT-VALUE})")
+ boolean positions = false;
+
+ @CommandLine.Option(names = {"-s", "--sentence-boundaries"}, description = "Print sentence boundary positions (default: ${DEFAULT-VALUE})")
+ boolean sentencize = false;
+
+ @CommandLine.Option(names = {"-ktt"}, hidden = true, description = "Deprecated. For internal use only. (default: ${DEFAULT-VALUE})")
+ boolean ktt = false;
+
+ @CommandLine.Option(names = {"-n", "--normalize"}, description = "Normalize tokens (default: ${DEFAULT-VALUE})")
+ boolean normalize = false;
+
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"-o",
+ "--output-file"}, paramLabel = "FILE", description = "Output file (default: ${DEFAULT-VALUE})")
+ String output_filename = "-";
+
+ @SuppressWarnings("CanBeFinal")
+ @CommandLine.Option(names = {"--force"}, description = "Force overwrite (default: ${DEFAULT-VALUE})")
+ boolean force_overwrite = false;
+
+
+ @CommandLine.Parameters(arity = "0..*", paramLabel = "FILES", description = "input files")
+ private final ArrayList<String> inputFiles = new ArrayList<>();
+
+ public KorAPTokenizer() {
+
+ }
+
+ public static void main(String[] args) {
+ new CommandLine(new KorAPTokenizer()).execute(args);
+ }
+
+ @Override
+ public Integer call() throws FileNotFoundException {
+ final PrintStream output_stream;
+ if ((output_filename == null) || output_filename.equals("-")) {
+ output_stream = System.out;
+ } else {
+ File f = Utils.createFile(output_filename, force_overwrite);
+ output_stream = new PrintStream(new BufferedOutputStream(new FileOutputStream(f)));
+ }
+
+ for (int i = 0; i < inputFiles.size() || (i == 0 && inputFiles.size() == 0); i++) {
+ KorAPTokenizerImpl scanner = null;
+ String fn = (inputFiles.size() > 0 ? inputFiles.get(i) : "-");
+ try {
+ BufferedReader br = "-".equals(fn) ? new BufferedReader(new InputStreamReader(System.in)) :
+ new BufferedReader(new FileReader(fn));
+ scanner = new KorAPTokenizerImpl(br, output_stream, true, tokens, sentencize, positions, ktt, normalize);
+ scanner.scanThrough();
+ } catch (FileNotFoundException e) {
+ System.err.println("File not found : \"" + fn + "\"");
+ } catch (IOException e) {
+ System.err.println("IO error scanning file \"" + fn + "\"");
+ System.err.println(e);
+ } catch (Exception e) {
+ System.err.println("Unexpected exception:");
+ e.printStackTrace();
+ }
+ }
+ if ((output_filename != null) && !output_filename.equals("-")) {
+ output_stream.close();
+ }
+ return 0;
+ }
+}
+
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Utils.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Utils.java
new file mode 100644
index 0000000..6df7f00
--- /dev/null
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Utils.java
@@ -0,0 +1,29 @@
+package de.ids_mannheim.korap.tokenizer;
+
+import java.io.File;
+import java.io.IOException;
+import java.nio.file.AccessDeniedException;
+import java.nio.file.FileAlreadyExistsException;
+import java.nio.file.Files;
+
+public class Utils {
+ public static File createFile(String fname, boolean force_overwrite) {
+ File f = new File(fname);
+ try {
+ Files.createFile(f.toPath());
+ } catch (AccessDeniedException e) {
+ final String message = "ERROR: Cannot write file '" + fname + "'";
+ System.err.println(message);
+ System.exit(-1);
+ } catch (FileAlreadyExistsException e) {
+ if (!force_overwrite) {
+ final String message = "ERROR: '" + fname + "' already exits. Use --force to overwrite";
+ System.err.println(message);
+ System.exit(-1);
+ }
+ } catch (IOException e) {
+ e.printStackTrace();
+ }
+ return f;
+ }
+}
diff --git a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
index 79439c6..85bcadc 100644
--- a/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
+++ b/src/main/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.jflex
@@ -49,10 +49,7 @@
See the License for the specific language governing permissions and
limitations under the License.
*/
-
-import java.io.BufferedReader;
-import java.io.StringReader;
-import java.io.InputStreamReader;
+import java.io.*;
import java.lang.StringBuffer;
import java.util.ArrayList;
import java.util.List;
@@ -73,252 +70,230 @@
%char
%{
+ private boolean xmlEcho = false;
+ private boolean normalize = false;
+ private boolean debug = false;
+ private boolean newSentence = true;
+ private long startOffset = 0;
+ private long previousFileEndOffset = -1;
+ private int tokenId = 0;
+ private boolean atEOT = false;
+ private boolean sentencize = false;
+ private boolean echo = false;
+ private boolean positions = false;
+ private boolean tokens = false;
+ private PrintStream outputStream = System.out;
- public boolean xmlEcho = false;
- public boolean sentences = false;
- public boolean normalize = false;
- public boolean debug = false;
- private boolean newSentence = true;
- private long startOffset = 0;
- private long previousFileEndOffset = -1;
- private int tokenId = 0;
- private StringBuffer bounds = null;
- private long fallbackSentenceEndOffset = -1;
- private StringBuffer sentenceBounds = null;
+ public KorAPTokenizerImpl() {
+ this.zzReader = null;
+ }
- public KorAPTokenizerImpl() {
- this.zzReader = null;
- sentenceBounds = null;
- }
+ public KorAPTokenizerImpl(java.io.Reader in, PrintStream outputStream, boolean echo, boolean tokens, boolean sentencize, boolean positions, boolean xmlEcho, boolean normalize) {
+ this.zzReader = in;
+ if (outputStream != null)
+ this.outputStream = outputStream;
+ this.tokens = tokens;
+ this.sentencize = sentencize;
+ this.positions = positions;
+ this.echo = echo;
+ this.xmlEcho = xmlEcho;
+ this.normalize = normalize;
+ }
- public String[] tokenize(String s) {
- Span[] spans;
- int i;
- String[] tokens;
-
- spans = tokenizePos(s);
- tokens = new String[spans.length];
- for(i=0; i<spans.length; i++) {
- tokens[i]=spans[i].getType();
- }
- return tokens;
- }
-
- public Span[] tokenizePos(String s) {
- Span token;
- int i=0;
- List<Span> list = new ArrayList<Span>();
- tokenId=0;
- yyreset(new StringReader(s));
- try {
- while(!this.zzAtEOF) {
- token = this.getNextToken();
- if(token != null) {
- list.add(token);
- }
- }
- } catch (java.io.IOException e) {
- System.out.println("IO error scanning "+s);
- System.out.println(e);
- }
- return(list.toArray(new Span[list.size()]));
- }
-
- public String[] sentDetect(String s) {
- Span[] spans;
- int i;
- String[] sentences;
-
- spans = sentPosDetect(s);
- sentences = new String[spans.length];
- for (i = 0; i < spans.length; i++) {
- sentences[i] = spans[i].getType();
- }
- return sentences;
- }
-
- public Span[] sentPosDetect(String s) {
- final Span tokens[] = tokenizePos(s);
- ArrayList<Span> sentences = new ArrayList<Span>();
- int sentenceStart = 0;
- if (tokens.length > 0)
- tokens[0].getStart();
- for (int i = 0; i < tokens.length; i++) {
- if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
- sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
- if (i < tokens.length - 1) {
- sentenceStart = tokens[i + 1].getStart();
+ public void scanThrough() throws IOException {
+ List<Span> list = new ArrayList<Span>();
+ Span token;
+ while (!zzAtEOF) {
+ token = this.getNextToken();
+ if (atEOT) {
+ if (echo) {
+ printTokenPositions(list, sentencize);
+ list.clear();
+ }
+ atEOT = false;
+ }
+ if (token != null) {
+ list.add(token);
}
}
}
- return sentences.toArray(new Span[0]);
- }
- public int[] tokenizeMilestones(String s) {
- Span[] spans;
- int i;
- int[] milestones;
+ public String[] tokenize(String s) {
+ Span[] spans;
+ int i;
+ String[] tokens;
- spans = tokenizePos(s);
- milestones = new int[2*spans.length];
- for(i=0; i<spans.length; i++) {
- milestones[i*2]=spans[i].getStart();
- milestones[i*2+1]=spans[i].getEnd();
- }
- return milestones;
- }
+ spans = tokenizePos(s);
+ tokens = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ tokens[i] = spans[i].getType();
+ }
+ return tokens;
+ }
- public final long yychar() {
- return yychar;
- }
+ public void printTokenPositions(List<Span> spanList, boolean sentencize) {
+ int sentenceStart = -1;
+ StringBuilder tokenStringBuffer = new StringBuilder();
+ StringBuilder sentenceStringBuffer = new StringBuilder();
+ for (int i = 0; i < spanList.size(); i++) {
+ Span s = spanList.get(i);
+ if (sentenceStart == -1)
+ sentenceStart = s.getStart();
+ if (positions) {
+ tokenStringBuffer.append(s.getStart())
+ .append(" ")
+ .append(s.getEnd());
+ if (i < spanList.size() - 1)
+ tokenStringBuffer.append(" ");
+ }
+ if (isSentenceBound(s.getType()) || (i == spanList.size() - 1)) {
+ sentenceStringBuffer.append(sentenceStart)
+ .append(" ")
+ .append(s.getEnd());
+ sentenceStart = -1;
+ if (i < spanList.size() - 1)
+ sentenceStringBuffer.append(" ");
+ }
+ }
+ outputStream.println(tokenStringBuffer.toString());
+ if (sentencize)
+ outputStream.println(sentenceStringBuffer.toString());
+ }
- final Span currentToken() {
- return currentToken(yytext());
- }
+ public Span[] tokenizePos(String s) {
+ Span token;
+ int i = 0;
+ List<Span> list = new ArrayList<Span>();
+ tokenId = 0;
+ yyreset(new StringReader(s));
+ try {
+ while (!this.zzAtEOF) {
+ token = this.getNextToken();
+ if (atEOT) {
+ if (echo) {
+ printTokenPositions(list, sentencize);
+ list.clear();
+ }
+ atEOT = false;
+ }
+ if (token != null) {
+ list.add(token);
+ }
+ }
+ } catch (java.io.IOException e) {
+ System.err.println("IO error scanning " + s);
+ System.err.println(e);
+ }
+ return (list.toArray(new Span[list.size()]));
+ }
- public boolean isSentenceBound(String s) {
+ public String[] sentDetect(String s) {
+ Span[] spans;
+ int i;
+ String[] sentences;
+
+ spans = sentPosDetect(s);
+ sentences = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ sentences[i] = spans[i].getType();
+ }
+ return sentences;
+ }
+
+ public Span[] sentPosDetect(String s) {
+ final Span tokens[] = tokenizePos(s);
+ ArrayList<Span> sentences = new ArrayList<Span>();
+ int sentenceStart = 0;
+ if (tokens.length > 0)
+ tokens[0].getStart();
+ for (int i = 0; i < tokens.length; i++) {
+ if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
+ sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
+ if (i < tokens.length - 1) {
+ sentenceStart = tokens[i + 1].getStart();
+ }
+ }
+ }
+ return sentences.toArray(new Span[0]);
+ }
+
+ public final long yychar() {
+ return yychar;
+ }
+
+ final Span currentToken() {
+ return currentToken(yytext());
+ }
+
+ public boolean isSentenceBound(String s) {
return s.matches("^[.?!]+$");
}
- final Span currentToken(String normalizedValue) {
- String value;
- long lengthDiff=0;
- previousFileEndOffset = -1;
+ final Span currentToken(String normalizedValue) {
+ String value;
+ long lengthDiff = 0;
+ previousFileEndOffset = -1;
- if(normalize) {
- value = normalizedValue;
- } else {
- value = yytext();
- lengthDiff = value.length() - value.codePointCount(0, value.length());
- }
- if(startOffset > yychar || startOffset < 0) { // how can this happen?
- startOffset = 0;
- }
- long from = (yychar-startOffset),
- to = (yychar-startOffset+yylength()-lengthDiff);
- if(xmlEcho) {
- System.out.println("<span id=\"t_"+tokenId+"\" from=\""+from+"\" to=\"" + to + "\"/>\n"+value);
- }
- startOffset += lengthDiff;
- tokenId++;
- if(bounds != null) {
- if(debug) {
- System.err.println(from+"-"+to+":"+ value);
- }
- bounds.append(from+" "+to+" ");
- if (sentences) {
- if (newSentence || sentenceBounds.length() == 0) {
- if (sentenceBounds.length() != 0)
- sentenceBounds.append(" ");
- sentenceBounds.append(from);
- newSentence = false;
- }
- if (isSentenceBound(value)) {
- sentenceBounds.append(" " + to);
- fallbackSentenceEndOffset = -1;
- newSentence = true;
- } else {
- fallbackSentenceEndOffset = to;
- }
+ if (normalize) {
+ value = normalizedValue;
+ } else {
+ value = yytext();
+ lengthDiff = value.length() - value.codePointCount(0, value.length());
+ }
+ if (startOffset > yychar || startOffset < 0) { // how can this happen?
+ startOffset = 0;
+ }
+ long from = (yychar - startOffset),
+ to = (yychar - startOffset + yylength() - lengthDiff);
+ if (xmlEcho) {
+ outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
+ } else if (echo && tokens) {
+ outputStream.println(value);
+ }
+ startOffset += lengthDiff;
+ tokenId++;
+ return new Span((int) from, (int) to, value);
+ }
+
+ final void fileEnd() {
+ startOffset = yychar + yylength();
+ // do not end a file multiple times because of additional EOT characters
+ if (startOffset == previousFileEndOffset)
+ return;
+ atEOT = true;
+ previousFileEndOffset = startOffset;
+ tokenId = 0;
+ }
+
+ final Span xmlPassage() {
+ if (xmlEcho) {
+ String dings = yytext();
+ if (dings.indexOf("<text") >= 0) {
+ startOffset = yychar + yylength();
+ tokenId = 0;
}
+ outputStream.println(dings.replaceAll("[\n\r]+", ""));
+ return null;
+ } else {
+ return currentToken();
}
- return new Span((int)from, (int)to, value);
- }
+ }
- final void fileEnd() {
- startOffset = yychar+yylength();
- // do not end a file multiple times because of additional EOT characters
- if (startOffset == previousFileEndOffset)
- return;;
- previousFileEndOffset = startOffset;
- tokenId=0;
- if(bounds != null && !xmlEcho) {
- System.out.println(bounds.toString().trim());
- if (sentences && sentenceBounds != null) {
- if (fallbackSentenceEndOffset != -1 && bounds.toString().trim().length() != 0)
- sentenceBounds.append(" "+fallbackSentenceEndOffset);
- System.out.println(sentenceBounds.toString());
- }
- bounds.setLength(0);
- sentenceBounds.setLength(0);
- }
- }
+ final void zipArchive() {
+ String name;
+ String matched = yytext();
+ int start = 10;
+ name = matched.substring(start, matched.length() - 1);
+ outputStream.println("<archive name=\"" + name + "\"/>");
+ }
- final Span xmlPassage() {
- if(xmlEcho) {
- String dings = yytext();
- if(dings.indexOf("<text")>=0 ) {
- startOffset = yychar+yylength();
- tokenId=0;
- }
- System.out.println(dings.replaceAll("[\n\r]+",""));
- return null;
- } else {
- return currentToken();
- }
- }
-
- final void zipArchive() {
- String name;
- String matched = yytext();
- int start = 10;
- name = matched.substring(start, matched.length() - 1);
- System.out.println("<archive name=\"" + name + "\"/>");
- }
-
- final void zippedFile() {
- String name;
- String matched = yytext();
- int start = 13;
- name = matched.substring(start, matched.length() - 3);
- System.out.println("<file name=\"" + name + "\"/>");
- }
-
- public static void main(String argv[]) {
- int args=argv.length;
- int j=0;
- boolean xmlout = false;
- boolean normalize = false;
- boolean sentences = false;
-
- for (int i = 0; i < argv.length && argv[i].indexOf("-") == 0; i++) {
- if(argv[i].equals("-ktt")) { // act as a tokenizer for KorAP TreeTagger
- xmlout=true;
- } else if(argv[i].equals("-n")) { // do some normailization
- normalize=true;
- } else if(argv[i].equals("-s")) { // detect sentence boundaries
- sentences=true;
- }
- j++;
- }
-
- for (int i = j; i < argv.length || (i == j && argv.length == j); i++) {
- KorAPTokenizerImpl scanner = null;
- String fn = (argv.length > j ? argv[i] : "-");
- try {
- BufferedReader br = "-".equals(fn) ? new BufferedReader(new InputStreamReader(System.in)) :
- new BufferedReader(new java.io.FileReader(fn));
- scanner = new KorAPTokenizerImpl(br);
- scanner.bounds = new StringBuffer(1280000);
- scanner.sentenceBounds = new StringBuffer(128000);
- scanner.xmlEcho=xmlout;
- scanner.normalize=normalize;
- scanner.sentences=sentences;
- while ( !scanner.zzAtEOF ) { scanner.getNextToken(); }
- }
- catch (java.io.FileNotFoundException e) {
- System.out.println("File not found : \""+fn+"\"");
- }
- catch (java.io.IOException e) {
- System.out.println("IO error scanning file \""+fn+"\"");
- System.out.println(e);
- }
- catch (Exception e) {
- System.out.println("Unexpected exception:");
- e.printStackTrace();
- }
- }
- }
+ final void zippedFile() {
+ String name;
+ String matched = yytext();
+ int start = 13;
+ name = matched.substring(start, matched.length() - 3);
+ outputStream.println("<file name=\"" + name + "\"/>");
+ }
%}
THAI = [\u0E00-\u0E59]
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java b/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
index a4826dd..bf5743c 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/IPCOffsetTests.java
@@ -1,6 +1,5 @@
package de.ids_mannheim.korap.tokenizer;
-import org.apache.maven.surefire.shade.org.apache.commons.io.output.ByteArrayOutputStream;
import org.junit.Test;
import org.junit.runner.RunWith;
import org.junit.runners.Parameterized;
@@ -16,7 +15,6 @@
import static org.junit.Assert.*;
@RunWith(Parameterized.class)
-@net.jcip.annotations.NotThreadSafe
public class IPCOffsetTests {
@Parameterized.Parameters
public static Collection<Object[]> data() {
@@ -47,12 +45,12 @@
@Test
public void testMainWithOffsetsAndSentencesOnDifferentInputFiles() throws IOException {
- final ByteArrayOutputStream myOut = new ByteArrayOutputStream();
- System.setOut(new PrintStream(myOut));
- String[] args = {"-s", input};
- KorAPTokenizerImpl.main(args);
+ File tempFile = File.createTempFile("tokenoutput", ".txt");
+ String[] args = {"--no-tokens", "--positions", "--sentence-boundaries", "--force", "-o", tempFile.getAbsolutePath(), input};
+ KorAPTokenizer.main(args);
+ String actualResult = readFile(tempFile.getAbsolutePath());
String goldData = readFile(gold);
- assertEquals(goldData, myOut.toString(StandardCharsets.UTF_8));
+ assertEquals(goldData, actualResult);
}
}
diff --git a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
index f7811c0..fe694be 100644
--- a/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
+++ b/src/test/java/de/ids_mannheim/korap/tokenizer/TokenizerTest.java
@@ -2,6 +2,7 @@
import static org.junit.Assert.*;
+import opennlp.tools.util.Span;
import org.apache.maven.surefire.shade.org.apache.commons.io.output.ByteArrayOutputStream;
import org.junit.Test;
import org.junit.Ignore;
@@ -495,4 +496,12 @@
String[] tokens = tok.tokenize("Archive: ich/bin/ein.zip\n");
assertEquals(0, tokens.length);
}
+
+ @Test
+ public void testTextBreakOutputArchive () {
+ KorAPTokenizerImpl tok = new KorAPTokenizerImpl(null, null, false, false, false, true, false, false);
+ Span[] tokens = tok.tokenizePos("Text1\004\nText2 Hallo\004Rumsdibums\004Das freut mich sehr.\n");
+ assertEquals("Text1", tokens[0].getType());
+ assertEquals(tokens.length, 9 );
+ }
}
diff --git a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
index 07a6e08..0c47090 100644
--- a/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
+++ b/target/generated-sources/jflex/de/ids_mannheim/korap/tokenizer/KorAPTokenizerImpl.java
@@ -53,10 +53,7 @@
See the License for the specific language governing permissions and
limitations under the License.
*/
-
-import java.io.BufferedReader;
-import java.io.StringReader;
-import java.io.InputStreamReader;
+import java.io.*;
import java.lang.StringBuffer;
import java.util.ArrayList;
import java.util.List;
@@ -34140,252 +34137,230 @@
private boolean zzEOFDone;
/* user code: */
+ private boolean xmlEcho = false;
+ private boolean normalize = false;
+ private boolean debug = false;
+ private boolean newSentence = true;
+ private long startOffset = 0;
+ private long previousFileEndOffset = -1;
+ private int tokenId = 0;
+ private boolean atEOT = false;
+ private boolean sentencize = false;
+ private boolean echo = false;
+ private boolean positions = false;
+ private boolean tokens = false;
+ private PrintStream outputStream = System.out;
- public boolean xmlEcho = false;
- public boolean sentences = false;
- public boolean normalize = false;
- public boolean debug = false;
- private boolean newSentence = true;
- private long startOffset = 0;
- private long previousFileEndOffset = -1;
- private int tokenId = 0;
- private StringBuffer bounds = null;
- private long fallbackSentenceEndOffset = -1;
- private StringBuffer sentenceBounds = null;
+ public KorAPTokenizerImpl() {
+ this.zzReader = null;
+ }
- public KorAPTokenizerImpl() {
- this.zzReader = null;
- sentenceBounds = null;
- }
+ public KorAPTokenizerImpl(java.io.Reader in, PrintStream outputStream, boolean echo, boolean tokens, boolean sentencize, boolean positions, boolean xmlEcho, boolean normalize) {
+ this.zzReader = in;
+ if (outputStream != null)
+ this.outputStream = outputStream;
+ this.tokens = tokens;
+ this.sentencize = sentencize;
+ this.positions = positions;
+ this.echo = echo;
+ this.xmlEcho = xmlEcho;
+ this.normalize = normalize;
+ }
- public String[] tokenize(String s) {
- Span[] spans;
- int i;
- String[] tokens;
-
- spans = tokenizePos(s);
- tokens = new String[spans.length];
- for(i=0; i<spans.length; i++) {
- tokens[i]=spans[i].getType();
- }
- return tokens;
- }
-
- public Span[] tokenizePos(String s) {
- Span token;
- int i=0;
- List<Span> list = new ArrayList<Span>();
- tokenId=0;
- yyreset(new StringReader(s));
- try {
- while(!this.zzAtEOF) {
- token = this.getNextToken();
- if(token != null) {
- list.add(token);
- }
- }
- } catch (java.io.IOException e) {
- System.out.println("IO error scanning "+s);
- System.out.println(e);
- }
- return(list.toArray(new Span[list.size()]));
- }
-
- public String[] sentDetect(String s) {
- Span[] spans;
- int i;
- String[] sentences;
-
- spans = sentPosDetect(s);
- sentences = new String[spans.length];
- for (i = 0; i < spans.length; i++) {
- sentences[i] = spans[i].getType();
- }
- return sentences;
- }
-
- public Span[] sentPosDetect(String s) {
- final Span tokens[] = tokenizePos(s);
- ArrayList<Span> sentences = new ArrayList<Span>();
- int sentenceStart = 0;
- if (tokens.length > 0)
- tokens[0].getStart();
- for (int i = 0; i < tokens.length; i++) {
- if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
- sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
- if (i < tokens.length - 1) {
- sentenceStart = tokens[i + 1].getStart();
+ public void scanThrough() throws IOException {
+ List<Span> list = new ArrayList<Span>();
+ Span token;
+ while (!zzAtEOF) {
+ token = this.getNextToken();
+ if (atEOT) {
+ if (echo) {
+ printTokenPositions(list, sentencize);
+ list.clear();
+ }
+ atEOT = false;
+ }
+ if (token != null) {
+ list.add(token);
}
}
}
- return sentences.toArray(new Span[0]);
- }
- public int[] tokenizeMilestones(String s) {
- Span[] spans;
- int i;
- int[] milestones;
+ public String[] tokenize(String s) {
+ Span[] spans;
+ int i;
+ String[] tokens;
- spans = tokenizePos(s);
- milestones = new int[2*spans.length];
- for(i=0; i<spans.length; i++) {
- milestones[i*2]=spans[i].getStart();
- milestones[i*2+1]=spans[i].getEnd();
- }
- return milestones;
- }
+ spans = tokenizePos(s);
+ tokens = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ tokens[i] = spans[i].getType();
+ }
+ return tokens;
+ }
- public final long yychar() {
- return yychar;
- }
+ public void printTokenPositions(List<Span> spanList, boolean sentencize) {
+ int sentenceStart = -1;
+ StringBuilder tokenStringBuffer = new StringBuilder();
+ StringBuilder sentenceStringBuffer = new StringBuilder();
+ for (int i = 0; i < spanList.size(); i++) {
+ Span s = spanList.get(i);
+ if (sentenceStart == -1)
+ sentenceStart = s.getStart();
+ if (positions) {
+ tokenStringBuffer.append(s.getStart())
+ .append(" ")
+ .append(s.getEnd());
+ if (i < spanList.size() - 1)
+ tokenStringBuffer.append(" ");
+ }
+ if (isSentenceBound(s.getType()) || (i == spanList.size() - 1)) {
+ sentenceStringBuffer.append(sentenceStart)
+ .append(" ")
+ .append(s.getEnd());
+ sentenceStart = -1;
+ if (i < spanList.size() - 1)
+ sentenceStringBuffer.append(" ");
+ }
+ }
+ outputStream.println(tokenStringBuffer.toString());
+ if (sentencize)
+ outputStream.println(sentenceStringBuffer.toString());
+ }
- final Span currentToken() {
- return currentToken(yytext());
- }
+ public Span[] tokenizePos(String s) {
+ Span token;
+ int i = 0;
+ List<Span> list = new ArrayList<Span>();
+ tokenId = 0;
+ yyreset(new StringReader(s));
+ try {
+ while (!this.zzAtEOF) {
+ token = this.getNextToken();
+ if (atEOT) {
+ if (echo) {
+ printTokenPositions(list, sentencize);
+ list.clear();
+ }
+ atEOT = false;
+ }
+ if (token != null) {
+ list.add(token);
+ }
+ }
+ } catch (java.io.IOException e) {
+ System.err.println("IO error scanning " + s);
+ System.err.println(e);
+ }
+ return (list.toArray(new Span[list.size()]));
+ }
- public boolean isSentenceBound(String s) {
+ public String[] sentDetect(String s) {
+ Span[] spans;
+ int i;
+ String[] sentences;
+
+ spans = sentPosDetect(s);
+ sentences = new String[spans.length];
+ for (i = 0; i < spans.length; i++) {
+ sentences[i] = spans[i].getType();
+ }
+ return sentences;
+ }
+
+ // Splits s into sentences and returns their character-offset spans
+ // (the type field carries the sentence's surface string).
+ public Span[] sentPosDetect(String s) {
+ final Span tokens[] = tokenizePos(s);
+ ArrayList<Span> sentences = new ArrayList<Span>();
+ int sentenceStart = 0;
+ // start the first sentence at the first token, not at offset 0,
+ // so leading whitespace is not included
+ if (tokens.length > 0)
+ sentenceStart = tokens[0].getStart();
+ for (int i = 0; i < tokens.length; i++) {
+ // a token consisting only of .?! ends a sentence; the last token always does
+ if (tokens[i].getType().matches("^[.?!]+$") || i == tokens.length - 1) {
+ sentences.add(new Span(sentenceStart, tokens[i].getEnd(), s.substring(sentenceStart, tokens[i].getEnd())));
+ if (i < tokens.length - 1) {
+ sentenceStart = tokens[i + 1].getStart();
+ }
+ }
+ }
+ return sentences.toArray(new Span[0]);
+ }
+
+ // Exposes the scanner's current character position (JFlex yychar field).
+ public final long yychar() {
+ return yychar;
+ }
+
+ // Convenience overload: builds a span from the raw matched text.
+ final Span currentToken() {
+ return currentToken(yytext());
+ }
+
+ // True iff s consists only of sentence-final punctuation (., ?, !).
+ public boolean isSentenceBound(String s) {
return s.matches("^[.?!]+$");
}
- final Span currentToken(String normalizedValue) {
- String value;
- long lengthDiff=0;
- previousFileEndOffset = -1;
+ // Builds a Span for the current match and echoes it in XML or plain-token
+ // mode; offsets are text-relative (yychar minus startOffset).
+ final Span currentToken(String normalizedValue) {
+ String value;
+ long lengthDiff = 0;
+ previousFileEndOffset = -1;
- if(normalize) {
- value = normalizedValue;
- } else {
- value = yytext();
- lengthDiff = value.length() - value.codePointCount(0, value.length());
- }
- if(startOffset > yychar || startOffset < 0) { // how can this happen?
- startOffset = 0;
- }
- long from = (yychar-startOffset),
- to = (yychar-startOffset+yylength()-lengthDiff);
- if(xmlEcho) {
- System.out.println("<span id=\"t_"+tokenId+"\" from=\""+from+"\" to=\"" + to + "\"/>\n"+value);
- }
- startOffset += lengthDiff;
- tokenId++;
- if(bounds != null) {
- if(debug) {
- System.err.println(from+"-"+to+":"+ value);
- }
- bounds.append(from+" "+to+" ");
- if (sentences) {
- if (newSentence || sentenceBounds.length() == 0) {
- if (sentenceBounds.length() != 0)
- sentenceBounds.append(" ");
- sentenceBounds.append(from);
- newSentence = false;
- }
- if (isSentenceBound(value)) {
- sentenceBounds.append(" " + to);
- fallbackSentenceEndOffset = -1;
- newSentence = true;
- } else {
- fallbackSentenceEndOffset = to;
- }
+ if (normalize) {
+ value = normalizedValue;
+ } else {
+ value = yytext();
+ // lengthDiff counts surrogate pairs (non-BMP chars) so that the
+ // reported offsets are code-point based, not UTF-16 unit based
+ lengthDiff = value.length() - value.codePointCount(0, value.length());
+ }
+ if (startOffset > yychar || startOffset < 0) { // how can this happen?
+ startOffset = 0;
+ }
+ long from = (yychar - startOffset),
+ to = (yychar - startOffset + yylength() - lengthDiff);
+ if (xmlEcho) {
+ outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
+ } else if (echo && tokens) {
+ outputStream.println(value);
+ }
+ startOffset += lengthDiff;
+ tokenId++;
+ return new Span((int) from, (int) to, value);
+ }
+
+ // Marks the end of a text (EOT): sets atEOT for the tokenize loop and
+ // resets the offset base and token ids for the next text.
+ final void fileEnd() {
+ startOffset = yychar + yylength();
+ // do not end a file multiple times because of additional EOT characters
+ if (startOffset == previousFileEndOffset)
+ return;
+ atEOT = true;
+ previousFileEndOffset = startOffset;
+ tokenId = 0;
+ }
+
+ // Handles a matched XML passage: in xmlEcho mode the tag is echoed
+ // (newlines stripped) and no token is produced; a "<text" tag additionally
+ // resets the offset base and token ids for a new text.
+ final Span xmlPassage() {
+ if (xmlEcho) {
+ String dings = yytext();
+ if (dings.indexOf("<text") >= 0) {
+ startOffset = yychar + yylength();
+ tokenId = 0;
}
+ outputStream.println(dings.replaceAll("[\n\r]+", ""));
+ return null;
+ } else {
+ // outside xmlEcho mode the passage is treated as a regular token
+ return currentToken();
}
- return new Span((int)from, (int)to, value);
- }
+ }
- final void fileEnd() {
- startOffset = yychar+yylength();
- // do not end a file multiple times because of additional EOT characters
- if (startOffset == previousFileEndOffset)
- return;;
- previousFileEndOffset = startOffset;
- tokenId=0;
- if(bounds != null && !xmlEcho) {
- System.out.println(bounds.toString().trim());
- if (sentences && sentenceBounds != null) {
- if (fallbackSentenceEndOffset != -1 && bounds.toString().trim().length() != 0)
- sentenceBounds.append(" "+fallbackSentenceEndOffset);
- System.out.println(sentenceBounds.toString());
- }
- bounds.setLength(0);
- sentenceBounds.setLength(0);
- }
- }
+ // Echoes an <archive name="..."/> tag for a matched zip archive header.
+ // NOTE(review): start=10 and the trailing -1 strip a fixed-length
+ // prefix/suffix of the match — confirm against the lexer rule.
+ final void zipArchive() {
+ String name;
+ String matched = yytext();
+ int start = 10;
+ name = matched.substring(start, matched.length() - 1);
+ outputStream.println("<archive name=\"" + name + "\"/>");
+ }
- final Span xmlPassage() {
- if(xmlEcho) {
- String dings = yytext();
- if(dings.indexOf("<text")>=0 ) {
- startOffset = yychar+yylength();
- tokenId=0;
- }
- System.out.println(dings.replaceAll("[\n\r]+",""));
- return null;
- } else {
- return currentToken();
- }
- }
-
- final void zipArchive() {
- String name;
- String matched = yytext();
- int start = 10;
- name = matched.substring(start, matched.length() - 1);
- System.out.println("<archive name=\"" + name + "\"/>");
- }
-
- final void zippedFile() {
- String name;
- String matched = yytext();
- int start = 13;
- name = matched.substring(start, matched.length() - 3);
- System.out.println("<file name=\"" + name + "\"/>");
- }
-
- public static void main(String argv[]) {
- int args=argv.length;
- int j=0;
- boolean xmlout = false;
- boolean normalize = false;
- boolean sentences = false;
-
- for (int i = 0; i < argv.length && argv[i].indexOf("-") == 0; i++) {
- if(argv[i].equals("-ktt")) { // act as a tokenizer for KorAP TreeTagger
- xmlout=true;
- } else if(argv[i].equals("-n")) { // do some normailization
- normalize=true;
- } else if(argv[i].equals("-s")) { // detect sentence boundaries
- sentences=true;
- }
- j++;
- }
-
- for (int i = j; i < argv.length || (i == j && argv.length == j); i++) {
- KorAPTokenizerImpl scanner = null;
- String fn = (argv.length > j ? argv[i] : "-");
- try {
- BufferedReader br = "-".equals(fn) ? new BufferedReader(new InputStreamReader(System.in)) :
- new BufferedReader(new java.io.FileReader(fn));
- scanner = new KorAPTokenizerImpl(br);
- scanner.bounds = new StringBuffer(1280000);
- scanner.sentenceBounds = new StringBuffer(128000);
- scanner.xmlEcho=xmlout;
- scanner.normalize=normalize;
- scanner.sentences=sentences;
- while ( !scanner.zzAtEOF ) { scanner.getNextToken(); }
- }
- catch (java.io.FileNotFoundException e) {
- System.out.println("File not found : \""+fn+"\"");
- }
- catch (java.io.IOException e) {
- System.out.println("IO error scanning file \""+fn+"\"");
- System.out.println(e);
- }
- catch (Exception e) {
- System.out.println("Unexpected exception:");
- e.printStackTrace();
- }
- }
- }
+ // Echoes a <file name="..."/> tag for a matched zipped-file entry.
+ // NOTE(review): start=13 and the trailing -3 strip a fixed-length
+ // prefix/suffix of the match — confirm against the lexer rule.
+ final void zippedFile() {
+ String name;
+ String matched = yytext();
+ int start = 13;
+ name = matched.substring(start, matched.length() - 3);
+ outputStream.println("<file name=\"" + name + "\"/>");
+ }
/**