Improve systematicity of options -p, -s, --[no-]tokens
-s alone should just print sentence boundary markers and no offsets, e.g.:
echo -n -e 'Das ist der 1. Satz. Hier folgt der zweite.' |\
  java -jar target/KorAP-Tokenizer-2.0.0-standalone.jar -s
Das
ist
der
1.
Satz
.

Hier
folgt
der
zweite
.
Thanks @Roman!
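The two behavior changes can be summarized as: token offsets are only printed when --positions is given (the offset list is still cleared at EOT either way), and with -s a blank line is emitted after each sentence-final token. A minimal Python sketch of that output logic (not the actual Java/JFlex code; emit_tokens, SENTENCE_FINAL, and the (value, start, end) token triples are illustrative assumptions):

```python
# Hypothetical sketch of the changed output logic, not the real tokenizer.
SENTENCE_FINAL = {".", "!", "?"}  # assumed stand-in for isSentenceBound()

def emit_tokens(tokens, print_tokens=True, print_offsets=False,
                split_sentences=False):
    """Return output lines for a stream of (value, start, end) triples."""
    lines = []
    offsets = []
    for value, start, end in tokens:
        offsets.extend((start, end))  # collected regardless of output mode
        if print_tokens:
            lines.append(value)
            if split_sentences and value in SENTENCE_FINAL:
                lines.append("")  # blank line marks the sentence boundary
    if print_offsets:  # previously printed unconditionally at EOT
        lines.append(" ".join(map(str, offsets)))
    return lines
```

For example, `emit_tokens(triples, split_sentences=True)` yields one token per line with an empty line after each sentence, matching the -s output above, while `print_offsets=True` alone reproduces the --positions offset line.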
Change-Id: I6a5f7e169d1ecb433de0c5f7168f9cd6f3930890
diff --git a/Readme.md b/Readme.md
index 4a3c88b..94c9850 100644
--- a/Readme.md
+++ b/Readme.md
@@ -41,9 +41,43 @@
## Documentation
The KorAP tokenizer reads from standard input and writes to standard output. It supports multiple modes of operation.
+#### Split into tokens
+```
+$ echo 'This is a sentence. This is a second sentence.' | java -jar target/KorAP-Tokenizer-2.0.0-standalone.jar
+This
+is
+a
+sentence
+.
+This
+is
+a
+second
+sentence
+.
+
+```
+#### Split into tokens and sentences
+```
+$ echo 'This is a sentence. This is a second sentence.' | java -jar target/KorAP-Tokenizer-2.0.0-standalone.jar -s
+This
+is
+a
+sentence
+.
+
+This
+is
+a
+second
+sentence
+.
+
+```
+
+#### Print token character offsets
With the `--positions` option, for example, the tokenizer prints all offsets of the first character of a token and the first character after a token.
In order to end a text, flush the output, and reset the character position, an EOT character (0x04) can be used.
-#### Command Line Invocation
```
$ echo -n -e 'This is a text.\x0a\x04\x0aAnd this is another text.\n\x04\n' |\
java -jar target/KorAP-Tokenizer-2.0.0-standalone.jar --positions
@@ -61,7 +95,7 @@
.
0 3 4 8 9 11 12 19 20 24 24 25
```
-#### Invocation with Sentence Splitting
+#### Print token and sentence offsets
```
echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x04\x0aAnd this is another text.' |\
java -jar target/KorAP-Tokenizer-2.0.0-standalone.jar --no-tokens --positions --sentence-boundaries
diff --git a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
index 7434d15..cfbe24e 100644
--- a/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
+++ b/src/main/jpc/jflex/de/ids_mannheim/korap/tokenizer/DerekoDfaTokenizer.jflex
@@ -138,10 +138,10 @@
while (!zzAtEOF) {
token = this.getNextToken();
if (atEOT) {
- if (echo) {
+ if (echo && printOffsets) {
printTokenPositions(list, splitSentences);
- list.clear();
}
+ list.clear();
atEOT = false;
}
if (token != null) {
@@ -285,6 +285,8 @@
outputStream.println("<span id=\"t_" + tokenId + "\" from=\"" + from + "\" to=\"" + to + "\"/>\n" + value);
} else if (echo && printTokens) {
outputStream.println(value);
+ if (splitSentences && isSentenceBound(normalizedValue))
+ outputStream.println("");
}
startOffset += lengthDiff;
tokenId++;
diff --git a/src/test/resources/other_test_data/test.de.latin1.01.tokens.txt b/src/test/resources/other_test_data/test.de.latin1.01.tokens.txt
index 9345c2a..3841111 100644
--- a/src/test/resources/other_test_data/test.de.latin1.01.tokens.txt
+++ b/src/test/resources/other_test_data/test.de.latin1.01.tokens.txt
@@ -103336,4 +103336,3 @@
XXXII
.
)
-
diff --git a/src/test/resources/other_test_data/test.de.utf8.01.tokens.txt b/src/test/resources/other_test_data/test.de.utf8.01.tokens.txt
index 75cf4cf..60b82db 100644
--- a/src/test/resources/other_test_data/test.de.utf8.01.tokens.txt
+++ b/src/test/resources/other_test_data/test.de.utf8.01.tokens.txt
@@ -74505,4 +74505,3 @@
XXXII
.
)
-
diff --git a/src/test/resources/other_test_data/test.en.ascii.01.tokens.txt b/src/test/resources/other_test_data/test.en.ascii.01.tokens.txt
index 8b99b99..e2585aa 100644
--- a/src/test/resources/other_test_data/test.en.ascii.01.tokens.txt
+++ b/src/test/resources/other_test_data/test.en.ascii.01.tokens.txt
@@ -6,15 +6,12 @@
a
text
.
-
This
is
approx.
text
number
2.
-
-
This
is
bspw
@@ -22,4 +19,3 @@
text
number4
.
-
diff --git a/src/test/resources/other_test_data/test.fr.utf8.01.tokens.txt b/src/test/resources/other_test_data/test.fr.utf8.01.tokens.txt
index 10c2843..0286db1 100644
--- a/src/test/resources/other_test_data/test.fr.utf8.01.tokens.txt
+++ b/src/test/resources/other_test_data/test.fr.utf8.01.tokens.txt
@@ -129,4 +129,3 @@
plaindre
plus
tard
-