Release v2.1.0
Change-Id: Ic414422b4d0da0265ee6486e343fbd105498d7ab
diff --git a/CHANGELOG.md b/CHANGELOG.md
index c48802a..7e1db6f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,19 +1,36 @@
# Changelog
+## 2.1.0
+
+* GitHub CI test workflow added
+* Dependencies updated
+* `-Xss2m` added to the Maven JVM config
+
+### Potentially breaking change
+
+* `--sentence-boundaries|-s` now prints sentence boundaries only if `--positions|-p` is also present (see the sketch below)
+
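+A minimal usage sketch of the new behavior (jar path as in the Readme examples; output omitted):
+
+```
+# with -s alone, sentence boundary offsets are no longer printed
+$ echo 'This is a sentence. This is a second sentence.' |\
+  java -jar target/KorAP-Tokenizer-2.1.0-standalone.jar -s
+
+# with -p (here combined with --no-tokens), token offsets and sentence boundary offsets are printed
+$ echo 'This is a sentence. This is a second sentence.' |\
+  java -jar target/KorAP-Tokenizer-2.1.0-standalone.jar -s -p --no-tokens
+```
+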
## 2.0.0
+
* Dependencies updated
* Tokenizer and sentence splitter for English (`-l en` option) added
* Tokenizer and sentence splitter for French (`-l fr` option) added
* Support for adding more languages
* `UTF-8` input encoding is now expected by default; different encodings can be set with the `--encoding <enc>` option
-* By default, tokens are now printed to stdout (use options `--no-tokens --positions` to print character offsets instead)
+* By default, tokens are now printed to stdout (use options `--no-tokens --positions` to print character offsets
+ instead)
* Abbreviated German street names like *Kunststr.* are now recognized as tokens
* Added heuristics for distinguishing between *I.* as an abbreviation vs. PPER/CARD
* URLs without a URI scheme are now recognized as single tokens if they start with `www.`
+
## 1.3
+
+* The standard EOT/EOF character `\x04` is used instead of the magic escape `\n\x03\n`
+
* Quoted email names containing space characters, like "John Doe"@xx.com, are no longer interpreted as single tokens
* Sentence splitter functionality added (`--sentence-boundaries` option)
+
## 1.2
+
* First version published on https://korap.ids-mannheim.de/gerrit/plugins/gitiles/KorAP/KorAP-Tokenizer
* Extracted from KorAP-internal ingestion pipeline
diff --git a/Readme.md b/Readme.md
index 2644bb8..c5e0b57 100644
--- a/Readme.md
+++ b/Readme.md
@@ -8,7 +8,7 @@
## DeReKo Tokenizer (included default implementation)
The included default implementation (`DerekoDfaTokenizer_de`) is a highly efficient [JFlex](https://www.jflex.de/)-based DFA tokenizer and sentence splitter with character offset output, suitable for German and other European languages.
It is used for the German Reference Corpus DeReKo. Being based on a finite state automaton,
-it is not accurate as language model based tokenizers, but with ~5 billion words per hour typically more efficient.
+it is not as accurate as language-model-based tokenizers, but with ~5 billion words per hour it is typically more efficient.
An important feature in the DeReKo/KorAP context is also that it reliably reports the character offsets of the tokens,
so that this information can be used to apply standoff annotations.
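+
+For example, token offsets for a whole file can be written to a separate file for later standoff processing (a sketch; the file names are placeholders, the options are used in the examples below):
+
+```
+$ java -jar target/KorAP-Tokenizer-2.1.0-standalone.jar --no-tokens --positions < corpus.txt > corpus.offsets
+```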
@@ -43,7 +43,7 @@
#### Split into tokens
```
-$ echo 'This is a sentence. This is a second sentence.' | java -jar target/KorAP-Tokenizer-2.0.0-standalone.jar
+$ echo 'This is a sentence. This is a second sentence.' | java -jar target/KorAP-Tokenizer-2.1.0-standalone.jar
This
is
a
@@ -59,7 +59,7 @@
```
#### Split into tokens and sentences
```
-$ echo 'This is a sentence. This is a second sentence.' | java -jar target/KorAP-Tokenizer-2.0.0-standalone.jar -s
+$ echo 'This is a sentence. This is a second sentence.' | java -jar target/KorAP-Tokenizer-2.1.0-standalone.jar -s
This
is
a
@@ -80,7 +80,7 @@
An EOT character (0x04) can be used to end a text, flush the output, and reset the character position.
```
$ echo -n -e 'This is a text.\x0a\x04\x0aAnd this is another text.\n\x04\n' |\
- java -jar target/KorAP-Tokenizer-2.0.0.9000-standalone.jar --positions
+ java -jar target/KorAP-Tokenizer-2.1.0-standalone.jar --positions
This
is
a
@@ -98,7 +98,7 @@
#### Print token and sentence offset
```
echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x04\x0aAnd this is another text.' |\
- java -jar target/KorAP-Tokenizer-2.0.0-standalone.jar --no-tokens --positions --sentence-boundaries
+ java -jar target/KorAP-Tokenizer-2.1.0-standalone.jar --no-tokens --positions --sentence-boundaries
1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28 29 32 33 37 38 40 41 42 43 51 51 54 55 58 59 63 64 67 68 72 72 76
1 28 29 54 55 76
0 3 4 8 9 11 12 19 20 24 24 25
diff --git a/pom.xml b/pom.xml
index 4808d5e..3ddf10c 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
<groupId>groupId</groupId>
<artifactId>KorAP-Tokenizer</artifactId>
- <version>2.0.0.9000</version>
+ <version>2.1.0</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
index 9a275c6..849900d 100644
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
@@ -14,7 +14,7 @@
import java.util.stream.Collectors;
@CommandLine.Command(mixinStandardHelpOptions = true,
- name = "koraptokenizer", version = "2.0.0", description = "Tokenizes (and sentence splits) text input.")
+ name = "koraptokenizer", version = "2.1.0", description = "Tokenizes (and sentence splits) text input.")
public class Main implements Callable<Integer> {
public final String DEFAULT_LANGUAGE = "de";