Prepare for v2.0.0 Change-Id: I6845bddca6ad966699cc029a43d1badb113d223a

commit: 755a150690e99688f4bd0d3d499fcc39183f0554 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Wed Oct 14 16:02:28 2020 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Tue Dec 08 12:22:33 2020 +0100
tree: 0893a093990e2fb1a2a064c961d627d2596a6836
parent: cf9b5f5cb448bcccc15d1bd505f4d4423a1ea3b3 [diff]
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..2dc68ea
--- /dev/null
+++ b/CHANGELOG.md

@@ -0,0 +1,18 @@
+# Changelog
+
+## 2.0.0 [Unreleased]
+* Tokenizer and sentence splitter for English (`-l en` option) added
+* Tokenizer and sentence splitter for French (`-l fr` option) added
+* Support for adding more languages
+* `UTF-8` input encoding is now expected by default; different encodings can be set with the `--encoding <enc>` option
+* By default, tokens are now printed to stdout (use options `--no-tokens --positions` to print character offsets instead)
+* Abbreviated German street names like *Kunststr.* are now recognized as tokens
+* Added heuristics for distinguishing between *I.* as abbreviation vs PPER / CARD
+* URLs without URI-scheme are now recognized as single tokens if they start with `www.`
+## 1.3
+* Standard EOT/EOF character x04 is used instead of magic escape \n\x03\n
+* Quoted email names containing space characters, like "John Doe"@xx.com, are no longer interpreted as single tokens
+* Sentence splitter functionality added (`--sentence-boundaries` option)
+## 1.2
+* First version published on https://korap.ids-mannheim.de/gerrit/plugins/gitiles/KorAP/KorAP-Tokenizer
+* Extracted from KorAP-internal ingestion pipeline

diff --git a/Readme.md b/Readme.md
index e7b1f78..4acdd12 100644
--- a/Readme.md
+++ b/Readme.md

@@ -45,16 +45,26 @@
 In order to end a text, flush the output and reset the character position, an EOT character (0x04) can be used.
 #### Command Line Invocation
 ```
-$ echo -n -e 'This is a text.\x0a\x03\x0aAnd this is another text.\n\x03\n' |\
-   java -jar target/KorAP-Tokenizer-1.3-SNAPSHOT.jar --positions
-
-0 4 5 7 8 9 10 15 
-0 3 4 8 9 11 12 19 20 25 
+$ echo -n -e 'This is a text.\x0a\x04\x0aAnd this is another text.\n\x04\n' |\
+     java -jar target/KorAP-Tokenizer-2.0.0-SNAPSHOT-standalone.jar  --positions
+This
+is
+a
+text
+.
+0 4 5 7 8 9 10 14 14 15
+And
+this
+is
+another
+text
+.
+0 3 4 8 9 11 12 19 20 24 24 25
 ```
 #### Invocation with Sentence Splitting
 ```
 echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x04\x0aAnd this is another text.'  |\
-   java -jar target/KorAP-Tokenizer-1.3-SNAPSHOT-standalone.jar --no-tokens --positions --sentence-boundaries
+   java -jar target/KorAP-Tokenizer-2.0.0-SNAPSHOT-standalone.jar --no-tokens --positions --sentence-boundaries
 1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28 29 32 33 37 38 40 41 42 43 51 51 54 55 58 59 63 64 67 68 72 72 76
 1 28 29 54 55 76
 0 3 4 8 9 11 12 19 20 24 24 25

diff --git a/pom.xml b/pom.xml
index 014690a..1dd6bbe 100644
--- a/pom.xml
+++ b/pom.xml

@@ -6,7 +6,7 @@
 
     <groupId>groupId</groupId>
     <artifactId>KorAP-Tokenizer</artifactId>
-    <version>1.3-SNAPSHOT</version>
+    <version>2.0.0-SNAPSHOT</version>
 
     <properties>
         <project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>

diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
index c76c189..c3aa101 100644
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java

@@ -14,7 +14,7 @@
 import java.util.stream.Collectors;
 
 @CommandLine.Command(mixinStandardHelpOptions = true,
-        name = "koraptokenizer", version = "{}", description = "Tokenizes (and sentence splits) text input.")
+        name = "koraptokenizer", version = "2.0.0-SNAPSHOT", description = "Tokenizes (and sentence splits) text input.")
 public class Main implements Callable<Integer> {
 
     public final String DEFAULT_LANGUAGE = "de";
commit	755a150690e99688f4bd0d3d499fcc39183f0554	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Oct 14 16:02:28 2020 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Dec 08 12:22:33 2020 +0100
tree	0893a093990e2fb1a2a064c961d627d2596a6836
parent	cf9b5f5cb448bcccc15d1bd505f4d4423a1ea3b3 [diff]