Prepare for v2.0.0
Change-Id: I6845bddca6ad966699cc029a43d1badb113d223a
diff --git a/CHANGELOG.md b/CHANGELOG.md
new file mode 100644
index 0000000..2dc68ea
--- /dev/null
+++ b/CHANGELOG.md
@@ -0,0 +1,18 @@
+# Changelog
+
+## 2.0.0 [Unreleased]
+* Tokenizer and sentence splitter for English (`-l en` option) added
+* Tokenizer and sentence splitter for French (`-l fr` option) added
+* Support for adding more languages
+* `UTF-8` input encoding is now expected by default; different encodings can be set by the `--encoding <enc>` option
+* By default, tokens are now printed to stdout (use options `--no-tokens --positions` to print character offsets instead)
+* Abbreviated German street names like *Kunststr.* are now recognized as tokens
+* Added heuristics for distinguishing between *I.* as abbreviation vs. PPER / CARD
+* URLs without URI-scheme are now recognized as single tokens if they start with `www.`
+## 1.3
+* Standard EOT/EOF character `\x04` is used instead of magic escape `\n\x03\n`
+* Quoted email names containing space characters, like "John Doe"@xx.com, are no longer interpreted as single tokens
+* Sentence splitter functionality added (`--sentence-boundaries` option)
+## 1.2
+* First version published on https://korap.ids-mannheim.de/gerrit/plugins/gitiles/KorAP/KorAP-Tokenizer
+* Extracted from KorAP-internal ingestion pipeline
diff --git a/Readme.md b/Readme.md
index e7b1f78..4acdd12 100644
--- a/Readme.md
+++ b/Readme.md
@@ -45,16 +45,26 @@
In order to end a text, flush the output and reset the character position, an EOT character (0x04) can be used.
#### Command Line Invocation
```
-$ echo -n -e 'This is a text.\x0a\x03\x0aAnd this is another text.\n\x03\n' |\
- java -jar target/KorAP-Tokenizer-1.3-SNAPSHOT.jar --positions
-
-0 4 5 7 8 9 10 15
-0 3 4 8 9 11 12 19 20 25
+$ echo -n -e 'This is a text.\x0a\x04\x0aAnd this is another text.\n\x04\n' |\
+ java -jar target/KorAP-Tokenizer-2.0.0-SNAPSHOT-standalone.jar --positions
+This
+is
+a
+text
+.
+0 4 5 7 8 9 10 14 14 15
+And
+this
+is
+another
+text
+.
+0 3 4 8 9 11 12 19 20 24 24 25
```
#### Invocation with Sentence Splitting
```
echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x04\x0aAnd this is another text.' |\
- java -jar target/KorAP-Tokenizer-1.3-SNAPSHOT-standalone.jar --no-tokens --positions --sentence-boundaries
+ java -jar target/KorAP-Tokenizer-2.0.0-SNAPSHOT-standalone.jar --no-tokens --positions --sentence-boundaries
1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28 29 32 33 37 38 40 41 42 43 51 51 54 55 58 59 63 64 67 68 72 72 76
1 28 29 54 55 76
0 3 4 8 9 11 12 19 20 24 24 25
diff --git a/pom.xml b/pom.xml
index 014690a..1dd6bbe 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
<groupId>groupId</groupId>
<artifactId>KorAP-Tokenizer</artifactId>
- <version>1.3-SNAPSHOT</version>
+ <version>2.0.0-SNAPSHOT</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
index c76c189..c3aa101 100644
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
@@ -14,7 +14,7 @@
import java.util.stream.Collectors;
@CommandLine.Command(mixinStandardHelpOptions = true,
- name = "koraptokenizer", version = "{}", description = "Tokenizes (and sentence splits) text input.")
+ name = "koraptokenizer", version = "2.0.0-SNAPSHOT", description = "Tokenizes (and sentence splits) text input.")
public class Main implements Callable<Integer> {
public final String DEFAULT_LANGUAGE = "de";