Bump version to 2.2.3, update Readme and Changelog
Change-Id: Ic4928596d72ce3f738a47f112d8064dc63324f56
diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4a78e7c..77a1ad9 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,5 +1,14 @@
# Changelog
+## 2.2.3
+
+* Updated dependencies
+* Minimum Java version raised to 17
+* Fixed group id in pom.xml
+* Removed compile dependency on Maven Surefire
+* Build artifacts in src/main/jflex are now ignored by git
+* java.io's ByteArrayOutputStream used instead of a third-party class
+
## 2.2.2
* Bug fix: a single quotation mark at the beginning of a word
diff --git a/Readme.md b/Readme.md
index 2d933b2..7fe7043 100644
--- a/Readme.md
+++ b/Readme.md
@@ -14,8 +14,8 @@
the tokenizers are potentially not as accurate as language model based ones, but with ~5 billion words per hour typically more efficient.
An important feature in the DeReKo/KorAP context is also that token character offsets can be reported, which can be used for applying standoff annotations.
-The include mplementations of the `KorapTokenizer` interface also implement the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
-and [`opennlp.tools.sentdetect.SentenceDetector`](https://opennlp.apache.org/docs/1.8.2/apidocs/opennlp-tools/opennlp/tools/sentdetect/SentenceDetector.html)
+The included implementations of the `KorapTokenizer` interface also implement the [`opennlp.tools.tokenize.Tokenizer`](https://opennlp.apache.org/docs/2.3.0/apidocs/opennlp-tools/opennlp/tools/tokenize/Tokenizer.html)
+and [`opennlp.tools.sentdetect.SentenceDetector`](https://opennlp.apache.org/docs/2.3.0/apidocs/opennlp-tools/opennlp/tools/sentdetect/SentenceDetector.html)
 interfaces and can thus be used as drop-in replacements in OpenNLP applications.
The underlying scanner is based on the Lucene scanner with modifications from [David Hall](https://github.com/dlwh).
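
As a quick illustration of the drop-in usage described in the Readme text above, the following sketch treats a single tokenizer instance as both an OpenNLP `Tokenizer` and `SentenceDetector`. This is a minimal sketch, not part of the commit: the concrete class name `DerekoDfaTokenizer_de` and its no-argument constructor are assumptions; substitute whichever `KorapTokenizer` implementation is available in the build.

```
// Sketch only: DerekoDfaTokenizer_de (and its no-arg constructor) is an assumed
// implementation class of the KorapTokenizer interface; adjust to the actual class.
import de.ids_mannheim.korap.tokenizer.DerekoDfaTokenizer_de;
import opennlp.tools.sentdetect.SentenceDetector;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.util.Span;

public class OpenNlpDropInExample {
    public static void main(String[] args) {
        String text = "Das ist ein Satz. Und das ist ein zweiter Satz.";

        // One instance serves both OpenNLP roles, per the Readme.
        DerekoDfaTokenizer_de korapTokenizer = new DerekoDfaTokenizer_de();
        Tokenizer tokenizer = korapTokenizer;
        SentenceDetector sentenceDetector = korapTokenizer;

        // Token strings, as with any OpenNLP Tokenizer.
        for (String token : tokenizer.tokenize(text)) {
            System.out.println(token);
        }

        // Sentence boundaries as character offsets (standoff-friendly).
        for (Span sentence : sentenceDetector.sentPosDetect(text)) {
            System.out.println(sentence.getStart() + "-" + sentence.getEnd()
                    + ": " + text.substring(sentence.getStart(), sentence.getEnd()));
        }
    }
}
```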
@@ -38,7 +38,7 @@
#### Split English text into tokens
```
-$ echo "It's working." | java -jar target/KorAP-Tokenizer-2.2.2-standalone.jar -l en
+$ echo "It's working." | java -jar target/KorAP-Tokenizer-2.2.3-standalone.jar -l en
It
's
working
@@ -47,7 +47,7 @@
#### Split French text into tokens and sentences
```
$ echo "C'est une phrase. Ici, il s'agit d'une deuxième phrase." \
- | java -jar target/KorAP-Tokenizer-2.2.2-standalone.jar -s -l fr
+ | java -jar target/KorAP-Tokenizer-2.2.3-standalone.jar -s -l fr
C'
est
une
@@ -72,7 +72,7 @@
In order to end a text, flush the output and reset the character position, an EOT character (0x04) can be used.
```
$ echo -n -e 'This is a text.\x0a\x04\x0aAnd this is another text.\n\x04\n' |\
- java -jar target/KorAP-Tokenizer-2.2.2-standalone.jar --positions
+ java -jar target/KorAP-Tokenizer-2.2.3-standalone.jar --positions
This
is
a
@@ -90,7 +90,7 @@
#### Print token and sentence offset
```
echo -n -e ' This ist a start of a text. And this is a sentence!!! But what the hack????\x0a\x04\x0aAnd this is another text.' |\
- java -jar target/KorAP-Tokenizer-2.2.2-standalone.jar --no-tokens --positions --sentence-boundaries
+ java -jar target/KorAP-Tokenizer-2.2.3-standalone.jar --no-tokens --positions --sentence-boundaries
1 5 6 9 10 11 12 17 18 20 21 22 23 27 27 28 29 32 33 37 38 40 41 42 43 51 51 54 55 58 59 63 64 67 68 72 72 76
1 28 29 54 55 76
0 3 4 8 9 11 12 19 20 24 24 25
@@ -111,7 +111,10 @@
* [Marc Kupietz](https://www.ids-mannheim.de/digspra/personal/kupietz.html)
* [Nils Diewald](https://www.ids-mannheim.de/digspra/personal/diewald.html)
-Copyright (c) 2021, [Leibniz Institute for the German Language](http://www.ids-mannheim.de/), Mannheim, Germany
+**Contributor**:
+* [Gregor Middell](https://github.com/gremid)
+
+Copyright (c) 2023, [Leibniz Institute for the German Language](http://www.ids-mannheim.de/), Mannheim, Germany
This package is developed as part of the [KorAP](http://korap.ids-mannheim.de/)
Corpus Analysis Platform at the Leibniz Institute for German Language
diff --git a/pom.xml b/pom.xml
index 66de8d1..7eae38d 100644
--- a/pom.xml
+++ b/pom.xml
@@ -6,7 +6,7 @@
<groupId>de.ids_mannheim.korap.tokenizer</groupId>
<artifactId>KorAP-Tokenizer</artifactId>
- <version>2.2.2</version>
+ <version>2.2.3</version>
<properties>
<project.build.sourceEncoding>UTF-8</project.build.sourceEncoding>
diff --git a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
index 8ce48f1..475a843 100644
--- a/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
+++ b/src/main/java/de/ids_mannheim/korap/tokenizer/Main.java
@@ -14,7 +14,7 @@
import java.util.stream.Collectors;
@CommandLine.Command(mixinStandardHelpOptions = true,
- name = "koraptokenizer", version = "2.2.1", description = "Tokenizes (and sentence splits) text input.")
+ name = "koraptokenizer", version = "2.2.3", description = "Tokenizes (and sentence splits) text input.")
public class Main implements Callable<Integer> {
public final String DEFAULT_LANGUAGE = "de";