Bump tokenizer to v2.4.0 and our version to 2.7.0
Change-Id: Idb2b7cc7fb615d4e33c13b04fa569ab25166da87
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index b4276d5..2dde3b7 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -25,7 +25,7 @@
- apk add --no-cache git
script:
- docker build -f Dockerfile -t korap/tei2korapxml:$VID-large --target tei2korapxml .
- - docker run --rm -v /var/run/docker.sock:/var/run/docker.sock mintoolkit/mint --crt-api-version 1.46 build --http-probe=false --exec='PERL5LIB=/tei2korapxml/lib /tei2korapxml/script/tei2korapxml -v || test $? -eq 2 && java -jar /tei2korapxml/share/KorAP-Tokenizer-2.3.0-standalone.jar -V' --include-path=/tei2korapxml/lib --include-path=/usr/local/share/perl5 --include-path=/usr/share/perl5 --include-path=/usr/lib/perl5 --tag korap/tei2korapxml:$VID --tag korap/tei2korapxml:latest korap/tei2korapxml:$VID-large || true
+ - docker run --rm -v /var/run/docker.sock:/var/run/docker.sock mintoolkit/mint --crt-api-version 1.46 build --http-probe=false --exec='PERL5LIB=/tei2korapxml/lib /tei2korapxml/script/tei2korapxml -v || test $? -eq 2 && java -jar /tei2korapxml/share/KorAP-Tokenizer-*-standalone.jar -V' --include-path=/tei2korapxml/lib --include-path=/usr/local/share/perl5 --include-path=/usr/share/perl5 --include-path=/usr/lib/perl5 --tag korap/tei2korapxml:$VID --tag korap/tei2korapxml:latest korap/tei2korapxml:$VID-large || true
- ARTIFACT=tei2korapxml-${VID}.tar.xz
- docker save korap/tei2korapxml:$VID | xz -T0 -M16G -9 > "$ARTIFACT"
artifacts:
diff --git a/Changes b/Changes
index 03ccb1d..5099a22 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,8 @@
+2.7.0 2026-03-03
+ - Upgrade KorAP-Tokenizer to v2.4.0
+ with fixes for soft hyphens, thousands separators, and
+ support for German sensitive spelling forms, separated by colons, slashes, and brackets.
+
2.6.2 2025-12-10
- Upgrade KorAP-Tokenizer to v2.3.0 (resolves issues with
gendersternchen after hyphens, emoji clusters, and Wikipedia templates).
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index 1aadc4c..be99b7f 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -4,7 +4,7 @@
use warnings;
use File::Share ':all';
-our $VERSION = '2.6.2';
+our $VERSION = '2.7.0';
my $MIN_JAVA_VERSION = 21;
use constant {
@@ -27,7 +27,7 @@
my $tokenizer_jar = dist_file(
'tei2korapxml',
- 'KorAP-Tokenizer-2.3.0-standalone.jar'
+ 'KorAP-Tokenizer-2.4.0-standalone.jar'
);
unless (-f $tokenizer_jar) {
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 5855406..f5aaa95 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -25,7 +25,7 @@
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
-our $VERSION = '2.6.2';
+our $VERSION = '2.7.0';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
diff --git a/share/KorAP-Tokenizer-2.3.0-standalone.jar b/share/KorAP-Tokenizer-2.3.0-standalone.jar
deleted file mode 100644
index fc4c135..0000000
--- a/share/KorAP-Tokenizer-2.3.0-standalone.jar
+++ /dev/null
Binary files differ
diff --git a/share/KorAP-Tokenizer-2.4.0-standalone.jar b/share/KorAP-Tokenizer-2.4.0-standalone.jar
new file mode 100644
index 0000000..0ea5606
--- /dev/null
+++ b/share/KorAP-Tokenizer-2.4.0-standalone.jar
Binary files differ