Bump KorAP-Tokenizer to v2.3.0 and our version to 2.6.2
Change-Id: I7345e72cd67326797ca574bbf5f63bc3fb34de44
diff --git a/.github/workflows/linux.yml b/.github/workflows/linux.yml
index 7334fa7..0ffd43a 100644
--- a/.github/workflows/linux.yml
+++ b/.github/workflows/linux.yml
@@ -20,6 +20,11 @@
with:
perl-version: ${{ matrix.perl }}
- run: perl -V
+ - name: Set up JDK 21
+ uses: actions/setup-java@v3
+ with:
+ java-version: '21'
+ distribution: 'temurin'
- name: Install dependencies
run: |
cpanm File::ShareDir::Install
diff --git a/Changes b/Changes
index 6b6d671..7added2 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,8 @@
+2.6.2 2025-12-10
+ - Upgrade KorAP-Tokenizer to v2.3.0 (fixes tokenization of gender
+ stars after hyphens, of emoji clusters, and of Wikipedia templates).
+ - Raise the minimum required Java version from 17 to 21.
+
2.6.1 2025-04-16
- Fix ASCII entity resolution.
- Make KorAP-Tokenizer heap size configurable via environment
diff --git a/Readme.pod b/Readme.pod
index c097fd2..8cb2375 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -83,7 +83,7 @@
In case everything went well, the C<tei2korapxml> tool will
be available on your command line immediately.
-Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
+Minimum requirement for L<KorAP::XML::TEI> is Perl 5.38.
=head1 OPTIONS
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index ac5aae8..1aadc4c 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -4,8 +4,8 @@
use warnings;
use File::Share ':all';
-our $VERSION = '2.6.1';
-my $MIN_JAVA_VERSION = 17;
+our $VERSION = '2.6.2';
+my $MIN_JAVA_VERSION = 21;
use constant {
WAIT_SECS => 30
@@ -27,7 +27,7 @@
my $tokenizer_jar = dist_file(
'tei2korapxml',
- 'KorAP-Tokenizer-2.2.5-standalone.jar'
+ 'KorAP-Tokenizer-2.3.0-standalone.jar'
);
unless (-f $tokenizer_jar) {
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 3e5b335..089b5da 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -25,7 +25,7 @@
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
-our $VERSION = '2.6.1';
+our $VERSION = '2.6.2';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
diff --git a/share/KorAP-Tokenizer-2.2.5-standalone.jar b/share/KorAP-Tokenizer-2.3.0-standalone.jar
similarity index 64%
rename from share/KorAP-Tokenizer-2.2.5-standalone.jar
rename to share/KorAP-Tokenizer-2.3.0-standalone.jar
index 5b5a8ea..fc4c135 100644
--- a/share/KorAP-Tokenizer-2.2.5-standalone.jar
+++ b/share/KorAP-Tokenizer-2.3.0-standalone.jar
Binary files differ
diff --git a/t/tokenization-korap.t b/t/tokenization-korap.t
index ae21638..be90c4a 100644
--- a/t/tokenization-korap.t
+++ b/t/tokenization-korap.t
@@ -109,4 +109,55 @@
$t = Test::XML::Loy->new($str);
$t->element_count_is('layer spanList span', 4);
+
+# Tests for issue #115: a gender star ("*innen") must stay inside its token, also after a hyphenated compound
+$string = "Die Serb*innen wie die Kosovo-Albaner*innen";
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('issue-115');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 5, 'Issue #115 - token count'); # Die | Serb*innen | wie | die | Kosovo-Albaner*innen
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 4, 'Issue #115 - Serb*innen from');
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 14, 'Issue #115 - Serb*innen to');
+$t->attr_is('layer spanList span:nth-child(5)', 'from', 23, 'Issue #115 - Kosovo-Albaner*innen from');
+$t->attr_is('layer spanList span:nth-child(5)', 'to', 43, 'Issue #115 - Kosovo-Albaner*innen to');
+
+# Tests for issue #114: the bracketed "[_EMOJI:{{...}}_]" placeholder must come out as one token spanning all 18 characters
+$string = "[_EMOJI:{{S|;)}}_]";
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('issue-114');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 1, 'Issue #114 - token count');
+$t->element_exists('layer spanList span:nth-child(1)[from="0"]', 'Issue #114 - EMOJI from'); # NOTE(review): "from" is matched via selector instead of attr_is - presumably because of the value 0; confirm
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 18, 'Issue #114 - EMOJI to');
+
+# Tests for issue #113: an emoji followed by a skin-tone modifier must be kept together as one token
+$string = "✊🏿"; # U+270A U+1F3FF
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('issue-113-1');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 1, 'Issue #113 - emoji modifier count');
+$t->element_exists('layer spanList span:nth-child(1)[from="0"]', 'Issue #113 - emoji modifier from');
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 2, 'Issue #113 - emoji modifier to'); # 2 = number of code points in the input
+
+$string = "\x{1F468}\x{200D}\x{1F468}\x{200D}\x{1F466}"; # man ZWJ man ZWJ boy - escapes keep the invisible U+200D joiners explicit
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('issue-113-2');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 1, 'Issue #113 - emoji ZWJ family 1 count'); # whole ZWJ sequence is one token
+$t->element_exists('layer spanList span:nth-child(1)[from="0"]', 'Issue #113 - emoji ZWJ family 1 from');
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 5, 'Issue #113 - emoji ZWJ family 1 to'); # 5 = three emoji + two joiners
+
+$string = "\x{1F468}\x{200D}\x{1F466}\x{200D}\x{1F466}"; # man ZWJ boy ZWJ boy - escapes keep the invisible U+200D joiners explicit
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('issue-113-3');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 1, 'Issue #113 - emoji ZWJ family 2 count'); # whole ZWJ sequence is one token
+$t->element_exists('layer spanList span:nth-child(1)[from="0"]', 'Issue #113 - emoji ZWJ family 2 from');
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 5, 'Issue #113 - emoji ZWJ family 2 to'); # 5 = three emoji + two joiners
+
done_testing;