Merge changes Id4185a86,Iaa0333c0,I85fb0618
* changes:
Sync version with KorAP-Tokenizer version
Add tests for clitics and contractions: French, English, German
Upgrade to KorAP-Tokenizer v2.2.0
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
index d614dfe..b0fadcd 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -20,7 +20,7 @@
my $tokenizer_jar = dist_file(
'tei2korapxml',
- 'KorAP-Tokenizer-2.0.0-standalone.jar'
+ 'KorAP-Tokenizer-2.2.0-standalone.jar'
);
diff --git a/script/tei2korapxml b/script/tei2korapxml
index fa62a27..2d1a6bf 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -29,7 +29,7 @@
1;
};
-our $VERSION = '1.01';
+our $VERSION = '2.2.0';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
diff --git a/share/KorAP-Tokenizer-2.0.0-standalone.jar b/share/KorAP-Tokenizer-2.0.0-standalone.jar
deleted file mode 100644
index dd7410d..0000000
--- a/share/KorAP-Tokenizer-2.0.0-standalone.jar
+++ /dev/null
Binary files differ
diff --git a/share/KorAP-Tokenizer-2.2.0-standalone.jar b/share/KorAP-Tokenizer-2.2.0-standalone.jar
new file mode 100644
index 0000000..9e45fa7
--- /dev/null
+++ b/share/KorAP-Tokenizer-2.2.0-standalone.jar
Binary files differ
diff --git a/t/tokenization-korap.t b/t/tokenization-korap.t
index 0ca0719..43583f3 100644
--- a/t/tokenization-korap.t
+++ b/t/tokenization-korap.t
@@ -1,6 +1,6 @@
use strict;
use warnings;
-use Test::More;
+use Test::More tests => 32;
use File::Basename 'dirname';
use File::Spec::Functions qw/catfile/;
use Test::XML::Loy;
@@ -80,4 +80,26 @@
$t->attr_is('layer spanList span:nth-child(3)', 'from', 13);
$t->attr_is('layer spanList span:nth-child(3)', 'to', 29);
$t->element_count_is('layer spanList span', 3);
+
+$string = "J'ai j'habite qu'il d'un jusqu'à Aujourd'hui D'accord Quelqu'un Presqu'île";
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 15);
+
+$string = "isn't I've we'll you're";
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 8);
+
+$string = "Lu'hafen W'schaft gibt's";
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->element_count_is('layer spanList span', 3);
+
done_testing;