Add -tk option to use the standard KorAP tokenizer
Change-Id: I992fe37463926c8ecbca933fbb709f8640d6fb93
diff --git a/Makefile.PL b/Makefile.PL
index 918ef02..25eae0a 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -3,6 +3,9 @@
use strict;
use warnings;
use ExtUtils::MakeMaker;
+use File::ShareDir::Install;
+
+install_share dist => 'share';
WriteMakefile(
NAME => 'tei2korapxml',
@@ -21,6 +24,7 @@
'Capture::Tiny' => '0.48'
},
PREREQ_PM => {
+ 'File::Share' => '0.25',
'XML::CompactTree::XS' => '0.03',
'XML::LibXML::Reader' => '2.0201',
'IO::Compress::Zip' => '2.091',
@@ -29,3 +33,6 @@
MIN_PERL_VERSION => '5.016',
EXE_FILES => ['script/tei2korapxml']
);
+
+package MY;
+use File::ShareDir::Install 'postamble';
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 9417efa..fb9c972 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -5,6 +5,7 @@
use Log::Any qw($log);
use IO::Select;
use IPC::Open2 qw(open2);
+use Encode qw(encode);
# This tokenizer starts an external process for
# tokenization. It writes the data to tokenize
@@ -22,9 +23,6 @@
sub new {
my ($class, $cmd, $sep) = @_;
- # e.g. 'java -cp '. join(':', '.', glob(dirname(__FILE__) . "/../target/*.jar")).
- # " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl"
-
unless ($cmd) {
$log->warn('Tokenizer not established');
return;
@@ -54,7 +52,7 @@
my ($self, $txt) = @_;
return unless $self->{pid};
my $out = $self->{chld_in};
- print $out $txt . $self->{sep};
+ print $out encode( "UTF-8", $txt ) . $self->{sep};
return $self;
};
@@ -128,8 +126,9 @@
if (defined $_ && $_ ne '') {
- # This warning is sometimes thrown, though not yet replicated in the test suite.
- # See the discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
+ # This warning is sometimes thrown, though not yet replicated
+ # in the test suite. See the discussion in gerrit (3123:
+ # Establish tokenizer object for external base tokenization)
# for further issues.
$log->warn("Extra output: $_");
}
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
new file mode 100644
index 0000000..b0ad51e
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -0,0 +1,55 @@
+package KorAP::XML::TEI::Tokenizer::KorAP;
+use base 'KorAP::XML::TEI::Tokenizer::External';
+use strict;
+use warnings;
+use File::Share ':all';
+
+# Tokenizer that runs the KorAP tokenizer jar shipped in the
+# distribution's share directory as an external process.
+
+use constant {
+  WAIT_SECS => 30
+};
+
+# Locate the java executable once, when the module is loaded.
+my $java = `sh -c 'command -v java'`;
+chomp $java;
+
+
+# Without a JVM the tokenizer cannot run. Returning a false value
+# from the file makes "require" fail, which callers can trap in eval.
+if ($java eq '') {
+  warn('No java executable found in PATH. ' . __PACKAGE__ . ' requires a JVM.');
+  return 0;
+};
+
+
+# Path to the bundled tokenizer jar, looked up with File::Share.
+my $tokenizer_jar = dist_file(
+  'tei2korapxml',
+  'KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar'
+);
+
+
+# Construct a new KorAP Tokenizer.
+# Returns the tokenizer object, or nothing if the parent
+# constructor returns nothing.
+sub new {
+  my $class = shift;
+
+  # NOTE(review): both paths are interpolated unquoted into the command
+  # line; a path containing whitespace would likely break the call.
+  # Confirm how the parent class executes the command before quoting.
+  my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions");
+
+  # The parent constructor may return nothing; blessing that would
+  # die with "Can't bless non-reference value".
+  return unless $self;
+
+  $self->{name} = 'korap';
+  $self->{sep} = "\x{04}\n";
+  return bless $self, $class;
+};
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 0546658..ab1975c 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -28,6 +28,10 @@
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
+eval {
+ require KorAP::XML::TEI::Tokenizer::KorAP;
+ 1;
+};
our $VERSION = '0.01';
@@ -39,6 +43,7 @@
"root|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
"input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
+ 'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
'use-intern-tokenization|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
@@ -76,13 +81,18 @@
#
## extern tokenization
-my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0;
+my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
+
# TODO:
# Read tokenizer call from configuration file.
# was 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
my $ext_tok;
if ($tokenizer_call) {
$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+ }
+
+ elsif ($tokenizer_korap) {
+ $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
};
my $_tok_file_ext = "tokens.xml";
##
@@ -1044,6 +1054,10 @@
Call an external tokenizer process, that will tokenize
a single line from STDIN and outputs one token per line.
+=item B<--tokenizer-korap|-tk>
+
+Use the standard KorAP/DeReKo tokenizer.
+
=item B<--use-intern-tokenization|-ti>
Tokenize the data using two embedded tokenizers,
diff --git a/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar b/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar
new file mode 100644
index 0000000..748b341
--- /dev/null
+++ b/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar
Binary files differ
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index bf948e8..4e8b2d0 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl
@@ -2,6 +2,7 @@
use strict;
use warnings;
use FindBin;
+use Encode;
BEGIN {
unshift @INC, "$FindBin::Bin/../../lib";
};
@@ -14,7 +15,7 @@
# Read lines from input and return boundaries
while (!eof(STDIN)) {
- my $line = <>;
+ my $line = decode_utf8(<>);
for my $text (split(/\n?\x{04}\n?/, $line)) {
$tok->tokenize($text);
print join(' ', $tok->boundaries), "\n";
diff --git a/t/tokenization-korap.t b/t/tokenization-korap.t
new file mode 100644
index 0000000..809dd45
--- /dev/null
+++ b/t/tokenization-korap.t
@@ -0,0 +1,69 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catfile/;
+use Test::XML::Loy;
+
+use FindBin;
+use utf8;
+
+# Skip the whole test file if the tokenizer module cannot be loaded.
+BEGIN {
+  eval {
+    require KorAP::XML::TEI::Tokenizer::KorAP;
+    1;
+  } or do {
+    plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used";
+  };
+}
+
+require_ok('KorAP::XML::TEI::Tokenizer::KorAP');
+
+my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new();
+
+# Token boundaries for a plain ASCII text
+$ext->tokenize("Der alte Mann");
+my $str = $ext->to_string('unknown');
+my $t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->element_count_is('layer spanList span', 3);
+
+# Boundaries are expected as character offsets for non-ASCII text
+$ext->tokenize("ging über die Straße");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 5);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 10);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->attr_is('layer spanList span:nth-child(4)', 'from', 14);
+$t->attr_is('layer spanList span:nth-child(4)', 'to', 20);
+$t->element_count_is('layer spanList span', 4);
+
+# Only the text before the separator is expected in the result
+$ext->reset;
+$ext->tokenize("Hu aha\x{04}\ndas ist cool");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 2);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
+$t->element_count_is('layer spanList span', 2);
+
+# Text containing quotation marks and an @-mention
+my $string = "Pluto.\" Eris-Entdecker Mike Brown, der im Kurznachrichtendienst Twitter unter \"\@plutokiller";
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(14)', 'from', 80);
+$t->attr_is('layer spanList span:nth-child(14)', 'to', 92);
+$t->element_count_is('layer spanList span', 14);
+
+done_testing;