Add -tk option to use the standard KorAP tokenizer

Change-Id: I992fe37463926c8ecbca933fbb709f8640d6fb93
diff --git a/Makefile.PL b/Makefile.PL
index 918ef02..25eae0a 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -3,6 +3,9 @@
 use strict;
 use warnings;
 use ExtUtils::MakeMaker;
+use File::ShareDir::Install;
+
+install_share dist => 'share';
 
 WriteMakefile(
   NAME         => 'tei2korapxml',
@@ -21,6 +24,7 @@
     'Capture::Tiny' => '0.48'
   },
   PREREQ_PM => {
+    'File::Share' => '0.25',
     'XML::CompactTree::XS'     => '0.03',
     'XML::LibXML::Reader' => '2.0201',
     'IO::Compress::Zip' => '2.091',
@@ -29,3 +33,6 @@
   MIN_PERL_VERSION => '5.016',
   EXE_FILES => ['script/tei2korapxml']
 );
+
+package MY;
+use File::ShareDir::Install 'postamble';
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 9417efa..fb9c972 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -5,6 +5,7 @@
 use Log::Any qw($log);
 use IO::Select;
 use IPC::Open2 qw(open2);
+use Encode qw(encode);
 
 # This tokenizer starts an external process for
 # tokenization. It writes the data to tokenize
@@ -22,9 +23,6 @@
 sub new {
   my ($class, $cmd, $sep) = @_;
 
-  # e.g. 'java  -cp '. join(':', '.', glob(dirname(__FILE__) . "/../target/*.jar")).
-  #      " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl"
-
   unless ($cmd) {
     $log->warn('Tokenizer not established');
     return;
@@ -54,7 +52,7 @@
   my ($self, $txt) = @_;
   return unless $self->{pid};
   my $out = $self->{chld_in};
-  print $out $txt . $self->{sep};
+  print $out encode( "UTF-8", $txt ) . $self->{sep};
   return $self;
 };
 
@@ -128,8 +126,9 @@
 
       if (defined $_ && $_ ne '') {
 
-        # This warning is sometimes thrown, though not yet replicated in the test suite.
-        # See the discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
+        # This warning is sometimes thrown, though not yet replicated
+        # in the test suite. See the discussion in gerrit (3123:
+        # Establish tokenizer object for external base tokenization)
         # for further issues.
         $log->warn("Extra output: $_");
       }
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
new file mode 100644
index 0000000..b0ad51e
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -0,0 +1,37 @@
+package KorAP::XML::TEI::Tokenizer::KorAP;
+use base 'KorAP::XML::TEI::Tokenizer::External';
+use strict;
+use warnings;
+use File::Share ':all';
+
+use constant {
+  WAIT_SECS => 30
+};
+
+my $java = `sh -c 'command -v java'`;
+chomp $java;
+
+
+if ($java eq '') {
+  warn('No java executable found in PATH. ' . __PACKAGE__ . ' requires a JVM.');
+  return 0;
+};
+
+
+my $tokenizer_jar = dist_file(
+  'tei2korapxml',
+  'KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar'
+);
+
+
+# Construct a new KorAP Tokenizer: runs the bundled jar ($tokenizer_jar) with the java found in PATH
+sub new {
+  my $class = shift;
+  my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions");
+  $self->{name} = 'korap';
+  $self->{sep} = "\x{04}\n"; # EOT (U+0004) plus newline; the parent class prints this after each text sent to the process
+  return bless $self, $class;
+};
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 0546658..ab1975c 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -28,6 +28,10 @@
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;
 
+eval {
+  require KorAP::XML::TEI::Tokenizer::KorAP;
+  1;
+};
 
 our $VERSION = '0.01';
 
@@ -39,6 +43,7 @@
   "root|r=s"  => \(my $_root_dir = '.'),  # name of root directory inside zip file
   "input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
   'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
+  'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
   'use-intern-tokenization|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
   'log|l=s' => \(my $log_level = 'notice'),
   'help|h'    => sub {
@@ -76,13 +81,18 @@
 #
 
 ## extern tokenization
-my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0;
+my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
+
   # TODO:
   #   Read tokenizer call from configuration file.
   #   was 'java  -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
   my $ext_tok;
   if ($tokenizer_call) {
     $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+  }
+
+  elsif ($tokenizer_korap) {
+    $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
   };
   my $_tok_file_ext  = "tokens.xml";
 ##
@@ -1044,6 +1054,10 @@
 Call an external tokenizer process, that will tokenize
 a single line from STDIN and outputs one token per line.
 
+=item B<--tokenizer-korap|-tk>
+
+Use the standard KorAP/DeReKo tokenizer.
+
 =item B<--use-intern-tokenization|-ti>
 
 Tokenize the data using two embedded tokenizers,
diff --git a/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar b/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar
new file mode 100644
index 0000000..748b341
--- /dev/null
+++ b/share/KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar
Binary files differ
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
index bf948e8..4e8b2d0 100644
--- a/t/cmd/tokenizer.pl
+++ b/t/cmd/tokenizer.pl
@@ -2,6 +2,7 @@
 use strict;
 use warnings;
 use FindBin;
+use Encode;
 BEGIN {
   unshift @INC, "$FindBin::Bin/../../lib";
 };
@@ -14,7 +15,7 @@
 
 # Read lines from input and return boundaries
 while (!eof(STDIN)) {
-  my $line = <>;
+  my $line = decode_utf8(<>);
   for my $text (split(/\n?\x{04}\n?/, $line)) {
     $tok->tokenize($text);
     print join(' ', $tok->boundaries), "\n";
diff --git a/t/tokenization-korap.t b/t/tokenization-korap.t
new file mode 100644
index 0000000..809dd45
--- /dev/null
+++ b/t/tokenization-korap.t
@@ -0,0 +1,66 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use File::Spec::Functions qw/catfile/;
+use Test::XML::Loy;
+
+use FindBin;
+use utf8;
+
+BEGIN {
+  eval {
+    require KorAP::XML::TEI::Tokenizer::KorAP;
+    1;
+  } or do {
+    plan skip_all => "KorAP::XML::TEI::Tokenizer::KorAP cannot be used";
+  };
+}
+
+require_ok('KorAP::XML::TEI::Tokenizer::KorAP');
+
+my $f = dirname(__FILE__);
+my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+my $ext = KorAP::XML::TEI::Tokenizer::KorAP->new();
+
+$ext->tokenize("Der alte Mann");
+my $str = $ext->to_string('unknown');
+my $t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->element_count_is('layer spanList span', 3);
+
+$ext->tokenize("ging über die Straße");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 5);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 10);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->attr_is('layer spanList span:nth-child(4)', 'from', 14);
+$t->attr_is('layer spanList span:nth-child(4)', 'to', 20);
+$t->element_count_is('layer spanList span', 4);
+
+$ext->reset;
+$ext->tokenize("Hu aha\x{04}\ndas ist cool");
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 2);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
+$t->element_count_is('layer spanList span', 2);
+
+my $string = "Pluto.\"  Eris-Entdecker Mike Brown, der im Kurznachrichtendienst Twitter unter \"\@plutokiller";
+$ext->reset;
+$ext->tokenize($string);
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(14)', 'from', 80);
+$t->attr_is('layer spanList span:nth-child(14)', 'to', 92);
+$t->element_count_is('layer spanList span', 14);
+done_testing;