Add -tk option to use the standard KorAP tokenizer

Change-Id: I992fe37463926c8ecbca933fbb709f8640d6fb93
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 9417efa..fb9c972 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -5,6 +5,7 @@
 use Log::Any qw($log);
 use IO::Select;
 use IPC::Open2 qw(open2);
+use Encode qw(encode);
 
 # This tokenizer starts an external process for
 # tokenization. It writes the data to tokenize
@@ -22,9 +23,6 @@
 sub new {
   my ($class, $cmd, $sep) = @_;
 
-  # e.g. 'java  -cp '. join(':', '.', glob(dirname(__FILE__) . "/../target/*.jar")).
-  #      " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl"
-
   unless ($cmd) {
     $log->warn('Tokenizer not established');
     return;
@@ -54,7 +52,7 @@
   my ($self, $txt) = @_;
   return unless $self->{pid};
   my $out = $self->{chld_in};
-  print $out $txt . $self->{sep};
+  print $out encode( "UTF-8", $txt ) . $self->{sep};
   return $self;
 };
 
@@ -128,8 +126,9 @@
 
       if (defined $_ && $_ ne '') {
 
-        # This warning is sometimes thrown, though not yet replicated in the test suite.
-        # See the discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
+        # This warning is sometimes thrown, though not yet replicated
+        # in the test suite. See the discussion in gerrit (3123:
+        # Establish tokenizer object for external base tokenization)
         # for further issues.
         $log->warn("Extra output: $_");
       }
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
new file mode 100644
index 0000000..b0ad51e
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -0,0 +1,37 @@
+package KorAP::XML::TEI::Tokenizer::KorAP;
+use base 'KorAP::XML::TEI::Tokenizer::External';
+use strict;
+use warnings;
+use File::Share ':all';
+
+use constant {
+  WAIT_SECS => 30
+};
+
+my $java = `sh -c 'command -v java'`;
+chomp $java;
+# $java is the chomped output of `command -v java`; if it is empty, warn and
+# return a false value from this file (NOTE(review): confirm load then fails).
+if ($java eq '') {
+  warn('No java executable found in PATH. ' . __PACKAGE__ . ' requires a JVM.');
+  return 0;
+};
+# dist_file() is imported from File::Share; presumably it resolves the JAR
+# inside the shared files of the 'tei2korapxml' distribution - TODO confirm.
+my $tokenizer_jar = dist_file(
+  'tei2korapxml',
+  'KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar'
+);
+
+
+# Build a tokenizer object that runs the KorAP tokenizer JAR via the parent.
+sub new {
+  my ($class) = @_;
+  my $cmd  = "$java -jar $tokenizer_jar --no-tokens --positions";
+  my $self = $class->SUPER::new($cmd);
+  @{$self}{qw(name sep)} = ('korap', "\x{04}\n");
+  return bless $self, $class;
+};
+
+
+1;