Add -tk option to use the standard KorAP tokenizer
Change-Id: I992fe37463926c8ecbca933fbb709f8640d6fb93
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 9417efa..fb9c972 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -5,6 +5,7 @@
use Log::Any qw($log);
use IO::Select;
use IPC::Open2 qw(open2);
+use Encode qw(encode);
# This tokenizer starts an external process for
# tokenization. It writes the data to tokenize
@@ -22,9 +23,6 @@
sub new {
my ($class, $cmd, $sep) = @_;
- # e.g. 'java -cp '. join(':', '.', glob(dirname(__FILE__) . "/../target/*.jar")).
- # " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl"
-
unless ($cmd) {
$log->warn('Tokenizer not established');
return;
@@ -54,7 +52,7 @@
my ($self, $txt) = @_;
return unless $self->{pid};
my $out = $self->{chld_in};
- print $out $txt . $self->{sep};
+ print $out encode( "UTF-8", $txt ) . $self->{sep};
return $self;
};
@@ -128,8 +126,9 @@
if (defined $_ && $_ ne '') {
- # This warning is sometimes thrown, though not yet replicated in the test suite.
- # See the discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
+ # This warning is sometimes thrown, though not yet replicated
+ # in the test suite. See the discussion in gerrit (3123:
+ # Establish tokenizer object for external base tokenization)
# for further issues.
$log->warn("Extra output: $_");
}
diff --git a/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
new file mode 100644
index 0000000..b0ad51e
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/KorAP.pm
@@ -0,0 +1,37 @@
+package KorAP::XML::TEI::Tokenizer::KorAP;
+# External tokenizer that runs the bundled KorAP-Tokenizer jar via java.
+use base 'KorAP::XML::TEI::Tokenizer::External';
+use strict;
+use warnings;
+use File::Share ':all';
+
+use constant {
+  WAIT_SECS => 30
+};
+
+# Locate java once at load time. Backticks return undef when the shell
+# cannot be started, so normalise that to '' before chomp and eq.
+my $java = `sh -c 'command -v java'`;
+$java = '' unless defined $java;
+chomp $java;
+
+if ($java eq '') {
+  warn('No java executable found in PATH. ' . __PACKAGE__ . ' requires a JVM.');
+  return 0; # a false file value makes require/use of this module fail
+};
+
+my $tokenizer_jar = dist_file(
+  'tei2korapxml',
+  'KorAP-Tokenizer-1.3-SNAPSHOT-6cc760f-standalone.jar'
+);
+
+# Construct a new KorAP Tokenizer
+sub new {
+  my $class = shift;
+  my $self = $class->SUPER::new("$java -jar $tokenizer_jar --no-tokens --positions");
+  $self->{name} = 'korap';
+  $self->{sep} = "\x{04}\n"; # EOT + newline, appended after each text sent
+  return bless $self, $class;
+};
+
+1;