Establish tokenizer object for external base tokenization
Change-Id: Ie69c280042da5125e0934c87ccaad88b0be5494f
diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
new file mode 100644
index 0000000..8cfa0cf
--- /dev/null
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm
@@ -0,0 +1,165 @@
+package KorAP::XML::TEI::Tokenizer::External;
+use base 'KorAP::XML::TEI::Tokenizer';
+use strict;
+use warnings;
+use IO::Select;
+use IPC::Open2 qw(open2);
+
+# This tokenizer starts an external process for
+# tokenization. It writes the data to tokenize
+# to STDIN and reads boundary data from STDOUT.
+
+use constant {
+ WAIT_SECS => 3600
+};
+
+
+# Create a tokenizer bound to an external process.
+# Expects the command to call the external tokenizer and,
+# optionally, a character sequence indicating the end of
+# an input. Warns and returns nothing without a command.
+sub new {
+  my ($class, $cmd, $sep) = @_;
+
+  # e.g. 'java -cp '. join(':', '.', glob(dirname(__FILE__) . "/../target/*.jar")).
+  #      " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl"
+
+  # Without a command no process can be started
+  if (!$cmd) {
+    print STDERR "WARNING: tokenizer not established.\n";
+    return;
+  };
+
+  # Sequence sent after each text to separate inputs
+  # TODO: needs to be explored further ...
+  #   '\x03' produces a warning in 't/tokenization-external.t' (WARNING: extra output: 0 1)
+  #   - see discussion in gerrit (3123: Establish tokenizer object for external base tokenization)
+  #   an empty $sep leads to a blocking situation inside t/cmd/tokenizer.pl (right before the while-loop)
+  #$sep //= "\n\x03\n";
+  $sep = "\n" unless defined $sep;
+
+  # Handles, pid and select stay unset until _init binds them
+  my %state = (
+    cmd     => $cmd,  sep      => $sep,
+    chld_in => undef, chld_out => undef,
+    pid     => undef, select   => undef,
+  );
+  my $self = bless \%state, $class;
+
+  # Start the external process
+  $self->_init;
+  return $self;
+};
+
+
+# Send a text, followed by the separator, to the external process
+sub tokenize {
+  my ($self, $txt, $offset) = @_;
+  return unless $self->{pid};
+  # $offset is accepted but not used by this tokenizer
+  print {$self->{chld_in}} $txt . $self->{sep};
+};
+
+
+# Start the external process and bind the communication channels
+sub _init {
+  my $self = shift;
+
+  # open2() raises an exception instead of returning false on failure,
+  # so trap it to make the warning branch below reachable
+  my $pid = eval { open2($self->{chld_out}, $self->{chld_in}, $self->{cmd}) };
+
+  if ($self->{pid} = $pid) {
+    $self->{select} = IO::Select->new;
+    $self->{select}->add(*{$self->{chld_out}});
+  }
+  else {
+    # Never poll the handles of a former process after a failed restart
+    $self->{select} = undef;
+    print STDERR "WARNING: tokenizer can't be started.\n";
+  };
+};
+
+
+# Restart the external process by closing the current
+# channels and binding new ones. Returns the tokenizer object.
+sub reset {
+  my ($self) = @_;
+  $self->close;
+  $self->_init;
+  return $self;
+};
+
+
+# Return the token boundaries read from the external process as an
+# XML string. Returns nothing without a text ID, '' without a process.
+sub to_string {
+  my ($self, $text_id) = @_;
+
+  unless ($text_id) {
+    warn 'Missing textID';
+    return;
+  };
+
+  return '' unless $self->{select};
+
+  # Start header
+  my $output = $self->_header($text_id);
+
+  # TODO:
+  #   Escape the stringification of cmd.
+  $output .= ' <!-- ' . $self->{cmd} . " -->\n";
+
+  # Wait up to WAIT_SECS (60m) for the external tokenizer
+  unless ($self->{select}->can_read(WAIT_SECS)) {
+    die "ERROR ($0): cannot retrieve token bounds from external tokenizer for text '$text_id' => Aborting ...\n";
+  };
+
+  # Read into lexicals, so the caller's $_ is not clobbered.
+  # An undefined line (EOF) simply yields no bounds.
+  my $out    = $self->{chld_out};
+  my $line   = <$out>;
+  my @bounds = defined $line ? split(' ', $line) : ();
+
+  # Serialize all bounds as pairs of from/to offsets
+  my $c = 0;
+  for (my $i = 0; $i < @bounds; $i += 2 ){
+    $output .= qq! <span id="t_$c" from="! . $bounds[$i] . '" to="' .
+      $bounds[$i+1] . qq!" />\n!;
+    $c++;
+  };
+
+  # Report any unexpected further output
+  while ($self->{select}->can_read(0)) {
+    my $extra = <$out>;
+    if (defined $extra && $extra ne '') {
+      print STDERR "WARNING: extra output: $extra\n"
+    }
+    else {
+      print STDERR "WARNING: tokenizer seems to have crashed, restarting.\n";
+      $self->reset;
+      last; # $out is closed by now, reading on could restart endlessly
+    };
+  };
+
+  # Add footer
+  return $output . $self->_footer;
+};
+
+
+# Close the communication channel (only handles that were opened) and reap the child
+sub close {
+  my $self = shift;
+  CORE::close($self->{chld_in})  if defined $self->{chld_in};
+  CORE::close($self->{chld_out}) if defined $self->{chld_out};
+  $self->{chld_out} = $self->{chld_in} = undef;
+
+  # Wait for the child process if it is still registered
+  if ($self->{pid}) {
+    waitpid $self->{pid}, 0;
+    $self->{pid} = undef;
+  };
+};
+
+
+1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 359fea3..671c26e 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -6,15 +6,12 @@
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);
-use IO::Handle;
-use IO::Select;
use open qw(:std :utf8); # assume utf-8 encoding
use Encode qw(encode_utf8 decode_utf8);
use XML::CompactTree::XS;
use XML::LibXML::Reader;
-use IPC::Open2 qw(open2);
use FindBin;
BEGIN {
@@ -22,6 +19,7 @@
};
use KorAP::XML::TEI;
+use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
@@ -35,7 +33,8 @@
GetOptions(
"root|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
"input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
- 'help|h' => sub {
+ 'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
+ 'help|h' => sub {
pod2usage(
-verbose => 99,
-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
@@ -76,19 +75,26 @@
# ~~~ constants ~~~
#
-## DEPRECATED (only IDS-intern - the tokenization is normally done by external tools)
-my $_GEN_TOK_BAS = 0; # IDS internal tokenization
- my( $chld_out, $chld_in, $pid, $select );
+my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0; # (used for IDS internal tokenization)
+
+ # TODO:
+ # Read tokenizer call from configuration file.
+ # was 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
+ my $ext_tok;
+ if ($tokenizer_call) {
+ $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+ };
+ my $_tok_file_ext = "tokens.xml";
##
-## dummy tokenization (only for testing)
-my $_GEN_TOK_DUMMY = 1; # use dummy base tokenization for testing (base tokenization is normally done by external tools)
+## internal tokenization
+my $_GEN_TOK_INT = 1; # this simple tokenization can be used for testing (base tokenization is normally done by external tools)
my $_tok_file_con = "tokens_conservative.xml";
my $_tok_file_agg = "tokens_aggressive.xml";
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my ( $txt, $offset );
-my $_base_tokenization_dir = "base"; # name of directory for storing files of dummy tokenization (only used in func. select_tokenization)
+my $_tok_dir = "base"; # name of directory for storing tokenization files
my $_DEBUG = 0; # set to 1 for minimal more debug output (no need to be parametrized)
my $_XCT_LN = 0; # only for debugging: include line numbers in elements of $tree_data
@@ -185,10 +191,6 @@
my ( $i, $c ); # index variables used in loops
-## DEPRECATED (only IDS-intern)
-my $_tok_file_bas = "tokens.xml";
-##
-
my ( $_CORP_HEADER_END, $_DOC_HEADER_END, $_TEXT_HEADER_END );
@@ -232,10 +234,6 @@
$data_sfx = "</text>\n</raw_text>";
-## DEPRECATED (only IDS-intern)
-startTokenizer() if $_GEN_TOK_BAS;
-##
-
# ~ read input and write output (text by text) ~
process();
@@ -289,7 +287,7 @@
# ~ end of text body ~
- # write data.xml, structure.xml and evtl. morpho.xml and/or the dummy tokenization files (s.a.: $_tok_file_con and $_tok_file_agg)
+ # write data.xml, structure.xml and possibly morpho.xml and/or tokenization files (see also: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
$pfx = $1; $sfx = $2;
@@ -333,13 +331,11 @@
$data = encode_utf8( $data );
- ## DEPRECATED (only IDS-intern)
- # first write it to tokenization pipe to give it some time
- if ( $_GEN_TOK_BAS ){
- print $chld_in "$data\n\x03\n";
+ if ( $_GEN_TOK_EXT ){
+ # TODO: $offset is only necessary for $cons_tok and $aggr_tok and as long as they're part of 'retr_info'
+ $ext_tok->tokenize($data, $offset);
}
- ##
-
+
print STDERR "DEBUG ($0): main(): Writing (utf8-formatted) xml file $_root_dir$dir/$_data_file\n" if $_DEBUG;
@@ -358,13 +354,13 @@
write_tokens() if $_TOKENS_PROC && @tokens;
- # ~ dummy tokenization ~
+ # ~ tokenization ~
- if ( $_GEN_TOK_BAS || $_GEN_TOK_DUMMY ){ ## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
+ if ( $_GEN_TOK_EXT || $_GEN_TOK_INT ){
select_tokenization();
- if ( $_GEN_TOK_DUMMY ){
+ if ( $_GEN_TOK_INT ){
$offset = 0;
$aggr_tok->reset;
$cons_tok->reset;
@@ -661,12 +657,9 @@
$zipper->close;
- ## DEPRECATED (only IDS-intern)
- if( $_GEN_TOK_BAS ){
- close($chld_in);
- close($chld_out);
+ if( $_GEN_TOK_EXT ){
+ $ext_tok->close;
}
- ##
} # end: sub process
@@ -1000,16 +993,17 @@
#~~~~~
- # from here (until end): dummy tokenization
+ # from here (until end): internal tokenization
#~~~~~
- if ( $_GEN_TOK_DUMMY ){
+ if ( $_GEN_TOK_INT ){
$txt = $e->[1];
if ( substr( $txt, 0, 1 ) ne ' ' || substr( $txt, 1, 1) ne ' ' ){ # $txt has at least 2 chars, if it's not empty or equal to ' '
+ # TODO: implement outside retr_info() (like $ext_tok) on whole $data, instead on every text-node (more efficient and $offset not needed anymore)
$cons_tok->tokenize($txt, $offset);
$aggr_tok->tokenize($txt, $offset);
@@ -1017,7 +1011,7 @@
} # fi
- } # fi: $_GEN_TOK_DUMMY
+ } # fi: $_GEN_TOK_INT
#elsif ( $e->[0] == XML_READER_TYPE_ATTRIBUTE ) # attribute node
@@ -1038,35 +1032,22 @@
#print STDERR "$0: select_tokenization() ...\n";
- ## DEPRECATED (only IDS-intern)
- if( $_GEN_TOK_BAS ) {
- if( $select->can_read(3600) ){ # wait 60m for external tokenizer
- $_ = <$chld_out>;
- my @bounds = split;
- write_tokenization("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_bas", $text_id_esc, \@bounds);
- while($select->can_read(0)) {
- $_ = <$chld_out>;
- if (defined $_ && $_ ne '') {
- print STDERR "WARNING: extra output: $_\n"
- } else {
- print STDERR "WARNING: tokenizer seems to have crashed, restarting.\n";
- startTokenizer();
- }
- }
- }else{
- $zipper->close;
- die "ERROR ($0): cannot retrieve token bounds from external tokenizer for text '$text_id' => Aborting ...\n";
- }
- ##
- }elsif( $_GEN_TOK_DUMMY ){
+ if( $_GEN_TOK_EXT ) {
+
+ $ext_tok->to_zip(
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_ext"),
+ $text_id_esc
+ );
+
+ }elsif( $_GEN_TOK_INT ){
# Output token streams to zip streams
$cons_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_con"),
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_con"),
$text_id_esc
);
$aggr_tok->to_zip(
- $zipper->new_stream("$_root_dir$dir/$_base_tokenization_dir/$_tok_file_agg"),
+ $zipper->new_stream("$_root_dir$dir/$_tok_dir/$_tok_file_agg"),
$text_id_esc
);
}
@@ -1076,30 +1057,6 @@
} # end: select_tokenization
-sub write_tokenization { # called from select_tokenization()
-
- my ( $fname, $textid_esc, $bounds ) = @_;
-
- $output = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n<?xml-model href=\"span.rng\" type=\"application/xml\""
- ." schematypens=\"http://relaxng.org/ns/structure/1.0\"?>\n\n<layer docid=\"$text_id_esc\" xmlns=\"http://ids-mannheim.de/ns/KorAP\""
- ." version=\"KorAP-0.4\">\n <spanList>\n";
-
- $c = 0;
-
- for( $i = 0; $i < ($#$bounds + 1); $i += 2 ){
-
- $output .= " <span id=\"t_$c\" from=\"".$bounds->[$i]."\" to=\"".$bounds->[$i+1]."\" />\n";
-
- $c++;
- }
-
- $output .= " </spanList>\n</layer>";
-
- $zipper->new_stream($fname)->print($output);
-
-} # end: sub write_tokenization
-
-
sub write_structures { # called from process()
# ~ write @structures ~
@@ -1279,14 +1236,6 @@
} # end: sub write_tokens
-## DEPRECATED ($_GEN_TOK_BAS: only IDS-intern)
-sub startTokenizer {
- $pid = open2($chld_out, $chld_in, 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar"))." de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl");
- $select = IO::Select->new();
- $select->add(*$chld_out);
-}
-##
-
__END__
=pod
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
new file mode 100644
index 0000000..e484160
--- /dev/null
+++ b/t/cmd/tokenizer.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../../lib";
+};
+use KorAP::XML::TEI::Tokenizer::Aggressive;
+
+use open qw(:std :utf8); # assume utf-8 encoding
+
+$| = 1;
+
+# Init tokenizer
+my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+
+# Read lines from STDIN (not via <>, which would read files named
+# in @ARGV instead) and print the token boundaries of each line
+while (defined(my $line = <STDIN>)) {
+  $tok->tokenize($line);
+  print join(' ', $tok->boundaries), "\n";
+  $tok->reset;
+};
+
+1;
diff --git a/t/script.t b/t/script.t
index 3ac91d1..5010789 100644
--- a/t/script.t
+++ b/t/script.t
@@ -32,6 +32,8 @@
my $outzip = tmpnam();
# Generate zip file (unportable!)
+# TODO:
+# Call with aggressive and conservative tokenizations!
stderr_like(
sub { `cat '$file' | perl '$script' > '$outzip'` },
qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
@@ -142,7 +144,37 @@
# Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');
-# Read GOE/AGA/00000/base/tok.xml
+$tokens_xml = '';
+$tokens_xml .= $zip->getline while !$zip->eof;
+ok($zip->close, 'Closed');
+
+$t = Test::XML::Loy->new($tokens_xml);
+$t->attr_is('spanList span:nth-child(1)', 'to', 8);
+
+$t->attr_is('spanList span#t_1', 'from', 9);
+$t->attr_is('spanList span#t_1', 'to', 11);
+
+$t->attr_is('spanList span#t_67', 'from', 427);
+$t->attr_is('spanList span#t_67', 'to', 430);
+
+$t->attr_is('spanList span#t_214', 'from', 1209);
+$t->attr_is('spanList span#t_214', 'to', 1212);
+
+$t->element_count_is('spanList span', 227);
+
+# Tokenize with external tokenizer
+my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+stderr_like(
+ sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip'` },
+ qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
+ 'Processing'
+);
+
+# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+
+# Read GOE/AGA/00000/base/tokens.xml
$tokens_xml = '';
$tokens_xml .= $zip->getline while !$zip->eof;
ok($zip->close, 'Closed');
diff --git a/t/tokenization-external.t b/t/tokenization-external.t
new file mode 100644
index 0000000..e867aed
--- /dev/null
+++ b/t/tokenization-external.t
@@ -0,0 +1,51 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use Data::Dumper;
+use File::Spec::Functions qw/catfile/;
+use File::Temp 'tempfile';
+use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+ unshift @INC, "$FindBin::Bin/../lib";
+};
+
+require_ok('KorAP::XML::TEI::Tokenizer::External');
+
+my $f = dirname(__FILE__);
+my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+# Test aggressive
+my $ext = KorAP::XML::TEI::Tokenizer::External->new(
+ 'perl ' . $cmd
+ # 'java -cp Ingestion/target/KorAP-Ingestion-pipeline.jar de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl'
+);
+
+$ext->tokenize("Der alte Mann");
+# TODO:
+# see comments on $sep in 'lib/KorAP/XML/TEI/Tokenizer/External.pm'
+#$ext->tokenize("ging über die Straße");
+
+my $str = $ext->to_string('unknown');
+my $t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 4);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 9);
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->element_count_is('layer spanList span', 3);
+
+$ext->reset;
+$ext->tokenize("Hu aha\ndas ist cool");
+# Only the boundaries of the first line ("Hu aha") are checked below
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 2);
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 3);
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
+$t->element_count_is('layer spanList span', 2);
+
+
+done_testing;