Establish tokenizer object for external base tokenization

Change-Id: Ie69c280042da5125e0934c87ccaad88b0be5494f
diff --git a/t/cmd/tokenizer.pl b/t/cmd/tokenizer.pl
new file mode 100644
index 0000000..e484160
--- /dev/null
+++ b/t/cmd/tokenizer.pl
@@ -0,0 +1,25 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../../lib";
+};
+use KorAP::XML::TEI::Tokenizer::Aggressive;
+
+use open qw(:std :utf8); # assume utf-8 encoding
+
+$| = 1; # flush STDOUT after every print
+
+# Init tokenizer
+my $tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+
+# Answer every line read from STDIN with one line of space separated boundaries.
+# Read from STDIN explicitly: '<>' would read files named in @ARGV instead.
+while (defined(my $line = <STDIN>)) {
+  $tok->tokenize($line);
+  print join(' ', $tok->boundaries), "\n";
+  $tok->reset;
+};
+
+1;
diff --git a/t/script.t b/t/script.t
index 3ac91d1..5010789 100644
--- a/t/script.t
+++ b/t/script.t
@@ -32,6 +32,8 @@
 my $outzip = tmpnam();
 
 # Generate zip file (unportable!)
+# TODO:
+#   Call with aggressive and conservative tokenizations!
 stderr_like(
   sub { `cat '$file' | perl '$script' > '$outzip'` },
   qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
@@ -142,7 +144,37 @@
 # Uncompress GOE/AGA/00000/base/tokens_conservative.xml from zip file
 $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');
 
-# Read GOE/AGA/00000/base/tok.xml
+$tokens_xml = '';
+$tokens_xml .= $zip->getline while !$zip->eof; # read the opened zip member completely
+ok($zip->close, 'Closed');
+
+$t = Test::XML::Loy->new($tokens_xml);
+$t->attr_is('spanList span:nth-child(1)', 'to', 8); # 'to' value of the first span
+
+$t->attr_is('spanList span#t_1', 'from', 9); # 'from'/'to' of selected spans, addressed by id
+$t->attr_is('spanList span#t_1', 'to', 11);
+
+$t->attr_is('spanList span#t_67', 'from', 427);
+$t->attr_is('spanList span#t_67', 'to', 430);
+
+$t->attr_is('spanList span#t_214', 'from', 1209);
+$t->attr_is('spanList span#t_214', 'to', 1212);
+
+$t->element_count_is('spanList span', 227); # total number of spans in the span list
+
+# Convert again, passing cmd/tokenizer.pl (below $f) as external tokenizer via --tc
+my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+stderr_like(
+  sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip'` },
+  qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
+  'Processing'
+);
+
+# Uncompress GOE/AGA/00000/base/tokens.xml from zip file
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+
+# Read GOE/AGA/00000/base/tokens.xml
 $tokens_xml = '';
 $tokens_xml .= $zip->getline while !$zip->eof;
 ok($zip->close, 'Closed');
diff --git a/t/tokenization-external.t b/t/tokenization-external.t
new file mode 100644
index 0000000..e867aed
--- /dev/null
+++ b/t/tokenization-external.t
@@ -0,0 +1,51 @@
+use strict;
+use warnings;
+use Test::More;
+use File::Basename 'dirname';
+use Data::Dumper;
+use File::Spec::Functions qw/catfile/;
+use File::Temp 'tempfile';
+use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
+
+require_ok('KorAP::XML::TEI::Tokenizer::External');
+
+my $f = dirname(__FILE__); # directory of this test file
+my $cmd = catfile($f, 'cmd', 'tokenizer.pl'); # tokenizer script in the 'cmd' subdirectory
+
+# Test the external tokenizer with the script wrapping the aggressive tokenizer
+my $ext = KorAP::XML::TEI::Tokenizer::External->new(
+  'perl ' . $cmd
+  #  'java -cp Ingestion/target/KorAP-Ingestion-pipeline.jar de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl'
+);
+
+$ext->tokenize("Der alte Mann");
+# TODO:
+#   see comments on $sep in 'lib/KorAP/XML/TEI/Tokenizer/External.pm'
+#$ext->tokenize("ging über die Straße");
+
+my $str = $ext->to_string('unknown');
+my $t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 3); # 'Der'
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 4); # 'alte'
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 8);
+$t->attr_is('layer spanList span:nth-child(3)', 'from', 9); # 'Mann'
+$t->attr_is('layer spanList span:nth-child(3)', 'to', 13);
+$t->element_count_is('layer spanList span', 3);
+
+$ext->reset;
+$ext->tokenize("Hu aha\ndas ist cool"); # input containing a line break
+
+$str = $ext->to_string('unknown');
+$t = Test::XML::Loy->new($str);
+$t->attr_is('layer spanList span:nth-child(1)', 'to', 2); # 'Hu'
+$t->attr_is('layer spanList span:nth-child(2)', 'from', 3); # 'aha'
+$t->attr_is('layer spanList span:nth-child(2)', 'to', 6);
+$t->element_count_is('layer spanList span', 2); # NOTE(review): only the first line yields spans - presumably the $sep TODO; confirm
+
+
+done_testing;