Allow specifying both tokenizations (external and internal)

Change-Id: I2e3ff83e5122f803c5e4a18a0c1b89b93269d444
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 2b2c6da..9ed2ae3 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -355,7 +355,9 @@
 
           $ext_tok->tokenize($data);
 
-        } elsif ( $_GEN_TOK_INT ){
+        }
+
+        if ( $_GEN_TOK_INT ){
 
           $cons_tok->tokenize($data);
           $aggr_tok->tokenize($data);
@@ -390,7 +392,9 @@
             $text_id_esc
           )
 
-        } elsif ( $_GEN_TOK_INT ){
+        }
+
+        if ( $_GEN_TOK_INT ){
 
           # Output token streams to zip streams
           $cons_tok->to_zip(
diff --git a/t/script.t b/t/script.t
index 8d17be6..f7f9468 100644
--- a/t/script.t
+++ b/t/script.t
@@ -7,6 +7,11 @@
 use Test::More;
 use Test::Output;
 use Test::XML::Loy;
+
+use FindBin;
+BEGIN {
+  unshift @INC, "$FindBin::Bin/../lib";
+};
 use Test::KorAP::XML::TEI qw!korap_tempfile!;
 
 my $f = dirname(__FILE__);
@@ -129,6 +134,8 @@
 $t = Test::XML::Loy->new($struct_xml);
 $t->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content');
 
+$zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+ok(!$zip, 'External not generated');
 
 # Uncompress GOE/AGA/00000/base/tokens_aggressive.xml from zip file
 $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');
@@ -174,19 +181,22 @@
 
 $t->element_count_is('spanList span', 227);
 
+
 # Tokenize with external tokenizer
 my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
 
 my ($fh2, $outzip2) = korap_tempfile('script_out2');
 
 stderr_like(
-  sub { `cat '$file' | perl '$script' --tc='perl $cmd' > '$outzip2'` },
+  sub { `cat '$file' | perl '$script' -tc='perl $cmd' > '$outzip2'` },
   qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
   'Processing'
 );
 
 # Uncompress GOE/AGA/00000/base/tokens.xml from zip file
 $zip = IO::Uncompress::Unzip->new($outzip2, Name => 'GOE/AGA/00000/base/tokens.xml');
+ok($zip, 'Found');
+ok(!$zip->eof, 'Readable');
 
 # Read GOE/AGA/00000/base/tokens.xml
 $tokens_xml = '';
@@ -208,10 +218,6 @@
 $t->element_count_is('spanList span', 227);
 
 
-
-# TODO: call $script with approp. parameter for internal tokenization (actual: '$_GEN_TOK_INT = 1' hardcoded)
-
-
 my ($fh3, $outzip3) = korap_tempfile('script_out3');
 
 
@@ -306,4 +312,32 @@
 $t->element_count_is('spanList span', 22);
 
 
+subtest 'Check Tokenization Flags' => sub {
+
+  # Get external tokenizer
+  my $f = dirname(__FILE__);
+  my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+  # Load example file
+  my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+  my ($fh, $outzip) = korap_tempfile('script_tokflags');
+
+  # Generate zip file (unportable: relies on shell `cat`, pipe and redirection)
+  stderr_like(
+    sub { `cat '$file' | perl '$script' -ti -tc 'perl $cmd' > '$outzip'` },
+    qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!,
+    'Processing'
+  );
+
+  ok(-e $outzip, "File $outzip exists");
+
+  $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_aggressive.xml');
+  ok($zip, 'Aggressive generated');
+  $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens_conservative.xml');
+  ok($zip, 'Conservative generated');
+  $zip = IO::Uncompress::Unzip->new($outzip, Name => 'GOE/AGA/00000/base/tokens.xml');
+  ok($zip, 'External generated');
+};
+
 done_testing;
diff --git a/t/tei.t b/t/tei.t
index 53f372e..6dca05c 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -1,13 +1,14 @@
 use strict;
 use warnings;
 use Test::More;
-use Test::KorAP::XML::TEI qw!korap_tempfile!;
 
 use FindBin;
 BEGIN {
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
+
 use_ok('KorAP::XML::TEI', 'remove_xml_comments');
 
 my ($fh, $filename) = korap_tempfile('tei');
diff --git a/t/tokenization.t b/t/tokenization.t
index 1d75e5f..b132a63 100644
--- a/t/tokenization.t
+++ b/t/tokenization.t
@@ -3,6 +3,7 @@
 use Test::More;
 use File::Basename 'dirname';
 use File::Spec::Functions qw/catfile/;
+use IO::Uncompress::Unzip;
 use open qw(:std :utf8); # assume utf-8 encoding
 
 use FindBin;
@@ -10,8 +11,10 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
+use_ok('Test::KorAP::XML::TEI','korap_tempfile');
 require_ok('KorAP::XML::TEI::Tokenizer::Aggressive');
 require_ok('KorAP::XML::TEI::Tokenizer::Conservative');
+require_ok('KorAP::XML::TEI::Zipper');
 
 # Test aggressive
 my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
@@ -111,4 +114,25 @@
 is(302, scalar(@$cons));
 
 
+subtest 'Test Zipper' => sub {
+  # Write an aggressive token stream into a zip file and read it back
+  my ($fh, $outzip) = korap_tempfile('tokenize_zipper');
+  my $zip = KorAP::XML::TEI::Zipper->new($outzip);
+  $fh->close;
+
+  my $aggr = KorAP::XML::TEI::Tokenizer::Aggressive->new;
+  $aggr->tokenize("Der alte Mann");
+  ok($aggr->to_zip(
+    $zip->new_stream('tokens.xml'),
+    'fun'
+  ), 'Written successfully');
+
+  $zip->close;
+
+  ok(-e $outzip, 'Zip exists');
+  my $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'tokens.xml');
+  ok(!$unzip->eof, 'Unzip successful');
+};
+
+
 done_testing;
diff --git a/t/zipper.t b/t/zipper.t
index 2ee7fab..86aa52a 100644
--- a/t/zipper.t
+++ b/t/zipper.t
@@ -2,7 +2,6 @@
 use warnings;
 use Test::More;
 use File::Spec::Functions qw/catfile/;
-use Test::KorAP::XML::TEI qw!korap_tempfile!;
 use IO::Uncompress::Unzip;
 
 use FindBin;
@@ -10,6 +9,8 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
+
 require_ok('KorAP::XML::TEI::Zipper');
 
 my $data;
@@ -36,7 +37,7 @@
 is($data, 'hello', 'Data correct');
 
 
-# Uncompress GOE/header.xml from zip file
+# Uncompress data/file2.txt from zip file
 $unzip = IO::Uncompress::Unzip->new($outzip, Name => 'data/file2.txt');
 
 $data = '';
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 3407451..163b85b 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -13,6 +13,7 @@
   unshift @INC, "$FindBin::Bin/../lib";
 };
 
+use Test::KorAP::XML::TEI qw!korap_tempfile!;
 use KorAP::XML::TEI 'remove_xml_comments';
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Tokenizer::Conservative;
@@ -46,7 +47,7 @@
 my $result;
 
 # Data for delHTMLcom-long
-my ($fh, $filename) = tempfile();
+my ($fh, $filename) = korap_tempfile('benchmark');
 
 print $fh <<'HTML';
 mehrzeiliger