Parametrize internal tokenization: add --use-intern-tokenization (-ti), off by default
Change-Id: I19df6812cb39f5e48ae6aa5fd16951e18aef82a5
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 3390ff6..2b2c6da 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -34,6 +34,7 @@
"root|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
"input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
+ 'use-intern-tokenization|ti' => \(my $tokenizer_intern), # enable internal tokenization (default: off)
'help|h' => sub {
pod2usage(
-verbose => 99,
@@ -76,7 +77,7 @@
#
## extern tokenization
-my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0; # (used for IDS internal tokenization)
+my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0;
# TODO:
# Read tokenizer call from configuration file.
# was 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
@@ -88,7 +89,7 @@
##
## intern tokenization
-my $_GEN_TOK_INT = 1; # simple tokenization, recommended for testing (for use of an external tokenizer see $_GEN_TOK_EXT)
+my $_GEN_TOK_INT = $tokenizer_intern; # simple internal tokenization, enabled via --use-intern-tokenization|-ti (recommended for testing)
my $_tok_file_con = "tokens_conservative.xml";
my $_tok_file_agg = "tokens_aggressive.xml";
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
diff --git a/t/script.t b/t/script.t
index 2bb00bb..401d76b 100644
--- a/t/script.t
+++ b/t/script.t
@@ -39,7 +39,7 @@
# Generate zip file (unportable!)
stderr_like(
- sub { `cat '$file' | perl '$script' > '$outzip'` },
+ sub { `cat '$file' | perl '$script' -ti > '$outzip'` },
# approaches for working with $fh (also better use OO interface then)
# sub { open STDOUT, '>&', $fh; system("cat '$file' | perl '$script'") },
# sub { open(my $pipe, "cat '$file' | perl '$script'|"); while(<$pipe>){$fh->print($_)}; $fh->close },
@@ -226,7 +226,7 @@
$file = catfile($f, 'data', 'text_with_blanks.i5.xml');
stderr_like(
- sub { `cat '$file' | perl '$script' > '$outzip3'` },
+ sub { `cat '$file' | perl '$script' --ti > '$outzip3'` },
qr!tei2korapxml: .*? text_id=CORP_DOC.00001!,
'Processing'
);
diff --git a/xt/benchmark.pl b/xt/benchmark.pl
index 9d0489a..4bdc255 100644
--- a/xt/benchmark.pl
+++ b/xt/benchmark.pl
@@ -77,7 +77,7 @@
Dumbbench::Instance::PerlSub->new(
name => 'SimpleConversion',
code => sub {
- `cat '$file' | perl '$script' > /dev/null 2>&1`
+ `cat '$file' | perl '$script' -ti > /dev/null 2>&1`
}
),
Dumbbench::Instance::PerlSub->new(