Add -tk option to use the standard KorAP tokenizer
Change-Id: I992fe37463926c8ecbca933fbb709f8640d6fb93
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 0546658..ab1975c 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -28,6 +28,10 @@
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
+eval { # Best-effort load: a failing require is trapped by this eval block
+ require KorAP::XML::TEI::Tokenizer::KorAP;
+ 1;
+}; # NOTE(review): eval result and $@ are not checked here, so a load failure is silent - confirm intended
our $VERSION = '0.01';
@@ -39,6 +43,7 @@
"root|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
"input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
+ 'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
'use-intern-tokenization|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
@@ -76,13 +81,18 @@
#
## extern tokenization
-my $_GEN_TOK_EXT = $tokenizer_call ? 1 : 0;
+my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
+
# TODO:
# Read tokenizer call from configuration file.
# was 'java -cp '. join(":", ".", glob(&dirname(__FILE__)."/../target/*.jar")). " de.ids_mannheim.korap.tokenizer.KorAPTokenizerImpl";
my $ext_tok;
if ($tokenizer_call) {
$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+ }
+
+ elsif ($tokenizer_korap) {
+ $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
};
my $_tok_file_ext = "tokens.xml";
##
@@ -1044,6 +1054,10 @@
Call an external tokenizer process, that will tokenize
a single line from STDIN and outputs one token per line.
+=item B<--tokenizer-korap|-tk>
+
+Use the standard KorAP/DeReKo tokenizer.
+
=item B<--use-intern-tokenization|-ti>
Tokenize the data using two embedded tokenizers,