Introduce --tokens-file parameter

Change-Id: I3438b30b065d61533ca9483084096e9ccadf696d

commit: 91705d7f9a806286f8fa185b39208ca541aad3f4 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Feb 19 10:59:45 2021 +0100
committer: Akron <nils@diewald-online.de> Tue Feb 23 15:35:57 2021 +0100
tree: 66f1e0d76c24a2566f8a62c7ca3370190a6e7815
parent: 9157792eccff19f3882e066d60f632ba133b84ed [diff]
diff --git a/Changes b/Changes
index 7bdf664..2783199 100644
--- a/Changes
+++ b/Changes

@@ -1,6 +1,7 @@
         - Remove unnecessary branch in recursive call
         - Support inline-structures parameter
         - Introduce --base-foundry, --data-file, and --header-file parameters
+        - Introduce --tokens-file parameter
 
 1.00 2021-02-18 Release
         - -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)

diff --git a/Readme.pod b/Readme.pod
index 0ed0fb1..e627275 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -149,6 +149,13 @@
 Replace existing with, or add new, sentence boundary information
 provided by the KorAP tokenizer (currently supported only).
 
+=item B<--tokens-file> <file>
+
+Define the file (without extension)
+to store generated token information in
+(either from the KorAP tokenizer or an externally called tokenizer).
+Defaults to C<tokens>.
+
 =item B<--log|-l>
 
 Loglevel for I<Log::Any>. Defaults to C<notice>.

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 3044045..b5a2ed3 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm

@@ -41,4 +41,9 @@
 };
 
 
+# Base name (without extension) of the file the tokenization is written to
+sub name {
+  'tokens_aggressive';
+};
+
 1;

diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index 2b68786..f08f430 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm

@@ -87,4 +87,10 @@
 };
 
 
+# Base name (without extension) of the file the tokenization is written to
+sub name {
+  'tokens_conservative';
+};
+
+
 1;

diff --git a/script/tei2korapxml b/script/tei2korapxml
index cbd0b1e..152eaac 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -46,13 +46,14 @@
   "input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
   'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
   'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
-  'tokenizer-internal|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
+  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT), # use internal tokenization (default = no)
   'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
   'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
   'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
   'base-foundry=s' => \(my $_tok_dir = 'base'),
   'data-file=s' => \(my $_data_file = 'data'),
   'header-file=s' => \(my $_header_file = 'header'),
+  'tokens-file=s' => \(my $_tok_file_ext = 'tokens'),
   'log|l=s' => \(my $log_level = 'notice'),
   'help|h'    => sub {
     pod2usage(
@@ -103,7 +104,6 @@
 elsif ($tokenizer_korap) {
   $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
 };
-my $_tok_file_ext  = "tokens.xml";
 ##
 
 
@@ -113,9 +113,6 @@
 
 
 ## intern tokenization
-my $_GEN_TOK_INT = $tokenizer_intern;                  # simple tokenization (recommended for testing)
-my $_tok_file_con  = "tokens_conservative.xml";
-my $_tok_file_agg  = "tokens_aggressive.xml";
 my $aggr_tok       = KorAP::XML::TEI::Tokenizer::Aggressive->new;
 my $cons_tok       = KorAP::XML::TEI::Tokenizer::Conservative->new;
 ##
@@ -266,7 +263,7 @@
       # ~ end of text body ~
       if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
 
-        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
+        # write data.xml, structure.xml and, if applicable, morpho.xml and/or tokenization files
 
         if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
           die $log->fatal("input line number $.: " .
@@ -312,7 +309,7 @@
 
             # Tokenize and output
             $ext_tok->tokenize($data->data)->to_zip(
-              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
+              $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
               $text_id_esc
             );
           };
@@ -321,12 +318,12 @@
 
             # Tokenize and output
             $cons_tok->tokenize($data->data)->to_zip(
-              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
+              $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
               $text_id_esc
             );
 
             $aggr_tok->tokenize($data->data)->to_zip(
-              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
+              $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
               $text_id_esc
             );
 
@@ -745,6 +742,13 @@
 Replace existing with, or add new, sentence boundary information
 provided by the KorAP tokenizer (currently supported only).
 
+=item B<--tokens-file> <file>
+
+Define the file (without extension)
+to store generated token information in
+(either from the KorAP tokenizer or an externally called tokenizer).
+Defaults to C<tokens>.
+
 =item B<--log|-l>
 
 Loglevel for I<Log::Any>. Defaults to C<notice>.

diff --git a/t/script.t b/t/script.t
index 8249f79..c7506d5 100644
--- a/t/script.t
+++ b/t/script.t

@@ -175,6 +175,29 @@
     ->element_count_is('spanList span', 227);
 };
 
+subtest 'Tokenize with external tokenizer and defined folder' => sub {
+
+  my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+  test_tei2korapxml(
+    file => $file,
+    param => "-tc='perl $cmd' --tokens-file=yadda",
+    tmp => 'script_out2'
+  )
+    ->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+    ->file_exists_not('GOE/AGA/00000/base/tokens.xml')
+    ->file_readable('GOE/AGA/00000/base/yadda.xml')
+    ->unzip_xml('GOE/AGA/00000/base/yadda.xml')
+    ->attr_is('spanList span:nth-child(1)', 'to', 8)
+    ->attr_is('spanList span#t_1', 'from', 9)
+    ->attr_is('spanList span#t_1', 'to', 11)
+    ->attr_is('spanList span#t_67', 'from', 427)
+    ->attr_is('spanList span#t_67', 'to', 430)
+    ->attr_is('spanList span#t_214', 'from', 1209)
+    ->attr_is('spanList span#t_214', 'to', 1212)
+    ->element_count_is('spanList span', 227);
+};
+
 subtest 'Check KorAP tokenizer for infinite loop bug' => sub {
 
   my $file = catfile($f, 'data', 'korap_tokenizer_challenge.xml');
@@ -368,7 +391,7 @@
     tmp => 'script_out',
     file => $file,
     param => '-ti --inline-structures=myfoundry#mystr'
-  )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+  )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
     ->file_exists_not('GOE/AGA/00000/struct/structure.xml', 'Structure not generated')
     ->unzip_xml('GOE/AGA/00000/myfoundry/mystr.xml')
     ->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
@@ -384,7 +407,7 @@
     tmp => 'script_out',
     file => $file,
     param => '-ti --inline-structures=myfoundry'
-  )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+  )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
     ->file_exists_not('GOE/AGA/00000/struct/structure.xml', 'Structure not generated')
     ->unzip_xml('GOE/AGA/00000/myfoundry/structure.xml')
     ->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
@@ -461,7 +484,7 @@
     tmp => 'script_out',
     file => $file,
     param => '-ti --base-foundry=root --data-file=primary --header-file=meta'
-  )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+  )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
     ->file_exists_not('GOE/AGA/00000/header.xml', 'Header not there')
     ->file_exists_not('GOE/AGA/header.xml', 'Header not there')
     ->file_exists_not('GOE/header.xml', 'Header not there')
commit	91705d7f9a806286f8fa185b39208ca541aad3f4	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Feb 19 10:59:45 2021 +0100
committer	Akron <nils@diewald-online.de>	Tue Feb 23 15:35:57 2021 +0100
tree	66f1e0d76c24a2566f8a62c7ca3370190a6e7815
parent	9157792eccff19f3882e066d60f632ba133b84ed [diff]