Introduce --skip-inline-tokens parameter

Change-Id: Ia717aeb29058f7a3eb549c811c83d26c6f0b4dd2

commit: 75d6314a28573d932f4caa80ab2696735551121b [log] [tgz]
author: Akron <nils@diewald-online.de> Tue Feb 23 18:40:56 2021 +0100
committer: Akron <nils@diewald-online.de> Tue Feb 23 18:40:56 2021 +0100
tree: 221d39603c5acc7d6965a32be0163f8d8f95fda0
parent: b87c58d45011f8a1a917be1708613ce9c5e68bd7 [diff]
diff --git a/Changes b/Changes
index 2783199..14bfcba 100644
--- a/Changes
+++ b/Changes

@@ -2,6 +2,7 @@
         - Support inline-structures parameter
         - Introduce --base-foundry, --data-file, and --header-file parameters
         - Introduce --tokens-file parameter
+        - Introduce --skip-inline-tokens parameter
 
 1.00 2021-02-18 Release
         - -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)

diff --git a/Readme.pod b/Readme.pod
index e627275..16d259d 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -112,6 +112,11 @@
 that will take an I<Aggressive> and a I<conservative>
 approach.
 
+=item B<--skip-inline-tokens>
+
+Boolean flag indicating that inline tokens should not
+be processed. Defaults to false (meaning inline tokens will be processed).
+
 =item B<--inline-tokens> <foundry>#[<file>]
 
 Define the foundry and file (without extension)
@@ -125,6 +130,7 @@
 Define the foundry and file (without extension)
 to store inline structure information in.
 Defaults to C<struct> and C<structures>.
+
 =item B<--base-foundry> <foundry>
 
 Define the base foundry to store newly generated
@@ -195,4 +201,4 @@
 This program is free software published under the
 L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
 
-=cut
\ No newline at end of file
+=cut

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 784ed1a..ce31ddd 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -42,20 +42,21 @@
 
 # Parse options from the command line
 GetOptions(
-  "root|r=s"  => \(my $_root_dir = '.'),  # name of root directory inside zip file
-  "input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
-  'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
-  'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
-  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT), # use intern tokenization (default = no)
-  'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
-  'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
-  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
-  'base-foundry=s' => \(my $_tok_dir = 'base'),
-  'data-file=s' => \(my $_data_file = 'data'),
-  'header-file=s' => \(my $_header_file = 'header'),
-  'tokens-file=s' => \(my $_tok_file_ext = 'tokens'),
-  'log|l=s' => \(my $log_level = 'notice'),
-  'help|h'    => sub {
+  "root|r=s"              => \(my $_root_dir = '.'),
+  "input|i=s"             => \(my $input_fname = ''),
+  'tokenizer-call|tc=s'   => \(my $tokenizer_call),
+  'tokenizer-korap|tk'    => \(my $tokenizer_korap),
+  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT),
+  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
+  'inline-tokens=s'       => \(my $inline_tokens = 'tokens#morpho'),
+  'inline-structures=s'   => \(my $inline_structures = 'struct#structure'),
+  'skip-inline-tokens'    => \(my $skip_inline_tokens = 0),
+  'base-foundry=s'        => \(my $_tok_dir = 'base'),
+  'data-file=s'           => \(my $_data_file = 'data'),
+  'header-file=s'         => \(my $_header_file = 'header'),
+  'tokens-file=s'         => \(my $_tok_file_ext = 'tokens'),
+  'log|l=s'               => \(my $log_level = 'notice'),
+  'help|h' => sub {
     pod2usage(
       -verbose => 99,
       -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
@@ -110,9 +111,6 @@
 my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
 ##
 
-# Processing of ${_TOKEN_TAG}'s - on/off (default: 1)
-my $_TOKENS_PROC = 1;
-
 # Name of the directory and the file containing all inline structure informations
 # except for $_TOKEN_TAG information
 my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
@@ -268,7 +266,7 @@
 
         $structures->reset;
 
-        $tokens->reset if $_TOKENS_PROC;
+        $tokens->reset unless $skip_inline_tokens;
 
         # ~ whitespace related issue ~
         $add_one = 0;
@@ -328,7 +326,7 @@
         };
 
         # ~ write tokens ~
-        if ($_TOKENS_PROC && !$tokens->empty) {
+        unless ($skip_inline_tokens || $tokens->empty) {
           $tokens->to_zip(
             $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
             $text_id_esc,
@@ -451,7 +449,7 @@
       my $anno = $structures->add_new_annotation($e->[1]);
 
       # Add element also to token list
-      if ($_TOKENS_PROC && $e->[1] eq $_TOKENS_TAG) {
+      if (!$skip_inline_tokens && $e->[1] eq $_TOKENS_TAG) {
         $tokens->add_annotation($anno);
       };
 
@@ -678,6 +676,11 @@
 that will take an I<Aggressive> and a I<conservative>
 approach.
 
+=item B<--skip-inline-tokens>
+
+Boolean flag indicating that inline tokens should not
+be processed. Defaults to false (meaning inline tokens will be processed).
+
 =item B<--inline-tokens> <foundry>#[<file>]
 
 Define the foundry and file (without extension)

diff --git a/t/script.t b/t/script.t
index c7506d5..d99dd21 100644
--- a/t/script.t
+++ b/t/script.t

@@ -421,6 +421,32 @@
 };
 
 
+subtest 'Check parsing but skip inline tokens' => sub {
+  # Load example file
+  my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+  my $t = test_tei2korapxml(
+    tmp => 'script_skip_inline_tokens_1',
+    file => $file,
+    param => '-ti --skip-inline-tokens'
+  )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+    ->file_exists('GOE/AGA/00000/data.xml', 'Data exists')
+    ->file_exists('GOE/AGA/00000/struct/structure.xml', 'Structure generated')
+    ->file_exists_not('GOE/AGA/00000/tokens/morpho.xml', 'Morpho not generated')
+    ;
+
+  $t = test_tei2korapxml(
+    tmp => 'script_skip_inline_tokens_2',
+    file => $file,
+    param => '-ti --skip-inline-tokens --inline-tokens=myfoundry#myfile'
+  )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+    ->file_exists('GOE/AGA/00000/struct/structure.xml', 'Structure generated')
+    ->file_exists_not('GOE/AGA/00000/tokens/morpho.xml', 'Morpho not generated')
+    ->file_exists_not('GOE/AGA/00000/myfoundry/myfile.xml', 'Morpho not generated')
+    ;
+};
+
+
 subtest 'Check Inline annotations' => sub {
 
   # Load example file
commit	75d6314a28573d932f4caa80ab2696735551121b	[log] [tgz]
author	Akron <nils@diewald-online.de>	Tue Feb 23 18:40:56 2021 +0100
committer	Akron <nils@diewald-online.de>	Tue Feb 23 18:40:56 2021 +0100
tree	221d39603c5acc7d6965a32be0163f8d8f95fda0
parent	b87c58d45011f8a1a917be1708613ce9c5e68bd7 [diff]