Introduce --skip-inline-tags parameter

Change-Id: Icc6317a7076b5164c16b8cfd63b2758445aff71c

commit: 54c3ff165752444a04ea35c29eaba30d1b78a1b4 [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Feb 25 11:33:37 2021 +0100
committer: Akron <nils@diewald-online.de> Thu Feb 25 11:40:26 2021 +0100
tree: 389f531a122d309c8bd5d5b8679f5f527731bba2
parent: b43b491d9f39b9c41fa9724b259cbcf9f07d1c35 [diff]
diff --git a/Changes b/Changes
index 3195113..581cbc8 100644
--- a/Changes
+++ b/Changes

@@ -4,6 +4,7 @@
         - Introduce --tokens-file parameter
         - Introduce --skip-inline-tokens parameter
         - Minor cleanups and improvements
+        - Introduce --skip-inline-tags parameter
 
 1.00 2021-02-18 Release
         - -s option added that uses sentence boundaries

diff --git a/Readme.pod b/Readme.pod
index 16d259d..a3dcc50 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -117,6 +117,11 @@
 Boolean flag indicating that inline tokens should not
 be processed. Defaults to false (meaning inline tokens will be processed).
 
+=item B<--skip-inline-tags>
+
+Expects a comma-separated list of tags to be ignored when the structure
+is parsed. The content of these tags, however, will still be processed.
+
 =item B<--inline-tokens> <foundry>#[<file>]
 
 Define the foundry and file (without extension)
@@ -201,4 +206,4 @@
 This program is free software published under the
 L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
 
-=cut
+=cut
\ No newline at end of file

diff --git a/script/tei2korapxml b/script/tei2korapxml
index fb58d8e..af1bc05 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -62,6 +62,7 @@
   'inline-tokens=s'       => \(my $inline_tokens = 'tokens#morpho'),
   'inline-structures=s'   => \(my $inline_structures = 'struct#structure'),
   'skip-inline-tokens'    => \(my $skip_inline_tokens = 0),
+  'skip-inline-tags=s'    => \(my $skip_inline_tags_str = ''),
   'base-foundry=s'        => \(my $base_dir    = 'base'),
   'data-file=s'           => \(my $data_file   = 'data'),
   'header-file=s'         => \(my $header_file = 'header'),
@@ -110,6 +111,14 @@
     );
 };
 
+# Remember to skip certain inline tags
+my %skip_inline_tags = ();
+if ($skip_inline_tags_str) {
+  foreach (split /\s*,\s*/, $skip_inline_tags_str) {
+    $skip_inline_tags{$_} = 1;
+  };
+};
+
 # External tokenization
 my $ext_tok;
 if ($tokenizer_call) {
@@ -118,6 +127,9 @@
 
 elsif ($tokenizer_korap) {
   $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
+  if ($use_tokenizer_sentence_splits) {
+    $skip_inline_tags{s} = 1;
+  };
 };
 
 
@@ -444,8 +456,8 @@
       # This is likely to be optimized away by the compiler.
       my $children = $e->[DEBUG ? 5 : 4];
 
-      # Skip sentences
-      if ($use_tokenizer_sentence_splits && $node_info eq 's') {
+      # Skip certain tags
+      if ($skip_inline_tags{$node_info}) {
         descend($depth + 1, $children) if defined $children;
         next;
       };
@@ -675,6 +687,11 @@
 Boolean flag indicating that inline tokens should not
 be processed. Defaults to false (meaning inline tokens will be processed).
 
+=item B<--skip-inline-tags>
+
+Expects a comma-separated list of tags to be ignored when the structure
+is parsed. The content of these tags, however, will still be processed.
+
 =item B<--inline-tokens> <foundry>#[<file>]
 
 Define the foundry and file (without extension)

diff --git a/t/script.t b/t/script.t
index d99dd21..6f1d2d1 100644
--- a/t/script.t
+++ b/t/script.t

@@ -420,6 +420,54 @@
     ;
 };
 
+subtest 'Check structure parsing with skipped tags' => sub {
+  # Load example file
+  my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+  my $t = test_tei2korapxml(
+    tmp => 'script_out',
+    file => $file,
+    param => '-ti'
+  )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+    ->file_exists('GOE/AGA/00000/struct/structure.xml', 'Structure generated')
+    ->unzip_xml('GOE/AGA/00000/struct/structure.xml')
+    ->text_is('layer spanList span fs f', 'text')
+    ->text_is('#s5 fs f[name=name]','head')
+    ->text_is('#s6 fs f[name=name]','s')
+    ->text_is('#s7 fs f[name=name]','head')
+    ->text_is('#s8 fs f[name=name]','s')
+    ->text_is('#s9 fs f[name=name]','quote')
+    ->text_is('#s10 fs f[name=name]','s')
+    ;
+
+  $t = test_tei2korapxml(
+    tmp => 'script_out',
+    file => $file,
+    param => '-ti --skip-inline-tags=head'
+  )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+    ->file_exists('GOE/AGA/00000/struct/structure.xml', 'Structure generated')
+    ->unzip_xml('GOE/AGA/00000/struct/structure.xml')
+    ->text_is('layer spanList span fs f', 'text')
+    ->text_is('#s5 fs f[name=name]','s')
+    ->text_is('#s6 fs f[name=name]','s')
+    ->text_is('#s7 fs f[name=name]','quote')
+    ->text_is('#s8 fs f[name=name]','s')
+    ;
+
+  $t = test_tei2korapxml(
+    tmp => 'script_out',
+    file => $file,
+    param => '-ti --skip-inline-tags=head,quote'
+  )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+    ->file_exists('GOE/AGA/00000/struct/structure.xml', 'Structure generated')
+    ->unzip_xml('GOE/AGA/00000/struct/structure.xml')
+    ->text_is('layer spanList span fs f', 'text')
+    ->text_is('#s5 fs f[name=name]','s')
+    ->text_is('#s6 fs f[name=name]','s')
+    ->text_is('#s7 fs f[name=name]','s')
+    ;
+};
+
 
 subtest 'Check parsing but skip inline tokens' => sub {
   # Load example file
commit	54c3ff165752444a04ea35c29eaba30d1b78a1b4	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Feb 25 11:33:37 2021 +0100
committer	Akron <nils@diewald-online.de>	Thu Feb 25 11:40:26 2021 +0100
tree	389f531a122d309c8bd5d5b8679f5f527731bba2
parent	b43b491d9f39b9c41fa9724b259cbcf9f07d1c35 [diff]