Introduce --skip-inline-tags parameter
Change-Id: Icc6317a7076b5164c16b8cfd63b2758445aff71c
diff --git a/Changes b/Changes
index 3195113..581cbc8 100644
--- a/Changes
+++ b/Changes
@@ -4,6 +4,7 @@
- Introduce --tokens-file parameter
- Introduce --skip-inline-tokens parameter
- Minor cleanups and improvements
+ - Introduce --skip-inline-tags parameter
1.00 2021-02-18 Release
- -s option added that uses sentence boundaries
diff --git a/Readme.pod b/Readme.pod
index 16d259d..a3dcc50 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -117,6 +117,11 @@
Boolean flag indicating that inline tokens should not
be processed. Defaults to false (meaning inline tokens will be processed).
+=item B<--skip-inline-tags> <tags>
+
+Expects a comma-separated list of tag names to be ignored when the structure
+is parsed. The content of these tags, however, will still be processed.
+
=item B<--inline-tokens> <foundry>#[<file>]
Define the foundry and file (without extension)
@@ -201,4 +206,4 @@
This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
-=cut
+=cut
\ No newline at end of file
diff --git a/script/tei2korapxml b/script/tei2korapxml
index fb58d8e..af1bc05 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -62,6 +62,7 @@
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
+ 'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
'base-foundry=s' => \(my $base_dir = 'base'),
'data-file=s' => \(my $data_file = 'data'),
'header-file=s' => \(my $header_file = 'header'),
@@ -110,6 +111,14 @@
);
};
+# Build a lookup table of the inline tags given via --skip-inline-tags
+my %skip_inline_tags = ();
+if ($skip_inline_tags_str) {
+ foreach (split /\s*,\s*/, $skip_inline_tags_str) {
+ $skip_inline_tags{$_} = 1;
+ };
+};
+
# External tokenization
my $ext_tok;
if ($tokenizer_call) {
@@ -118,6 +127,9 @@
elsif ($tokenizer_korap) {
$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
+ if ($use_tokenizer_sentence_splits) {
+ $skip_inline_tags{s} = 1;
+ };
};
@@ -444,8 +456,8 @@
# This is likely to be optimized away by the compiler.
my $children = $e->[DEBUG ? 5 : 4];
- # Skip sentences
- if ($use_tokenizer_sentence_splits && $node_info eq 's') {
+ # Skip tags listed in %skip_inline_tags, but still descend into their children
+ if ($skip_inline_tags{$node_info}) {
descend($depth + 1, $children) if defined $children;
next;
};
@@ -675,6 +687,11 @@
Boolean flag indicating that inline tokens should not
be processed. Defaults to false (meaning inline tokens will be processed).
+=item B<--skip-inline-tags> <tags>
+
+Expects a comma-separated list of tag names to be ignored when the structure
+is parsed. The content of these tags, however, will still be processed.
+
=item B<--inline-tokens> <foundry>#[<file>]
Define the foundry and file (without extension)
diff --git a/t/script.t b/t/script.t
index d99dd21..6f1d2d1 100644
--- a/t/script.t
+++ b/t/script.t
@@ -420,6 +420,54 @@
;
};
+subtest 'Check structure parsing with skipped tags' => sub {
+ # Load example file
+ my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+ my $t = test_tei2korapxml(
+ tmp => 'script_out',
+ file => $file,
+ param => '-ti'
+ )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+ ->file_exists('GOE/AGA/00000/struct/structure.xml', 'Structure generated')
+ ->unzip_xml('GOE/AGA/00000/struct/structure.xml')
+ ->text_is('layer spanList span fs f', 'text')
+ ->text_is('#s5 fs f[name=name]','head')
+ ->text_is('#s6 fs f[name=name]','s')
+ ->text_is('#s7 fs f[name=name]','head')
+ ->text_is('#s8 fs f[name=name]','s')
+ ->text_is('#s9 fs f[name=name]','quote')
+ ->text_is('#s10 fs f[name=name]','s')
+ ;
+
+ $t = test_tei2korapxml(
+ tmp => 'script_out',
+ file => $file,
+ param => '-ti --skip-inline-tags=head'
+ )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+ ->file_exists('GOE/AGA/00000/struct/structure.xml', 'Structure generated')
+ ->unzip_xml('GOE/AGA/00000/struct/structure.xml')
+ ->text_is('layer spanList span fs f', 'text')
+ ->text_is('#s5 fs f[name=name]','s')
+ ->text_is('#s6 fs f[name=name]','s')
+ ->text_is('#s7 fs f[name=name]','quote')
+ ->text_is('#s8 fs f[name=name]','s')
+ ;
+
+ $t = test_tei2korapxml(
+ tmp => 'script_out',
+ file => $file,
+ param => '-ti --skip-inline-tags=head,quote'
+ )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+ ->file_exists('GOE/AGA/00000/struct/structure.xml', 'Structure generated')
+ ->unzip_xml('GOE/AGA/00000/struct/structure.xml')
+ ->text_is('layer spanList span fs f', 'text')
+ ->text_is('#s5 fs f[name=name]','s')
+ ->text_is('#s6 fs f[name=name]','s')
+ ->text_is('#s7 fs f[name=name]','s')
+ ;
+};
+
subtest 'Check parsing but skip inline tokens' => sub {
# Load example file