Support --inline-structures parameter
Change-Id: I4e0e951f2f688e42b52818b86a22f7cb722e67dc
diff --git a/Changes b/Changes
index 9d7be83..47836c2 100644
--- a/Changes
+++ b/Changes
@@ -1,4 +1,5 @@
- Remove unnecessary branch in recursive call
+ - Support inline-structures parameter
1.00 2021-02-18 Release
- -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)
diff --git a/Readme.pod b/Readme.pod
index 1689eb4..443a944 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -120,6 +120,12 @@
annotations as well.
Defaults to C<tokens> and C<morpho>.
+=item B<--inline-structures> <foundry>#[<file>]
+
+Define the foundry and file (without extension)
+to store inline structure information in.
+Defaults to C<struct> and C<structure>.
+
=item B<--use-tokenizer-sentence-splits|-s>
Replace existing with, or add new, sentence boundary information
@@ -164,4 +170,4 @@
This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
-=cut
+=cut
\ No newline at end of file
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 785621e..9a0cbf9 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -49,6 +49,7 @@
'tokenizer-internal|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
+ 'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
pod2usage(
@@ -121,13 +122,17 @@
my $_header_file = "header.xml"; # name of files containing the text, document and corpus header
my $_data_file = "data.xml"; # name of file containing the primary text data (tokens)
-my $_structure_dir = "struct"; # name of directory containing the $_structure_file
-my $_structure_file = "structure.xml"; # name of file containing all tags (except ${_TOKEN_TAG}'s) related information
- # (= their names and byte offsets in $_data)
+
## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKEN_TAG}'s (default: 1)
+# Name of the directory (foundry) and the file containing all inline structure information
+# except for $_TOKEN_TAG information; an empty file part (e.g. "foundry#") uses 'structure'
+my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
+$_structure_file = (length($_structure_file) ? $_structure_file : 'structure') . '.xml';
+
+
# Name of the directory and the file containing all inline token informations
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
@@ -714,6 +719,12 @@
annotations as well.
Defaults to C<tokens> and C<morpho>.
+=item B<--inline-structures> <foundry>#[<file>]
+
+Define the foundry and file (without extension)
+to store inline structure information in.
+Defaults to C<struct> and C<structure>.
+
=item B<--use-tokenizer-sentence-splits|-s>
Replace existing with, or add new, sentence boundary information
diff --git a/t/script.t b/t/script.t
index 0ac27d9..f9c336b 100644
--- a/t/script.t
+++ b/t/script.t
@@ -360,6 +360,44 @@
};
+subtest 'Check structure parsing with defined foundry and folder' => sub {
+ # Load example file (used as input for both runs below)
+ my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+ my $t = test_tei2korapxml( # First run: foundry and file name are both given
+ tmp => 'script_out',
+ file => $file,
+ param => '-ti --inline-structures=myfoundry#mystr'
+ )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+ ->file_exists_not('GOE/AGA/00000/struct/structure.xml', 'Structure not generated') # default path must stay unused
+ ->unzip_xml('GOE/AGA/00000/myfoundry/mystr.xml')
+ ->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
+ ->text_is('#s3 *[name=type]', 'Autobiographie', 'text content')
+ ->attr_is('#s0','to','1266')
+ ->attr_is('#s0','l','1')
+ ->attr_is('#s18','from','925')
+ ->attr_is('#s18','to','1266')
+ ->attr_is('#s18','l','5')
+ ;
+
+ $t = test_tei2korapxml( # Second run: only the foundry is given, file name is expected to be 'structure'
+ tmp => 'script_out',
+ file => $file,
+ param => '-ti --inline-structures=myfoundry'
+ )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+ ->file_exists_not('GOE/AGA/00000/struct/structure.xml', 'Structure not generated') # default path must stay unused
+ ->unzip_xml('GOE/AGA/00000/myfoundry/structure.xml')
+ ->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
+ ->text_is('#s3 *[name=type]', 'Autobiographie', 'text content')
+ ->attr_is('#s0','to','1266')
+ ->attr_is('#s0','l','1')
+ ->attr_is('#s18','from','925')
+ ->attr_is('#s18','to','1266')
+ ->attr_is('#s18','l','5')
+ ;
+};
+
+
subtest 'Check Inline annotations' => sub {
# Load example file