Support --inline-structures parameter Change-Id: I4e0e951f2f688e42b52818b86a22f7cb722e67dc

commit: dd0be8fc2e5774b207c2a92037ed0e5a28ecd59b [log] [tgz]
author: Akron <nils@diewald-online.de> Thu Feb 18 19:29:41 2021 +0100
committer: Akron <nils@diewald-online.de> Tue Feb 23 15:17:53 2021 +0100
tree: e5749495065cf196de4ba051690cebc7ee5d8255
parent: d658df73a6bd03ac1099a40733a1d7739035e3e7 [diff]
diff --git a/Changes b/Changes
index 9d7be83..47836c2 100644
--- a/Changes
+++ b/Changes

@@ -1,4 +1,5 @@
         - Remove unnecessary branch in recursive call
+        - Support inline-structures parameter
 
 1.00 2021-02-18 Release
         - -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)

diff --git a/Readme.pod b/Readme.pod
index 1689eb4..443a944 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -120,6 +120,12 @@
 annotations as well.
 Defaults to C<tokens> and C<morpho>.
 
+=item B<--inline-structures> <foundry>#[<file>]
+
+Define the foundry and file (without extension)
+to store inline structure information in.
+Defaults to C<struct> and C<structure>.
+
 =item B<--use-tokenizer-sentence-splits|-s>
 
 Replace existing with, or add new, sentence boundary information
@@ -164,4 +170,4 @@
 This program is free software published under the
 L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
 
-=cut
+=cut
\ No newline at end of file

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 785621e..9a0cbf9 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -49,6 +49,7 @@
   'tokenizer-internal|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
   'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
   'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
+  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
   'log|l=s' => \(my $log_level = 'notice'),
   'help|h'    => sub {
     pod2usage(
@@ -121,13 +122,17 @@
 
 my $_header_file     = "header.xml";                 # name of files      containing the  text, document and corpus header
 my $_data_file       = "data.xml";                   # name of file       containing the  primary text data (tokens)
-my $_structure_dir   = "struct";                     # name of directory  containing the  $_structure_file
-my $_structure_file  = "structure.xml";              # name of file       containing all  tags (except ${_TOKEN_TAG}'s) related information
-                                                     #                                     (= their names and byte offsets in $_data)
+
 ## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
 my $_TOKENS_PROC     = 1;                            # on/off: processing of ${_TOKEN_TAG}'s (default: 1)
 
 
+# Name of the directory and the file containing all inline structure information
+# except for $_TOKEN_TAG information
+my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
+$_structure_file .= '.xml';
+
+
 # Name of the directory and the file containing all inline token informations
 # i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
 my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
@@ -714,6 +719,12 @@
 annotations as well.
 Defaults to C<tokens> and C<morpho>.
 
+=item B<--inline-structures> <foundry>#[<file>]
+
+Define the foundry and file (without extension)
+to store inline structure information in.
+Defaults to C<struct> and C<structure>.
+
 =item B<--use-tokenizer-sentence-splits|-s>
 
 Replace existing with, or add new, sentence boundary information

diff --git a/t/script.t b/t/script.t
index 0ac27d9..f9c336b 100644
--- a/t/script.t
+++ b/t/script.t

@@ -360,6 +360,44 @@
 };
 
 
+subtest 'Check structure parsing with defined foundry and folder' => sub {
+  # Load example file
+  my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+
+  my $t = test_tei2korapxml(
+    tmp => 'script_out',
+    file => $file,
+    param => '-ti --inline-structures=myfoundry#mystr'
+  )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+    ->file_exists_not('GOE/AGA/00000/struct/structure.xml', 'Structure not generated')
+    ->unzip_xml('GOE/AGA/00000/myfoundry/mystr.xml')
+    ->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
+    ->text_is('#s3 *[name=type]', 'Autobiographie', 'text content')
+    ->attr_is('#s0','to','1266')
+    ->attr_is('#s0','l','1')
+    ->attr_is('#s18','from','925')
+    ->attr_is('#s18','to','1266')
+    ->attr_is('#s18','l','5')
+    ;
+
+  $t = test_tei2korapxml(
+    tmp => 'script_out',
+    file => $file,
+    param => '-ti --inline-structures=myfoundry'
+  )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+    ->file_exists_not('GOE/AGA/00000/struct/structure.xml', 'Structure not generated')
+    ->unzip_xml('GOE/AGA/00000/myfoundry/structure.xml')
+    ->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
+    ->text_is('#s3 *[name=type]', 'Autobiographie', 'text content')
+    ->attr_is('#s0','to','1266')
+    ->attr_is('#s0','l','1')
+    ->attr_is('#s18','from','925')
+    ->attr_is('#s18','to','1266')
+    ->attr_is('#s18','l','5')
+    ;
+};
+
+
 subtest 'Check Inline annotations' => sub {
 
   # Load example file
commit	dd0be8fc2e5774b207c2a92037ed0e5a28ecd59b	[log] [tgz]
author	Akron <nils@diewald-online.de>	Thu Feb 18 19:29:41 2021 +0100
committer	Akron <nils@diewald-online.de>	Tue Feb 23 15:17:53 2021 +0100
tree	e5749495065cf196de4ba051690cebc7ee5d8255
parent	d658df73a6bd03ac1099a40733a1d7739035e3e7 [diff]