Introduce --base-foundry, --data-file, and --header-file parameters

Change-Id: I467b47cb0f60f84bed2b662f5dd177481a2758fe

commit: 26a715249095791b18136ce8523c5694b2e1f468 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Feb 19 10:27:37 2021 +0100
committer: Akron <nils@diewald-online.de> Tue Feb 23 15:21:40 2021 +0100
tree: 1e58f0770df3a711259d1b6bb79590f4c02cd299
parent: dd0be8fc2e5774b207c2a92037ed0e5a28ecd59b [diff]
diff --git a/Changes b/Changes
index 47836c2..7bdf664 100644
--- a/Changes
+++ b/Changes

@@ -1,5 +1,6 @@
         - Remove unnecessary branch in recursive call
         - Support inline-structures parameter
+        - Introduce --base-foundry, --data-file, and --header-file parameters
 
 1.00 2021-02-18 Release
         - -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)

diff --git a/Readme.pod b/Readme.pod
index 443a944..0ed0fb1 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -125,6 +125,25 @@
 Define the foundry and file (without extension)
 to store inline structure information in.
 Defaults to C<struct> and C<structures>.
+
+=item B<--base-foundry> <foundry>
+
+Define the base foundry to store newly generated
+token information in.
+Defaults to C<base>.
+
+=item B<--data-file> <file>
+
+Define the file (without extension)
+to store primary data information in.
+Defaults to C<data>.
+
+=item B<--header-file> <file>
+
+Define the file name (without extension)
+to store header information on
+the corpus, document, and text level in.
+Defaults to C<header>.
 
 =item B<--use-tokenizer-sentence-splits|-s>
 

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 9a0cbf9..2e3aa0e 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -50,6 +50,9 @@
   'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
   'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
   'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
+  'base-foundry=s' => \(my $_tok_dir = 'base'),
+  'data-file=s' => \(my $_data_file = 'data'),
+  'header-file=s' => \(my $_header_file = 'header'),
   'log|l=s' => \(my $log_level = 'notice'),
   'help|h'    => sub {
     pod2usage(
@@ -117,12 +120,6 @@
 my $cons_tok       = KorAP::XML::TEI::Tokenizer::Conservative->new;
 ##
 
-
-my $_tok_dir         = "base";                       # name of directory for storing tokenization files
-
-my $_header_file     = "header.xml";                 # name of files      containing the  text, document and corpus header
-my $_data_file       = "data.xml";                   # name of file       containing the  primary text data (tokens)
-
 ## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
 my $_TOKENS_PROC     = 1;                            # on/off: processing of ${_TOKEN_TAG}'s (default: 1)
 
@@ -301,12 +298,12 @@
           retr_info(1, \$tree_data->[2] ); # parse input data
 
           if (DEBUG) {
-            $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
+            $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
           };
 
           # ~ write data.xml ~
           $data->to_zip(
-            $zipper->new_stream("$dir/${_data_file}"),
+            $zipper->new_stream("$dir/${_data_file}.xml"),
             $text_id_esc
           );
 
@@ -418,7 +415,7 @@
     if ($header) {
 
       # Write header to zip
-      my $file = $header->dir . '/' . $_header_file;
+      my $file = $header->dir . '/' . $_header_file . '.xml';
 
       $log->debug("Writing file $file") if DEBUG;
 
@@ -725,6 +722,25 @@
 to store inline structure information in.
 Defaults to C<struct> and C<structures>.
 
+=item B<--base-foundry> <foundry>
+
+Define the base foundry to store newly generated
+token information in.
+Defaults to C<base>.
+
+=item B<--data-file> <file>
+
+Define the file (without extension)
+to store primary data information in.
+Defaults to C<data>.
+
+=item B<--header-file> <file>
+
+Define the file name (without extension)
+to store header information on
+the corpus, document, and text level in.
+Defaults to C<header>.
+
 =item B<--use-tokenizer-sentence-splits|-s>
 
 Replace existing with, or add new, sentence boundary information

diff --git a/t/script.t b/t/script.t
index f9c336b..658cc79 100644
--- a/t/script.t
+++ b/t/script.t

@@ -453,6 +453,34 @@
     ;
 };
 
+
+subtest 'Check file structure with defined folder and filenames' => sub {
+  # Load example file
+  my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+  my $t = test_tei2korapxml(
+    tmp => 'script_out',
+    file => $file,
+    param => '-ti --base-foundry=root --data-file=primary --header-file=meta'
+  )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+    ->file_exists_not('GOE/AGA/00000/header.xml', 'Header not there')
+    ->file_exists_not('GOE/AGA/header.xml', 'Header not there')
+    ->file_exists_not('GOE/header.xml', 'Header not there')
+    ->file_exists_not('GOE/AGA/00000/data.xml', 'Data not there')
+    ->file_exists_not('GOE/AGA/00000/base/tokens_conservative.xml', 'Tokens not there')
+    ->file_exists_not('GOE/AGA/00000/base/tokens_aggressive.xml', 'Tokens not there')
+    ->file_exists('GOE/AGA/00000/meta.xml', 'Header there')
+    ->file_exists('GOE/AGA/meta.xml', 'Header there')
+    ->file_exists('GOE/meta.xml', 'Header there')
+    ->file_exists('GOE/AGA/00000/primary.xml', 'Data there')
+    ->file_exists('GOE/AGA/00000/root/tokens_conservative.xml', 'Tokens there')
+    ->file_exists('GOE/AGA/00000/root/tokens_aggressive.xml', 'Tokens there')
+    ;
+
+  $t->unzip_xml('GOE/AGA/00000/primary.xml')
+    ->content_like(qr/\Q&quot;Kriegstheater&quot;\E/)
+    ;
+};
+
 subtest 'Check Inline annotations with defined foundry and folder' => sub {
   # Load example file
   my $file = catfile($f, 'data', 'goe_sample_tagged.i5.xml');
commit	26a715249095791b18136ce8523c5694b2e1f468	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Feb 19 10:27:37 2021 +0100
committer	Akron <nils@diewald-online.de>	Tue Feb 23 15:21:40 2021 +0100
tree	1e58f0770df3a711259d1b6bb79590f4c02cd299
parent	dd0be8fc2e5774b207c2a92037ed0e5a28ecd59b [diff]