Introduce --base-foundry, --data-file, and --header-file parameters

Change-Id: I467b47cb0f60f84bed2b662f5dd177481a2758fe

commit: 26a715249095791b18136ce8523c5694b2e1f468 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Feb 19 10:27:37 2021 +0100
committer: Akron <nils@diewald-online.de> Tue Feb 23 15:21:40 2021 +0100
tree: 1e58f0770df3a711259d1b6bb79590f4c02cd299
parent: dd0be8fc2e5774b207c2a92037ed0e5a28ecd59b [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 9a0cbf9..2e3aa0e 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -50,6 +50,9 @@
   'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
   'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
   'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
+  'base-foundry=s' => \(my $_tok_dir = 'base'),
+  'data-file=s' => \(my $_data_file = 'data'),
+  'header-file=s' => \(my $_header_file = 'header'),
   'log|l=s' => \(my $log_level = 'notice'),
   'help|h'    => sub {
     pod2usage(
@@ -117,12 +120,6 @@
 my $cons_tok       = KorAP::XML::TEI::Tokenizer::Conservative->new;
 ##
 
-
-my $_tok_dir         = "base";                       # name of directory for storing tokenization files
-
-my $_header_file     = "header.xml";                 # name of files      containing the  text, document and corpus header
-my $_data_file       = "data.xml";                   # name of file       containing the  primary text data (tokens)
-
 ## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
 my $_TOKENS_PROC     = 1;                            # on/off: processing of ${_TOKEN_TAG}'s (default: 1)
 
@@ -301,12 +298,12 @@
           retr_info(1, \$tree_data->[2] ); # parse input data
 
           if (DEBUG) {
-            $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
+            $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
           };
 
           # ~ write data.xml ~
           $data->to_zip(
-            $zipper->new_stream("$dir/${_data_file}"),
+            $zipper->new_stream("$dir/${_data_file}.xml"),
             $text_id_esc
           );
 
@@ -418,7 +415,7 @@
     if ($header) {
 
       # Write header to zip
-      my $file = $header->dir . '/' . $_header_file;
+      my $file = $header->dir . '/' . $_header_file . '.xml';
 
       $log->debug("Writing file $file") if DEBUG;
 
@@ -725,6 +722,25 @@
 to store inline structure information in.
 Defaults to C<struct> and C<structure>.
 
+=item B<--base-foundry> <foundry>
+
+Define the base foundry to store newly generated
+token information in.
+Defaults to C<base>.
+
+=item B<--data-file> <file>
+
+Define the file name (without the C<.xml> extension)
+to store primary data in.
+Defaults to C<data>.
+
+=item B<--header-file> <file>
+
+Define the file name (without the C<.xml> extension)
+in which header information at the corpus,
+document, and text levels is stored.
+Defaults to C<header>.
+
 =item B<--use-tokenizer-sentence-splits|-s>
 
 Replace existing with, or add new, sentence boundary information
commit	26a715249095791b18136ce8523c5694b2e1f468	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Feb 19 10:27:37 2021 +0100
committer	Akron <nils@diewald-online.de>	Tue Feb 23 15:21:40 2021 +0100
tree	1e58f0770df3a711259d1b6bb79590f4c02cd299
parent	dd0be8fc2e5774b207c2a92037ed0e5a28ecd59b [diff] [blame]