Introduce --base-foundry, --data-file, and --header-file parameters
Change-Id: I467b47cb0f60f84bed2b662f5dd177481a2758fe
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 9a0cbf9..2e3aa0e 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -50,6 +50,9 @@
'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
+ 'base-foundry=s' => \(my $_tok_dir = 'base'),
+ 'data-file=s' => \(my $_data_file = 'data'),
+ 'header-file=s' => \(my $_header_file = 'header'),
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
pod2usage(
@@ -117,12 +120,6 @@
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##
-
-my $_tok_dir = "base"; # name of directory for storing tokenization files
-
-my $_header_file = "header.xml"; # name of files containing the text, document and corpus header
-my $_data_file = "data.xml"; # name of file containing the primary text data (tokens)
-
## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKEN_TAG}'s (default: 1)
@@ -301,12 +298,12 @@
retr_info(1, \$tree_data->[2] ); # parse input data
if (DEBUG) {
- $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
+ $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
};
# ~ write data.xml ~
$data->to_zip(
- $zipper->new_stream("$dir/${_data_file}"),
+ $zipper->new_stream("$dir/${_data_file}.xml"),
$text_id_esc
);
@@ -418,7 +415,7 @@
if ($header) {
# Write header to zip
- my $file = $header->dir . '/' . $_header_file;
+ my $file = $header->dir . '/' . $_header_file . '.xml';
$log->debug("Writing file $file") if DEBUG;
@@ -725,6 +722,25 @@
to store inline structure information in.
Defaults to C<struct> and C<structures>.
+=item B<--base-foundry> <foundry>
+
+Define the base foundry to store newly generated
+token information in.
+Defaults to C<base>.
+
+=item B<--data-file> <file>
+
+Define the file name (without extension)
+to store primary data information in.
+Defaults to C<data>.
+
+=item B<--header-file> <file>
+
+Define the file name (without extension)
+to store corpus-, document-, and
+text-level header information in.
+Defaults to C<header>.
+
=item B<--use-tokenizer-sentence-splits|-s>
Replace existing with, or add new, sentence boundary information