Introduce --base-foundry, --data-file, and --header-file parameters
Change-Id: I467b47cb0f60f84bed2b662f5dd177481a2758fe
diff --git a/Changes b/Changes
index 47836c2..7bdf664 100644
--- a/Changes
+++ b/Changes
@@ -1,5 +1,6 @@
- Remove unnecessary branch in recursive call
- Support inline-structures parameter
+ - Introduce --base-foundry, --data-file, and --header-file parameters
1.00 2021-02-18 Release
- -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)
diff --git a/Readme.pod b/Readme.pod
index 443a944..0ed0fb1 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -125,6 +125,24 @@
Define the foundry and file (without extension)
to store inline structure information in.
Defaults to C<struct> and C<structures>.
+=item B<--base-foundry> <foundry>
+
+Define the base foundry to store newly generated
+token information in.
+Defaults to C<base>.
+
+=item B<--data-file> <file>
+
+Define the file (without extension)
+to store primary data information in.
+Defaults to C<data>.
+
+=item B<--header-file> <file>
+
+Define the file (without extension)
+to store header information on
+the corpus, document, and text level in.
+Defaults to C<header>.
=item B<--use-tokenizer-sentence-splits|-s>
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 9a0cbf9..2e3aa0e 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -50,6 +50,9 @@
'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
+ 'base-foundry=s' => \(my $_tok_dir = 'base'),
+ 'data-file=s' => \(my $_data_file = 'data'),
+ 'header-file=s' => \(my $_header_file = 'header'),
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
pod2usage(
@@ -117,12 +120,6 @@
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##
-
-my $_tok_dir = "base"; # name of directory for storing tokenization files
-
-my $_header_file = "header.xml"; # name of files containing the text, document and corpus header
-my $_data_file = "data.xml"; # name of file containing the primary text data (tokens)
-
## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKEN_TAG}'s (default: 1)
@@ -301,12 +298,12 @@
retr_info(1, \$tree_data->[2] ); # parse input data
if (DEBUG) {
- $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
+ $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
};
# ~ write data.xml ~
$data->to_zip(
- $zipper->new_stream("$dir/${_data_file}"),
+ $zipper->new_stream("$dir/${_data_file}.xml"),
$text_id_esc
);
@@ -418,7 +415,7 @@
if ($header) {
# Write header to zip
- my $file = $header->dir . '/' . $_header_file;
+ my $file = $header->dir . '/' . $_header_file . '.xml';
$log->debug("Writing file $file") if DEBUG;
@@ -725,6 +722,25 @@
to store inline structure information in.
Defaults to C<struct> and C<structures>.
+=item B<--base-foundry> <foundry>
+
+Define the base foundry to store newly generated
+token information in.
+Defaults to C<base>.
+
+=item B<--data-file> <file>
+
+Define the file (without extension)
+to store primary data information in.
+Defaults to C<data>.
+
+=item B<--header-file> <file>
+
+Define the file (without extension)
+to store header information on
+the corpus, document, and text level in.
+Defaults to C<header>.
+
=item B<--use-tokenizer-sentence-splits|-s>
Replace existing with, or add new, sentence boundary information
diff --git a/t/script.t b/t/script.t
index f9c336b..658cc79 100644
--- a/t/script.t
+++ b/t/script.t
@@ -453,6 +453,34 @@
;
};
+
+subtest 'Check file structure with defined folder and filenames' => sub {
+  # Convert the sample with a custom base foundry, data file and header file name
+  my $file = catfile($f, 'data', 'goe_sample.i5.xml');
+  my $t = test_tei2korapxml(
+    tmp => 'script_out',
+    file => $file,
+    param => '-ti --base-foundry=root --data-file=primary --header-file=meta'
+  )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+    ->file_exists_not('GOE/AGA/00000/header.xml', 'Default text header not there')
+    ->file_exists_not('GOE/AGA/header.xml', 'Default document header not there')
+    ->file_exists_not('GOE/header.xml', 'Default corpus header not there')
+    ->file_exists_not('GOE/AGA/00000/data.xml', 'Default data file not there')
+    ->file_exists_not('GOE/AGA/00000/base/tokens_conservative.xml', 'Conservative tokens not in default foundry')
+    ->file_exists_not('GOE/AGA/00000/base/tokens_aggressive.xml', 'Aggressive tokens not in default foundry')
+    ->file_exists('GOE/AGA/00000/meta.xml', 'Renamed text header there')
+    ->file_exists('GOE/AGA/meta.xml', 'Renamed document header there')
+    ->file_exists('GOE/meta.xml', 'Renamed corpus header there')
+    ->file_exists('GOE/AGA/00000/primary.xml', 'Renamed data file there')
+    ->file_exists('GOE/AGA/00000/root/tokens_conservative.xml', 'Conservative tokens in renamed foundry')
+    ->file_exists('GOE/AGA/00000/root/tokens_aggressive.xml', 'Aggressive tokens in renamed foundry')
+    ;
+
+  $t->unzip_xml('GOE/AGA/00000/primary.xml') # primary text must be in the renamed data file
+    ->content_like(qr/\Q"Kriegstheater"\E/)
+    ;
+};
+
subtest 'Check Inline annotations with defined foundry and folder' => sub {
# Load example file
my $file = catfile($f, 'data', 'goe_sample_tagged.i5.xml');