Introduce --tokens-file parameter
Change-Id: I3438b30b065d61533ca9483084096e9ccadf696d
diff --git a/Changes b/Changes
index 7bdf664..2783199 100644
--- a/Changes
+++ b/Changes
@@ -1,6 +1,7 @@
- Remove unnecessary branch in recursive call
- Support inline-structures parameter
- Introduce --base-foundry, --data-file, and --header-file parameters
+ - Introduce --tokens-file parameter
1.00 2021-02-18 Release
- -s option added that uses sentence boundaries provided by the KorAP tokenizer (-tk)
diff --git a/Readme.pod b/Readme.pod
index 0ed0fb1..e627275 100644
--- a/Readme.pod
+++ b/Readme.pod
@@ -149,6 +149,13 @@
Replace existing with, or add new, sentence boundary information
provided by the KorAP tokenizer (currently supported only).
+=item B<--tokens-file> <file>
+
+Define the file (without extension)
+to store generated token information in
+(either from the KorAP tokenizer or an externally called tokenizer).
+Defaults to C<tokens>.
+
=item B<--log|-l>
Loglevel for I<Log::Any>. Defaults to C<notice>.
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
index 3044045..b5a2ed3 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Aggressive.pm
@@ -41,4 +41,9 @@
};
+# Base name (without the '.xml' extension) of the file the aggressive tokenization is written to
+sub name {
+ 'tokens_aggressive';
+};
+
1;
diff --git a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
index 2b68786..f08f430 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/Conservative.pm
@@ -87,4 +87,10 @@
};
+# Base name (without the '.xml' extension) of the file the conservative tokenization is written to
+sub name {
+ 'tokens_conservative';
+};
+
+
1;
diff --git a/script/tei2korapxml b/script/tei2korapxml
index cbd0b1e..152eaac 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -46,13 +46,14 @@
"input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
- 'tokenizer-internal|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
+ 'tokenizer-internal|ti' => \(my $_GEN_TOK_INT), # use intern tokenization (default = no)
'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
'base-foundry=s' => \(my $_tok_dir = 'base'),
'data-file=s' => \(my $_data_file = 'data'),
'header-file=s' => \(my $_header_file = 'header'),
+ 'tokens-file=s' => \(my $_tok_file_ext = 'tokens'),
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
pod2usage(
@@ -103,7 +104,6 @@
elsif ($tokenizer_korap) {
$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};
-my $_tok_file_ext = "tokens.xml";
##
@@ -113,9 +113,6 @@
## intern tokenization
-my $_GEN_TOK_INT = $tokenizer_intern; # simple tokenization (recommended for testing)
-my $_tok_file_con = "tokens_conservative.xml";
-my $_tok_file_agg = "tokens_aggressive.xml";
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##
@@ -266,7 +263,7 @@
# ~ end of text body ~
if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
- # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
+ # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files
if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
@@ -312,7 +309,7 @@
# Tokenize and output
$ext_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
+ $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
$text_id_esc
);
};
@@ -321,12 +318,12 @@
# Tokenize and output
$cons_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
+ $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
$text_id_esc
);
$aggr_tok->tokenize($data->data)->to_zip(
- $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
+ $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
$text_id_esc
);
@@ -745,6 +742,13 @@
Replace existing with, or add new, sentence boundary information
provided by the KorAP tokenizer (currently supported only).
+=item B<--tokens-file> <file>
+
+Define the file (without extension)
+to store generated token information in
+(either from the KorAP tokenizer or an externally called tokenizer).
+Defaults to C<tokens>.
+
=item B<--log|-l>
Loglevel for I<Log::Any>. Defaults to C<notice>.
diff --git a/t/script.t b/t/script.t
index 8249f79..c7506d5 100644
--- a/t/script.t
+++ b/t/script.t
@@ -175,6 +175,29 @@
->element_count_is('spanList span', 227);
};
+subtest 'Tokenize with external tokenizer and defined tokens file' => sub {
+ # --tokens-file=yadda must write base/yadda.xml instead of the default base/tokens.xml
+ my $cmd = catfile($f, 'cmd', 'tokenizer.pl');
+
+ test_tei2korapxml(
+ file => $file,
+ param => "-tc='perl $cmd' --tokens-file=yadda",
+ tmp => 'script_out2'
+ )
+ ->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
+ ->file_exists_not('GOE/AGA/00000/base/tokens.xml')
+ ->file_readable('GOE/AGA/00000/base/yadda.xml')
+ ->unzip_xml('GOE/AGA/00000/base/yadda.xml')
+ ->attr_is('spanList span:nth-child(1)', 'to', 8)
+ ->attr_is('spanList span#t_1', 'from', 9)
+ ->attr_is('spanList span#t_1', 'to', 11)
+ ->attr_is('spanList span#t_67', 'from', 427)
+ ->attr_is('spanList span#t_67', 'to', 430)
+ ->attr_is('spanList span#t_214', 'from', 1209)
+ ->attr_is('spanList span#t_214', 'to', 1212)
+ ->element_count_is('spanList span', 227);
+};
+
subtest 'Check KorAP tokenizer for infinite loop bug' => sub {
my $file = catfile($f, 'data', 'korap_tokenizer_challenge.xml');
@@ -368,7 +391,7 @@
tmp => 'script_out',
file => $file,
param => '-ti --inline-structures=myfoundry#mystr'
- )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+ )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
->file_exists_not('GOE/AGA/00000/struct/structure.xml', 'Structure not generated')
->unzip_xml('GOE/AGA/00000/myfoundry/mystr.xml')
->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
@@ -384,7 +407,7 @@
tmp => 'script_out',
file => $file,
param => '-ti --inline-structures=myfoundry'
- )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+ )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
->file_exists_not('GOE/AGA/00000/struct/structure.xml', 'Structure not generated')
->unzip_xml('GOE/AGA/00000/myfoundry/structure.xml')
->text_is('span[id=s3] *[name=type]', 'Autobiographie', 'text content')
@@ -461,7 +484,7 @@
tmp => 'script_out',
file => $file,
param => '-ti --base-foundry=root --data-file=primary --header-file=meta'
- )->stderr_like(qr!tei2korapxml: .*? text_id=GOE_AGA\.00000!)
+ )->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
->file_exists_not('GOE/AGA/00000/header.xml', 'Header not there')
->file_exists_not('GOE/AGA/header.xml', 'Header not there')
->file_exists_not('GOE/header.xml', 'Header not there')