Introduce --no-tokenizer parameter
Change-Id: Icc12c24dd3d01b0e31d14b6d5a6c6214da0d9918
diff --git a/Changes b/Changes
index 5c4f541..444ff50 100644
--- a/Changes
+++ b/Changes
@@ -3,6 +3,8 @@
- Option --xmlid-to-textsigle <from-regex>@<to-c/to-d/to-t>
added to convert standard P5 text id attributes to I5
sigles with three parts.
+ - Add --no-tokenizer parameter, which is now required
+ when relying on inline tokens only.
2.3.4 2022-11-09
- Improve stability of XML entity replacement.
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f2553b4..54c5b9f 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml
@@ -47,6 +47,7 @@
'tokenizer-call|tc=s' => \(my $tokenizer_call),
'tokenizer-korap|tk' => \(my $tokenizer_korap),
'tokenizer-internal|ti' => \(my $tokenizer_intern),
+ 'no-tokenizer' => \(my $no_tokenizer),
'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
@@ -120,6 +121,7 @@
$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}
+# KorAP tokenization
elsif ($tokenizer_korap) {
eval {
require KorAP::XML::TEI::Tokenizer::KorAP;
@@ -133,6 +135,12 @@
};
$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
+}
+
+# Neither a tokenizer nor --no-tokenizer chosen
+elsif (!$tokenizer_intern && !$no_tokenizer) {
+ $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
+ exit(1);
};
if ($use_tokenizer_sentence_splits) {
@@ -599,6 +607,13 @@
$ --no-tokens --sentence-positions -' - \
$ > corpus.korapxml.zip
+=item B<--no-tokenizer>
+
+Boolean flag indicating that no tokenizer should be used.
+Without this flag a tokenizer has to be chosen, which ensures that
+by default a final token layer always exists.
+If a separate tokenizer is chosen, this flag is ignored.
+
=item B<--skip-inline-tokens>
Boolean flag indicating that inline tokens should not
diff --git a/t/script.t b/t/script.t
index e7b93f3..80b67b5 100644
--- a/t/script.t
+++ b/t/script.t
@@ -503,7 +503,8 @@
my $t = test_tei2korapxml(
file => $file,
env => 'KORAPXMLTEI_INLINE=1',
- tmp => 'script_tagged'
+ tmp => 'script_tagged',
+ param => '--no-tokenizer'
)
->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
->stderr_like(qr!KORAPXMLTEI_INLINE is deprecated!)
@@ -586,7 +587,7 @@
my $t = test_tei2korapxml(
file => $file,
tmp => 'script_tagged',
- param => '--inline-tokens=myfoundry#myfile --skip-inline-token-annotations=0'
+ param => '--inline-tokens=myfoundry#myfile --skip-inline-token-annotations=0 --no-tokenizer'
)
->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
->stderr_unlike(qr!KORAPXMLTEI_INLINE is deprecated!)
@@ -606,7 +607,7 @@
$t = test_tei2korapxml(
file => $file,
tmp => 'script_tagged',
- param => '--inline-tokens=myfoundry --skip-inline-token-annotations=0'
+ param => '--inline-tokens=myfoundry --skip-inline-token-annotations=0 --no-tokenizer'
)
->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
@@ -632,14 +633,14 @@
# Generate zip file (unportable!)
stderr_like(
- sub { `cat '$file' | perl '$script' --skip-token-inline-annotations=0 - > '$outzip'` },
+ sub { `cat '$file' | perl '$script' --skip-token-inline-annotations=0 --no-tokenizer - > '$outzip'` },
qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!,
'Processing 1'
);
# TODO: there should be a better way to test this
stderr_unlike(
- sub { `cat '$file' | perl '$script' --skip-token-inline-annotations=0 - > '$outzip'` },
+ sub { `cat '$file' | perl '$script' --skip-token-inline-annotations=0 --no-tokenizer - > '$outzip'` },
qr!.*undefined value.*!,
'Processing 2'
);
@@ -667,7 +668,7 @@
test_tei2korapxml(
file => catfile($f, 'data', 'goe_sample.i5.xml'),
tmp => 'script_utf8_enc',
- param => '--skip-inline-token-annotations=0',
+ param => '--skip-inline-token-annotations=0 --no-tokenizer',
)
->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
->unzip_xml('GOE/AGA/00000/data.xml')
@@ -677,7 +678,7 @@
test_tei2korapxml(
file => catfile($f, 'data', 'goe_sample.i5.iso.xml'),
- param => '--skip-inline-token-annotations=0',
+ param => '--skip-inline-token-annotations=0 --no-tokenizer',
tmp => 'script_iso_enc'
)
->stderr_like(qr!tei2korapxml:.*? text_id=GOE_AGA\.00000!)
@@ -730,7 +731,7 @@
test_tei2korapxml(
tmp => 'script_out',
file => $file,
- param => '-l=warn'
+ param => '-l=warn --no-tokenizer'
)->stderr_is('');
};
@@ -763,13 +764,13 @@
test_tei2korapxml(
tmp => 'script_out',
file => $file,
- param => '-rv=' . $KorAP::XML::TEI::Tokenizer::KorAP::VERSION
+ param => '-rv=' . $KorAP::XML::TEI::Tokenizer::KorAP::VERSION . ' --no-tokenizer'
)->stderr_like(qr!GOE_AGA\.00000!);
test_tei2korapxml(
tmp => 'script_out',
file => $file,
- param => '-rv= " ' . $KorAP::XML::TEI::Tokenizer::KorAP::VERSION . ' "'
+ param => '-rv= " ' . $KorAP::XML::TEI::Tokenizer::KorAP::VERSION . ' " --no-tokenizer'
)->stderr_like(qr!GOE_AGA\.00000!);
};
@@ -789,4 +790,12 @@
};
+subtest 'Require tokenizer' => sub {
+
+ my $t = test_tei2korapxml(
+ file => catfile($f, 'data', 'icc_german_sample.p5.xml'),
+ tmp => 'script_utf8_enc'
+ )->stderr_like(qr!No tokenizer chosen!);
+};
+
done_testing;
diff --git a/t/tei.t b/t/tei.t
index 4d068a6..98b945e 100644
--- a/t/tei.t
+++ b/t/tei.t
@@ -53,7 +53,8 @@
text => "<!--\nDies ist ein\nmehrzeiligerKommentar -->Text1",
textSigle => 'A/B.1',
pattern => 'xx'
- }
+ },
+ param => '--no-tokenizer'
)
->file_exists('A/B/1/data.xml')
->unzip_xml('A/B/1/data.xml')
@@ -67,7 +68,8 @@
text => "Nur ein Test",
textSigle => '',
pattern => 'missing_dir'
- }
+ },
+ param => '--no-tokenizer'
)
->file_exists_not('A/B/1/data.xml')
->stderr_like(qr!Empty '<textSigle />' \(L29\) in header!)