Introduce --no-tokenizer parameter

Change-Id: Icc12c24dd3d01b0e31d14b6d5a6c6214da0d9918

commit: b93fabbadeeac0a402bda909791e310e364910d9 [log] [tgz]
author: Akron <nils@diewald-online.de> Fri Jan 13 12:05:44 2023 +0100
committer: Akron <nils@diewald-online.de> Mon Jan 16 13:47:15 2023 +0100
tree: f180848bb9cc4e21fc402d47accabfe76b747cdd
parent: d26319b6c1d6be74b2526f1b8aedc7a1c3dc0940 [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index f2553b4..54c5b9f 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -47,6 +47,7 @@
   'tokenizer-call|tc=s'   => \(my $tokenizer_call),
   'tokenizer-korap|tk'    => \(my $tokenizer_korap),
   'tokenizer-internal|ti' => \(my $tokenizer_intern),
+  'no-tokenizer'          => \(my $no_tokenizer),
   'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
   'inline-tokens=s'       => \(my $inline_tokens = 'tokens#morpho'),
   'inline-structures=s'   => \(my $inline_structures = 'struct#structure'),
@@ -120,6 +121,7 @@
   $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
 }
 
+# KorAP tokenization
 elsif ($tokenizer_korap) {
   eval {
     require KorAP::XML::TEI::Tokenizer::KorAP;
@@ -133,6 +135,12 @@
   };
 
   $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
+}
+
+# No tokenizer chosen and --no-tokenizer not passed
+elsif (!$tokenizer_intern && !$no_tokenizer) {
+  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
+  exit(1);
 };
 
 if ($use_tokenizer_sentence_splits) {
@@ -599,6 +607,13 @@
   $        --no-tokens --sentence-positions -' - \
   $        > corpus.korapxml.zip
 
+=item B<--no-tokenizer>
+
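+Boolean flag indicating that no tokenizer should be used.
+By default, the script exits with an error if no tokenizer is chosen,
+to ensure that a final token layer always exists; pass this flag
+if only internal tokens should be used.
+If a separate tokenizer is chosen, this flag is ignored.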
+
 =item B<--skip-inline-tokens>
 
 Boolean flag indicating that inline tokens should not
commit	b93fabbadeeac0a402bda909791e310e364910d9	[log] [tgz]
author	Akron <nils@diewald-online.de>	Fri Jan 13 12:05:44 2023 +0100
committer	Akron <nils@diewald-online.de>	Mon Jan 16 13:47:15 2023 +0100
tree	f180848bb9cc4e21fc402d47accabfe76b747cdd
parent	d26319b6c1d6be74b2526f1b8aedc7a1c3dc0940 [diff] [blame]