Improve documentation and support for external tokenizers Change-Id: Ia65d4e9bcd2a28a7a77903dd49e2456dc566e7fe

commit: 1148478d879d06f0d39d66ad36a4c08e6da6598f [log] [tgz]
author: Akron <nils@diewald-online.de> Wed Nov 03 20:12:14 2021 +0100
committer: Akron <nils@diewald-online.de> Wed Nov 03 20:12:14 2021 +0100
tree: c2d9810c965384a31d725fd8ec1c11795622ca06
parent: a2cb2816ab9367332fb0d8d83617b4c1c0e26774 [diff]
diff --git a/Readme.pod b/Readme.pod
index 79180c4..5dff430 100644
--- a/Readme.pod
+++ b/Readme.pod

@@ -8,7 +8,7 @@
 
 =head1 SYNOPSIS
 
-  cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
+  cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
 
 =head1 DESCRIPTION
 
@@ -16,9 +16,6 @@
 L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
 based documents to the
 L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
-If no specific input is defined, data is
-read from C<STDIN>. If no specific output is defined, data is written
-to C<STDOUT>.
 
 This program is usually called from inside another script.
 
@@ -90,6 +87,12 @@
 
 =over 2
 
+=item B<--input|-i>
+
+The input file to process. If no specific input is defined and a single
+dash C<-> is passed as an argument, data is read from C<STDIN>.
+
+
 =item B<--root|-r>
 
 The root directory for output. Defaults to C<.>.
@@ -105,7 +108,23 @@
 =item B<--tokenizer-call|-tc>
 
 Call an external tokenizer process, that will tokenize
-a single line from STDIN and outputs one token per line.
+from STDIN and output the offsets of all tokens.
+
+Texts are separated using C<\x04\n>. The external process
+should add a new line per text.
+
+If the L</--use-tokenizer-sentence-splits> option is activated,
+sentences are marked by offset as well in new lines.
+
+To use L<Datok|https://github.com/KorAP/Datok> including sentence
+splitting, call C<tei2korapxml> as follows:
+
+  $ cat corpus.i5.xml | tei2korapxml -s \
+      -tc 'datok tokenize \
+           -t ./tokenizer.matok \
+           -p --newline-after-eot --no-sentences \
+           --no-tokens --sentence-positions -' - \
+           > corpus.korapxml.zip
 
 =item B<--tokenizer-korap|-tk>
 
@@ -180,7 +199,9 @@
 =item B<--use-tokenizer-sentence-splits|-s>
 
 Replace existing with, or add new, sentence boundary information
-provided by the KorAP tokenizer (currently supported only).
+provided by the tokenizer.
+Currently KorAP-tokenizer and certain external tokenizers support
+these boundaries.
 
 =item B<--tokens-file> <file>
 

diff --git a/lib/KorAP/XML/TEI/Tokenizer/External.pm b/lib/KorAP/XML/TEI/Tokenizer/External.pm
index 7e3fa5d..92f0c31 100644
--- a/lib/KorAP/XML/TEI/Tokenizer/External.pm
+++ b/lib/KorAP/XML/TEI/Tokenizer/External.pm

@@ -172,6 +172,14 @@
   };
 };
 
+
+# Set sentence split option
+sub sentence_splits {
+  my ($self, $bool) = @_;
+  $self->{sentence_split} = !!$bool;
+};
+
+
 sub sentencize_from_previous_input {
   my ($self, $structures) = @_;
 

diff --git a/script/tei2korapxml b/script/tei2korapxml
index 6f3ad1f..e7ffd17 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -96,15 +96,6 @@
 # TODO: IDS-specific (and redundant)
 my $_HEADER_TAG = 'idsHeader';
 
-
-# Define tokenizers
-if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
-  die $log->fatal(
-    'Sentence splitting is currently only supported by KorAP tokenizer ' .
-      '(use -tk to activate it)'
-    );
-};
-
 # Remember to skip certain inline tags
 my %skip_inline_tags = ();
 if ($skip_inline_tags_str) {
@@ -117,15 +108,16 @@
 my $ext_tok;
 if ($tokenizer_call) {
   $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
+  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
 }
 
 elsif ($tokenizer_korap) {
   $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
-  if ($use_tokenizer_sentence_splits) {
-    $skip_inline_tags{s} = 1;
-  };
 };
 
+if ($use_tokenizer_sentence_splits) {
+  $skip_inline_tags{s} = 1;
+};
 
 # Internal tokenization
 my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
@@ -526,7 +518,23 @@
 =item B<--tokenizer-call|-tc>
 
 Call an external tokenizer process, that will tokenize
-a single line from STDIN and outputs one token per line.
+from STDIN and output the offsets of all tokens.
+
+Texts are separated using C<\x04\n>. The external process
+should add a new line per text.
+
+If the L</--use-tokenizer-sentence-splits> option is activated,
+sentences are marked by offset as well in new lines.
+
+To use L<Datok|https://github.com/KorAP/Datok> including sentence
+splitting, call C<tei2korapxml> as follows:
+
+  $ cat corpus.i5.xml | tei2korapxml -s \
+      -tc 'datok tokenize \
+           -t ./tokenizer.matok \
+           -p --newline-after-eot --no-sentences \
+           --no-tokens --sentence-positions -' - \
+           > corpus.korapxml.zip
 
 =item B<--tokenizer-korap|-tk>
 
@@ -601,7 +609,9 @@
 =item B<--use-tokenizer-sentence-splits|-s>
 
 Replace existing with, or add new, sentence boundary information
-provided by the KorAP tokenizer (currently supported only).
+provided by the tokenizer.
+Currently KorAP-tokenizer and certain external tokenizers support
+these boundaries.
 
 =item B<--tokens-file> <file>
commit	1148478d879d06f0d39d66ad36a4c08e6da6598f	[log] [tgz]
author	Akron <nils@diewald-online.de>	Wed Nov 03 20:12:14 2021 +0100
committer	Akron <nils@diewald-online.de>	Wed Nov 03 20:12:14 2021 +0100
tree	c2d9810c965384a31d725fd8ec1c11795622ca06
parent	a2cb2816ab9367332fb0d8d83617b4c1c0e26774 [diff]