Add -s option to use sentence boundaries provided by KorAP tokenizer

Change-Id: Id3aaa50d7775256e336013cc0fbe56803c125052

commit: 985da0cafd247c9d6bb891ae862f9c4ac324820d [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Feb 15 19:29:50 2021 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Wed Feb 17 11:57:19 2021 +0100
tree: a0f7ea1889e4e2f62b35b39dbbebe60eab43884c
parent: f7084c4f4c5e24613cf7935be5eb2bf8c92ce804 [diff] [blame]
diff --git a/script/tei2korapxml b/script/tei2korapxml
index 690cf2d..f15376f 100755
--- a/script/tei2korapxml
+++ b/script/tei2korapxml

@@ -47,6 +47,7 @@
   'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
   'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
   'tokenizer-internal|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
+  'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
   'log|l=s' => \(my $log_level = 'notice'),
   'help|h'    => sub {
     pod2usage(
@@ -89,13 +90,17 @@
 ## extern tokenization
 my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
 
+if ($use_tokenizer_sentence_splits && !$tokenizer_korap) { # -s only works together with -tk
+  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
+}
+
 my $ext_tok;
 if ($tokenizer_call) {
   $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
 }
 
 elsif ($tokenizer_korap) {
-  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new;
+  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
 };
 my $_tok_file_ext  = "tokens.xml";
 ##
@@ -336,6 +341,10 @@
             $cons_tok->reset;
           };
 
+          if ($use_tokenizer_sentence_splits) {
+            $ext_tok->sentencize_from_previous_input($structures);
+          }
+
           # ~ write structures ~
           if (!$structures->empty) {
             $structures->to_zip(
@@ -469,6 +478,11 @@
   # (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
   my $rl = shift;
 
+  my $dummy_anno;
+  if ($use_tokenizer_sentence_splits) {
+    $dummy_anno = $structures->new_dummy_annotation();
+  }
+
   #  Notes on how 'XML::CompactTree::XS' works
   #
   #  Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
@@ -570,8 +584,14 @@
 
       # ~ handle structures ~
 
+      my $anno;
+
       # $e->[1] represents the tag name
-      my $anno = $structures->add_new_annotation($e->[1]);
+      if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
+        $anno = $dummy_anno;
+      } else {
+        $anno = $structures->add_new_annotation($e->[1]);
+      }
 
       # ~ handle tokens ~
 
@@ -875,6 +895,11 @@
 that will take an I<Aggressive> and a I<conservative>
 approach.
 
+=item B<--use-tokenizer-sentence-splits|-s>
+
+Replace existing sentence boundary information (or add it where missing) with the
+boundaries provided by the tokenizer. Currently only supported by the KorAP tokenizer.
+
 =item B<--log|-l>
 
 Loglevel for I<Log::Any>. Defaults to C<notice>.
commit	985da0cafd247c9d6bb891ae862f9c4ac324820d	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Feb 15 19:29:50 2021 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Wed Feb 17 11:57:19 2021 +0100
tree	a0f7ea1889e4e2f62b35b39dbbebe60eab43884c
parent	f7084c4f4c5e24613cf7935be5eb2bf8c92ce804 [diff] [blame]