k2c: by default use s bounds from structure.xml; use the --s-bounds-from-morpho option otherwise. Change-Id: Ic8321767d41416283dd5081620a1c07710fc3460

commit: 15c84fdbaccd94b79aed7f8b55596c4be5fbbbb0 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Tue Oct 12 12:20:27 2021 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Tue Oct 12 12:20:27 2021 +0200
tree: 7040f70bee83460aadde93f4a9e167b4bc0c1220
parent: 97ba2ba98c64c66f695628a78c35fc1e466507d8 [diff] [blame]
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index 9d7b7d5..6c7e9b8 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu

@@ -30,6 +30,7 @@
 GetOptions(
   'sigle-pattern|p=s'            => \(my $sigle_pattern = ''),
   'extract-attributes-regex|e=s' => \(my $extract_attributes_regex = ''),
+  's-bounds-from-morpho'         => \(my $s_bounds_from_morpho = 0),
   'log|l=s'                      => \(my $log_level = 'warn'),
   'columns|c=n'                  => \(my $columns = 10),
 
@@ -99,9 +100,10 @@
   my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
   if (!$baseOnly) {
     $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${sigle_pattern}*/*/*/morpho.xml' $zipsiglepattern |";
-    if ($extract_attributes_regex) {
+    if ($extract_attributes_regex || !$s_bounds_from_morpho) {
       $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
     } else {
+      $log->debug("Not reading structure information.");
       $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${sigle_pattern}*/*/data.xml' $zipsiglepattern |";
     }
   } else {
@@ -203,6 +205,7 @@
         }
         fetch_plaintext($docid);
         if ($sentence_ends{$docid}{$current_to}) {
+          $log->debug("Using sentence end for $docid \@$current_to");
           $current .= "\n";
           printTokenRanges();
           print STDOUT $current;
@@ -223,7 +226,8 @@
       } else {
         $current .= join("\t", @conll[0..$columns-1]) . "\n"; # conll columns
       }
-      if($conll[$XPOS_idx] eq '$.' || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.') || $known + $unknown >= $MAX_SENTENCE_LENGTH) {
+      if($sentence_ends{$docid}{$current_to} || ($s_bounds_from_morpho && $conll[$XPOS_idx] eq '$.' || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.')) || $known + $unknown >= $MAX_SENTENCE_LENGTH) {
+        $log->debug("Using sentence end for $docid \@$current_to");
         $current .= "\n";
         if($known + $unknown > 0) { # only print sentence if it contains some words
           printTokenRanges();
@@ -277,7 +281,7 @@
   my $text_count = 0;
   my ($current_id, $current_from, $current_to);
 
-  if($plain_texts{$target_id} && (!$baseOnly || $sentence_ends{$target_id}{-1})) {
+  if($plain_texts{$target_id} && ($s_bounds_from_morpho || $sentence_ends{$target_id})) {
     $log->debug("Already got $target_id");
     return 1;
   }
@@ -288,14 +292,17 @@
       $text_started=0;
     } elsif(/<layer[^>]+docid="([^"]*)/) {
       $docid=$1;
-      $sentence_ends{$docid}{-1}=1;
     } elsif(m@<span @) {
         ($current_id) = /id="[^0-9]*([^\"]*)"/;
         ($current_from) = /from="([^\"]*)"/;
         ($current_to) = /to="([^\"]*)"/;
     } elsif(m@<f\s[^>]*>s</f>@) {
-      $log->debug("Found sentence end for $docid \@$current_to");
-      $sentence_ends{$docid}{$current_to}=1;
+      if ($s_bounds_from_morpho) {
+        $log->debug("Ignoring sentence end for $docid \@$current_to because of --s-bounds-from-morpho");
+      } else {
+        $log->debug("Found sentence end for $docid \@$current_to");
+        $sentence_ends{$docid}{$current_to} = 1;
+      }
     } elsif($extract_attributes_regex && m@<f\sname="name"[^>]*>([^<]+)</f>@) {
       my $current_element = $1;
       $log->debug("Looking for matching attributes in $docid");
@@ -395,6 +402,10 @@
 
 Print n columns (default: 10). If n=1, only the token itself is printed.
 
+=item B<--s-bounds-from-morpho>
+
+Get sentence boundary information from tagger output rather than from s annotation in structure.xml files.
+
 =item B<--help|-h>
 
 Print help information.
commit	15c84fdbaccd94b79aed7f8b55596c4be5fbbbb0	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Oct 12 12:20:27 2021 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Tue Oct 12 12:20:27 2021 +0200
tree	7040f70bee83460aadde93f4a9e167b4bc0c1220
parent	97ba2ba98c64c66f695628a78c35fc1e466507d8 [diff] [blame]