Add ability to convert KorAP-XML base zips to CoNLL-U

Change-Id: I7ed7dc4a1f86769076b91247cfbdd408b7539641

commit: d8455833dabf1b5be6b896cf5b8e318c480d55ae [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Feb 11 17:30:29 2021 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Fri Feb 12 08:31:06 2021 +0100
tree: 8d9396cf01ab049e7d8fb53b95a909747402c0f1
parent: 396b4d6a994cfd70d86220fb332cd0d6966de1d6 [diff] [blame]
diff --git a/script/korapxml2conllu b/script/korapxml2conllu
index aca61c3..9dcec42 100755
--- a/script/korapxml2conllu
+++ b/script/korapxml2conllu

@@ -12,6 +12,8 @@
 my $text_no=0;
 my %opts;
 my %plain_texts;
+my %sentence_ends;
+
 my $usage=<<EOF;
 Usage: $0 [options] ZIPFILE [ZIPFILE...]
 
@@ -19,10 +21,12 @@
  -p pattern
 
 Description:
- Convert KorAP-XML morpho zip to CoNLL(-U) format with all information necessary
+ Convert KorAP-XML base or morpho zips to CoNLL(-U) format with all information necessary
  for reconstruction in comment lines.
 
 Examples:
+ $0 /vol/corpora/DeReKo/current/KorAP/zip/zca20.zip
+
  $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
 
  ZIPSIGLEPATTERN='-x "*15/FEB*" "*15/MAR*"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip
@@ -41,6 +45,7 @@
 my @current_lines;
 my %processedFilenames;
 my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
+my $baseOnly;
 
 my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);
 
@@ -73,20 +78,29 @@
   my @conll = ("_") x 10;
   my $filename;
 
-  my $morphocommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/morpho.xml' $zipsiglepattern |";
-  # print STDERR $morphocommand, "\n";
-  open (MORPHOPIPE, $morphocommand) or die "cannot unzip $morpho_zip";
-  open (PLAINTEXTPIPE, "$UNZIP -c $data_zip '*/${pattern}*/*/data.xml' $zipsiglepattern |") or die "cannot unzip $data_zip";
+  $baseOnly = $morpho_zip eq $data_zip;
+  my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
+  if(!$baseOnly) {
+    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/morpho.xml' $zipsiglepattern |";
+    $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/data.xml' $zipsiglepattern |";
+  } else {
+    $foundry = "base";
+    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/tokens.xml' $zipsiglepattern |";
+    $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
+  }
+
+  open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
+  open (PLAINTEXTPIPE, $plaintextAndStructureCommand) or die "cannot unzip $data_zip";
   print "$COMMENT_START foundry = $foundry\n";
-  while (<MORPHOPIPE>) {
+  while (<MORPHO_OR_TOKENPIPE>) {
     if (/^  inflating: (.*)/) {
       $filename=$1;
-      while($processedFilenames{$filename} && !eof(MORPHOPIPE)) {
+      while($processedFilenames{$filename} && !eof(MORPHO_OR_TOKENPIPE)) {
         print STDERR "WARNING: $filename already processed\n";
-        while (<MORPHOPIPE>) {
+        while (<MORPHO_OR_TOKENPIPE>) {
           last if(/\s+inflating:\s+(.*)/);
         }
-        $filename=$1 if(!eof(MORPHOPIPE) && /\s+inflating:\s+(.*)/);
+        $filename=$1 if(!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
       }
     } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
       last if($test && $text_no++ > 3);
@@ -102,7 +116,7 @@
           $first = 0;
       }
       if(!fetch_plaintext($docid)) { # skip this text
-        while (<MORPHOPIPE>) {
+        while (<MORPHO_OR_TOKENPIPE>) {
           last if(m@</layer>@);
         }
       }
@@ -139,6 +153,24 @@
       $token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
       @conll = ("_") x 10;
       $conll[$FORM_idx] = encode("utf-8", $token);
+      if($baseOnly) {
+        my @vals = ($current_from, $current_to);
+        print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
+        push @current_lines, \@vals;
+        $known++;
+        $conll[$ID_idx] = $#current_lines+1;
+        $current .= join("\t", @conll) . "\n"; # conll columns
+        fetch_plaintext($docid);
+        if ($sentence_ends{$docid}{$current_to}) {
+          $current .= "\n";
+          printTokenRanges();
+          print STDOUT $current;
+          $current = "";
+          $known = 0;
+          $unknown = 0;
+          @current_lines = ();
+        }
+      }
     } elsif (m@^\s*</fs>@) {
       my @vals = ($current_from, $current_to);
       print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
@@ -155,14 +187,14 @@
         $current=""; $known=0; $unknown=0;
         @current_lines = ();
       }
-      while (<MORPHOPIPE>) {
+      while (<MORPHO_OR_TOKENPIPE>) {
         last if (m@</span>@);    # only consider first interpretation
       }
     }
   }
   $current .= "\n";
   closeDoc(1);
-  close(MORPHOPIPE);
+  close(MORPHO_OR_TOKENPIPE);
   close(PLAINTEXTPIPE);
 }
 exit;
@@ -197,15 +229,26 @@
   my ($target_id) = @_;
   my $docid;
   my $text_started=0;
+  my ($current_id, $current_from, $current_to);
 
-  if($plain_texts{$target_id}) {
+  if($plain_texts{$target_id} && (!$baseOnly || $sentence_ends{$target_id}{-1})) {
 #    print STDERR "already got $target_id\n";
-    return;
+    return 1;
   }
   while(<PLAINTEXTPIPE>) {
     if(/<raw_text[^>]+docid="([^"]*)/) {
       $docid=$1;
       $text_started=0;
+    } elsif(/<layer[^>]+docid="([^"]*)/) {
+      $docid=$1;
+      $sentence_ends{$docid}{-1}=1;
+    } elsif(m@<span @) {
+        ($current_id) = /id="[^0-9]*([^\"]*)"/;
+        ($current_from) = /from="([^\"]*)"/;
+        ($current_to) = /to="([^\"]*)"/;
+    } elsif(m@<f\s[^>]*>s</f>@) {
+      print STDERR "Found sentence end for $docid \@$current_to\n" if($debug);
+      $sentence_ends{$docid}{$current_to}=1;
     } elsif (m@<text>(.*)</text>@) {
       $_= decode("utf-8", $1, Encode::FB_DEFAULT);
       s/&lt;/</go;
commit	d8455833dabf1b5be6b896cf5b8e318c480d55ae	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Feb 11 17:30:29 2021 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Fri Feb 12 08:31:06 2021 +0100
tree	8d9396cf01ab049e7d8fb53b95a909747402c0f1
parent	396b4d6a994cfd70d86220fb332cd0d6966de1d6 [diff] [blame]