Provide conllu2korapxml to convert from CoNLL-U to KorAP-XML zip

Change-Id: I8913abac4713800bf38b38935004fd6ee416aab1
diff --git a/script/conllu2korapxml b/script/conllu2korapxml
new file mode 100755
index 0000000..b12886c
--- /dev/null
+++ b/script/conllu2korapxml
@@ -0,0 +1,185 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use POSIX;
+use Getopt::Std;
+use Encode;
+use IO::Compress::Zip qw(zip $ZipError :constants);
+use File::Basename;
+
+my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;  # compression method for every zip member
+my %opts;                                  # command line switches, filled in by getopts()
+my %processedFilenames;                    # "# filename" values seen so far, to warn about repeats
+
+my $usage=<<EOF;
+Usage: $0 [options] [CoNLL-U-FILE...]
+
+Options:
+ -d        debug
+ -h        print this help and exit
+Description:
+ Converts CoNLL-U files that follow KorAP-specific comment conventions
+ and contain morphosyntactic and/or dependency annotations to
+ corresponding KorAP-XML zip files.
+
+Examples:
+ $0 zca20.spacy.conllu > zca20.spacy.zip
+
+ $0 < zca20.spacy.conllu > zca20.spacy.zip
+EOF
+
+getopts('hd', \%opts) or die $usage;       # an unknown switch aborts with the usage text instead of being ignored
+die $usage if($opts{h});                   # die prints the help to STDERR, keeping it out of the zip on STDOUT
+my $debug=($opts{d}? 1 : 0);
+
+my $docid="";              # id of the text currently being converted
+my $zip = undef;           # IO::Compress::Zip object, created by the first newZipStream() call
+my $outh = \*STDOUT;       # the zip archive is written to standard output
+my $parser_file;           # zip member name of the current dependency layer
+my $parse;                 # buffered XML of the current dependency layer
+my $morpho_file;           # zip member name of the current morpho layer
+my $morpho;                # buffered XML of the current morpho layer
+my @spansFrom;             # start offsets from the latest "start_offsets"/"from" comment
+my @spansTo;               # end offsets from the latest "end_offsets"/"to" comment
+my $current;               # never read anywhere in this script
+my ($unknown, $known) = (0, 0);  # never read anywhere in this script
+
+my ($write_morpho, $write_syntax) = (1, 0);  # which layers closeDoc() writes for the current text
+my $filename;              # value of the latest "# filename" comment
+my $foundry_name;          # value of the latest "# foundry" comment
+my $first=1;               # true until the first "# filename" comment has been seen
+my @conllu_files = @ARGV;  # taken after getopts() has removed the switches from @ARGV
+push @conllu_files, "-" if (@conllu_files == 0);  # no file arguments: read standard input
+my $fh;                    # handle of the input currently being read
+# Main conversion loop: stream every input line by line, buffer the morpho and dependency layers of the current
+# text in $morpho/$parse, and flush them through closeDoc() when the next "# filename" comment arrives. The last
+# text is flushed and the zip finished only once, after all inputs, so several CoNLL-U files yield one archive.
+my $number = qr/[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?/;  # one plain or scientific-notation number
+my %xml_entity_for = ('&' => '&amp;', '<' => '&lt;', '>' => '&gt;', '"' => '&quot;');
+foreach my $conllu_file (@conllu_files) {
+  if ($conllu_file eq '-') {
+    $fh = \*STDIN;
+  } else {
+    open($fh, "<", $conllu_file) or die "Cannot open $conllu_file: $!\n";
+  }
+  my $s=0;              # sentence counter within this input; part of every span id
+  my $lastDocSigle="";  # document sigle most recently announced on STDERR
+  while (<$fh>) {
+    chomp;              # strip the newline first so the last column needs no special handling
+    if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
+      $filename=$1;
+      if ($first) { $first=0 } else { closeDoc(0) }  # a new text begins: write out the previous one
+      print STDERR "WARNING: $filename is already processed\n" if($processedFilenames{$filename});
+      $processedFilenames{$filename}=1;
+    } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) {
+      $foundry_name=$1;
+      print STDERR "Foundry: $foundry_name\n" if($debug);
+    } elsif(/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
+      $docid=$1;
+      my $docSigle = $docid;
+      $docSigle =~ s/\..*//;
+      if($docSigle ne $lastDocSigle) {
+        print STDERR "Analyzing $docSigle\n";
+        $lastDocSigle = $docSigle;
+      }
+      # Zip member names: the "# filename" path minus its last two components, plus "/<foundry>/<layer>.xml".
+      $parser_file = dirname($filename);
+      $parser_file =~ s@(.*)/[^/]+$@$1@;
+      $morpho_file = $parser_file;
+      $morpho_file .= "/$foundry_name/morpho.xml";
+      $parser_file .= "/$foundry_name/dependency.xml";
+      $parse = $morpho = layer_header($docid);
+    }  elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
+      @spansFrom = split(/\s+/, $1);
+    }  elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
+      @spansTo = split(/\s+/, $1);
+    } elsif (/^#/) {
+      next;             # any other comment (e.g. "# text = ...") carries no annotation and must not be read as a token
+    } elsif (! /^\s*$/) {
+      my @parsed=split(/\t/, $_, -1);  # limit -1 keeps trailing empty columns, so the count below is exact
+      if(@parsed != 10) {
+        print STDERR "WARNING: skipping strange parser output line in $docid\n";
+        next;
+      }
+      # Lemma, POS, features, relation label and MISC are copied into XML, so escape markup characters.
+      s/([&<>"])/$xml_entity_for{$1}/g for @parsed[2,3,5,7,9];
+      my $t=$parsed[0];
+      $s++ if($t == 1);   # token id 1 opens a new sentence
+      if($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
+        $write_syntax=1;
+        my $from=$spansFrom[$parsed[6]];
+        my $to=$spansTo[$parsed[6]];
+          $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
+<rel label="$parsed[7]">
+<span from="$from" to="$to"/>
+</rel>
+</span>
+@;
+        }
+        $morpho .= qq(  <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="lemma">$parsed[2]</f>
+      <f name="pos">$parsed[3]</f>
+);
+      $morpho .= qq(      <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_");
+      if($parsed[9] ne "_") {
+        if ($parsed[9] =~ /^$number(?:\s+$number)*$/) {  # whole column is numeric, not merely containing a digit or "e"
+          $morpho .= qq(      <f name="certainty">$parsed[9]</f>\n)
+        }
+        else {
+          $morpho .= qq(      <f name="misc">$parsed[9]</f>\n)
+        }
+      }
+      $morpho .= qq(     </fs>
+    </f>
+   </fs>
+  </span>
+);
+    }
+  }
+  close($fh);
+}
+# Flush the last text and finish the archive exactly once, and only if some text was actually read.
+if (defined $morpho_file) {
+  closeDoc(1);
+  if (defined $zip) { $zip->close() or die "ERROR: closing zip failed: $ZipError\n" }
+} else {
+  print STDERR "WARNING: no text found in the input, no zip written\n";
+}
+exit;
+
+sub newZipStream {    # Begin a new member named $fname in the output zip, creating the archive on first use; dies on failure.
+  my ($fname) = @_;
+  if (defined $zip) {    # archive already open: newStream() ends the current member and starts the next one
+    $zip->newStream(Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
+        Append            => 1, Name => $fname)
+        or die "ERROR ('$fname'): zip failed: $ZipError\n";
+  } else {    # first member: open the archive on $outh with a direct method call, not indirect-object `new CLASS ...`
+    $zip = IO::Compress::Zip->new($outh, Zip64 => 1, TextFlag => 1,
+        Method => $_COMPRESSION_METHOD, Append => 1, Name => "$fname")
+        or die "ERROR ('$fname'): zip failed: $ZipError\n";
+  }
+}
+
+sub closeDoc {    # Write the buffered layers of the current text into the zip; any argument passed by callers is ignored.
+  if ($write_morpho) {    # initialised to 1 and never cleared, so the morpho layer is written for every text
+    newZipStream($morpho_file);
+    $zip->print($morpho, qq( </spanList>\n</layer>\n)) or die "ERROR ('$morpho_file'): zip failed: $ZipError\n";
+  }
+  if ($write_syntax) {    # set by the main loop once a dependency relation has been seen
+    $write_syntax = 0;    # cleared here so the next text writes a dependency layer only if it has relations itself
+    newZipStream($parser_file);
+    $zip->print($parse, qq(</spanList>\n</layer>\n)) or die "ERROR ('$parser_file'): zip failed: $ZipError\n";
+  }
+}
+
+sub layer_header {    # Return the XML prologue of a KorAP-XML layer file, up to and including the opening <spanList>.
+  my $text_sigle = shift;    # document id, interpolated unescaped into the docid attribute
+  return join "\n",
+    q(<?xml version="1.0" encoding="UTF-8"?>),
+    q(<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>),
+    qq(<layer docid="$text_sigle" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">),
+    "<spanList>", "";    # the trailing empty element yields the final newline
+}
\ No newline at end of file