Provide conllu2korapxml to convert from ConLL-U to KorAP-XML zip

Change-Id: I8913abac4713800bf38b38935004fd6ee416aab1
diff --git a/Changes b/Changes
index abdb3a6..c828668 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,5 @@
+        - Provide conllu2korapxml to convert from ConLL-U to KorAP-XML zip
+
 0.2 2021-02-12
         - Convert also KorAP-XML base zips
 
diff --git a/Makefile.PL b/Makefile.PL
index b66381c..d807fae 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -7,18 +7,22 @@
 WriteMakefile(
     NAME             => 'korapxml2conllu',
     AUTHOR           => 'Marc Kupietz',
-    ABSTRACT         => 'Conversion from KorAP-XML zip files to CoNLL-U format',
+    ABSTRACT         => 'Conversion between KorAP-XML zip and CoNLL-U',
     VERSION          => '0.2',
     LICENSE          => 'freebsd',
     BUILD_REQUIRES   => {
+        'Test::More'          => "1.302177",
+        'Test::Script'        => "1.12",
+        'Test::TempDir::Tiny' => 0,
+        'File::Temp'          => 0.2308,
+        'IO::Compress::Zip'   => '2.091',
     },
     PREREQ_PM        => {
-        'POSIX'        => 0,
-        'Getopt::Std'  => 0,
-        'Encode'       => "3.07",
-        'Test::More'   => "1.302177",
-        'Test::Script' => "1.12"
+        'POSIX'             => 0,
+        'Getopt::Std'       => 0,
+        'Encode'            => "3.07",
+        'IO::Compress::Zip' => '2.091'
     },
     MIN_PERL_VERSION => '5.016',
-    EXE_FILES        => [ './script/korapxml2conllu' ]
+    EXE_FILES        => [ './script/korapxml2conllu', './script/conllu2korapxml' ]
 );
diff --git a/Readme.md b/Readme.md
index 83eb315..6b9c11e 100644
--- a/Readme.md
+++ b/Readme.md
@@ -4,7 +4,11 @@
 
 ## Description
 
-The state of the package is very preliminary. Currently, only a script `korapxml2conllu` is provided. It converts KorAP XML zip "base" and "morpho" (with POS and lemma annotations) files to corresponding CoNLL-U files with foundry information, text ids and token offsets in comments.
+The state of the package is very preliminary. Currently, two scripts are provided:
+* `korapxml2conllu` converts KorAP XML zip "base" and "morpho" (with POS and lemma annotations) files to corresponding CoNLL-U files with foundry information, text ids and token offsets in comments
+* `conllu2korapxml` converts CoNLL-U files that follow KorAP-specific comment conventions
+  and contain morphosyntactic and/or dependency annotations to
+  corresponding KorAP-XML zip files
 
 **! This software is in its early stages and not stable yet! Use it on your own risk! It is very likely to be changed or replaced.**
 ## Installation
@@ -17,7 +21,7 @@
 ```
 
 ## Command Line Invocation
-
+### `korapxml2conllu`
 ```
 
 $ korapxml2conllu wpd17.tree_tagger.zip | head -42
@@ -89,6 +93,10 @@
 5	Champagne	_	_	_	_	_	_	_	_
 
 ```
+### `conllu2korapxml`
+```
+./script/conllu2korapxml < t/data/goe.morpho.conllu > goe.morpho.zip
+```
 
 ## Development and License
 
diff --git a/script/conllu2korapxml b/script/conllu2korapxml
new file mode 100755
index 0000000..b12886c
--- /dev/null
+++ b/script/conllu2korapxml
@@ -0,0 +1,185 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use POSIX;
+use Getopt::Std;
+use Encode;
+use IO::Compress::Zip qw(zip $ZipError :constants);
+use File::Basename;
+
+my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;
+my %opts;
+my %processedFilenames;
+
+my $usage=<<EOF;
+Usage: $0 [options] [CoNLL-U-FILE...]
+
+Options:
+ -d        debug
+Description:
+ Converts CoNLL-U files that follow KorAP-specific comment conventions
+ and contain morphosyntactic and/or dependency annotations to
+ corresponding KorAP-XML zip files.
+
+Examples:
+ $0 zca20.spacy.conllu > zca20.spacy.zip
+
+ $0 < zca20.spacy.conllu > zca20.spacy.zip
+EOF
+
+
+getopts('hd', \%opts);
+die $usage if($opts{h});
+my $debug=($opts{d}? 1 : 0);
+
+my $docid="";
+my $zip = undef;
+my $outh = \*STDOUT;
+my $parser_file;
+my $parse;
+my $morpho_file;
+my $morpho;
+my @spansFrom;
+my @spansTo;
+my $current;
+my ($unknown, $known) = (0, 0);
+
+my ($write_morpho, $write_syntax, $base) = (1, 0, 0);
+my $filename;
+my $foundry_name;
+my $first=1;
+my @conllu_files = @ARGV;
+push @conllu_files, "-" if (@conllu_files == 0);
+my $fh;
+foreach my $conllu_file (@conllu_files) {
+  if ($conllu_file eq '-') {
+    $fh = \*STDIN;
+  } else {
+    open($fh, "<", $conllu_file) or die "Cannot open $conllu_file";
+  }
+  my $i=0; my $s=0; my $first_in_sentence=0;
+  my $lastDocSigle="";
+  while (<$fh>) {
+    if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
+      $filename=$1;
+      if(!$first) {
+        closeDoc(0);
+      } else {
+        $first=0;
+      }
+      if($processedFilenames{$filename}) {
+        print STDERR "WARNING: $filename is already processed\n";
+      }
+      $processedFilenames{$filename}=1;
+      $i=0;
+    } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) {
+      $foundry_name=$1;
+      print STDERR "Foundry: $foundry_name\n" if($debug);
+    } elsif(/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
+      $docid=$1;
+      my $docSigle = $docid;
+      $docSigle =~ s/\..*//;
+      if($docSigle ne $lastDocSigle) {
+        print STDERR "Analyzing $docSigle\n";
+        $lastDocSigle = $docSigle;
+      }
+      $known=$unknown=0;
+      $current="";
+      $parser_file = dirname($filename);
+      $parser_file =~ s@(.*)/[^/]+$@$1@;
+      $morpho_file = $parser_file;
+      $morpho_file .= "/$foundry_name/morpho.xml";
+      $parser_file .= "/$foundry_name/dependency.xml";
+      $parse = $morpho = layer_header($docid);
+    }  elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
+      @spansFrom = split(/\s+/, $1);
+    }  elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s+[:=]\s*(.*)/) {
+      @spansTo = split(/\s+/, $1);
+    } elsif (! /^\s*$/) {
+      my @parsed=split('\t');
+      chomp  $parsed[9];
+      if(@parsed != 10) {
+        print STDERR "WARNING: skipping strange parser output line in $docid\n";
+        $i++;
+        next;
+      }
+      my $t=$parsed[0];
+      if($t == 1) {
+        $s++;
+        $first_in_sentence = $i;
+      }
+      if($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
+        $write_syntax=1;
+        my $from=$spansFrom[$parsed[6]];
+        my $to=$spansTo[$parsed[6]];
+          $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
+<rel label="$parsed[7]">
+<span from="$from" to="$to"/>
+</rel>
+</span>
+@;
+        }
+        $morpho .= qq(  <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
+   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+    <f name="lex">
+     <fs>
+      <f name="lemma">$parsed[2]</f>
+      <f name="pos">$parsed[3]</f>
+);
+      $morpho .= qq(      <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_");
+      if($parsed[9] ne "_") {
+        if ($parsed[9] =~ /[0-9.e]+/) {
+          $morpho .= qq(      <f name="certainty">$parsed[9]</f>\n)
+        }
+        else {
+          $morpho .= qq(      <f name="misc">$parsed[9]</f>\n)
+        }
+      }
+      $morpho .= qq(     </fs>
+    </f>
+   </fs>
+  </span>
+);
+        $i++;
+    }
+  }
+  $current .= "\n";
+  closeDoc(1);
+  $zip->close();
+  close($fh);
+}
+exit;
+
+sub newZipStream {
+  my ($fname) = @_;
+  if (defined $zip) {
+    $zip->newStream(Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
+        Append            => 1, Name => $fname)
+        or die "ERROR ('$fname'): zip failed: $ZipError\n";
+  } else {
+    $zip = new IO::Compress::Zip $outh, Zip64 => 1, TextFlag => 1,
+        Method => $_COMPRESSION_METHOD, Append => 1, Name => "$fname"
+        or die "ERROR ('$fname'): zip failed: $ZipError\n";
+  }
+}
+
+sub closeDoc {
+  if ($write_morpho) {
+    newZipStream($morpho_file);
+    $zip->print($morpho, qq( </spanList>\n</layer>\n));
+  }
+  if ($write_syntax) {
+    $write_syntax = 0;
+    newZipStream($parser_file);
+    $zip->print($parse, qq(</spanList>\n</layer>\n));
+  }
+}
+
+sub layer_header {
+  my ($docid) = @_;
+  return(qq(<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
+<spanList>
+));
+}
\ No newline at end of file
diff --git a/t/test.t b/t/test.t
index 1077bc0..73c3515 100644
--- a/t/test.t
+++ b/t/test.t
@@ -2,6 +2,8 @@
 use warnings;
 use Test::More;
 use Test::Script;
+use Test::TempDir::Tiny;
+use File::Copy;
 
 script_runs([ 'script/korapxml2conllu', '-h' ], { exit => 255 });
 script_stderr_like "Description", "Can print help message";
@@ -37,4 +39,25 @@
     script_stdout_is $expected, "Converts $base_fname correctly to CoNLL-U";
 }
 
-done_testing;
\ No newline at end of file
+my $test_tempdir = tempdir();
+my $expected;
+open(my $fh, '<', "t/data/goe.morpho.conllu"); {
+    local $/;
+    $expected = <$fh>;
+}
+close($fh);
+
+my $zipfile = "$test_tempdir/goe.tree_tagger.zip";
+my $zipcontent;
+script_runs([ 'script/conllu2korapxml', "t/data/goe.morpho.conllu" ], {stdout => \$zipcontent},
+    "Converts t/data/goe.morpho.conllu to KorAP-XML zip");
+open(my $fh, ">", $zipfile) or fail("cannot open file $zipfile for writing");
+print $fh $zipcontent;
+close($fh);
+
+copy("t/data/goe.zip", $test_tempdir);
+script_runs([ 'script/korapxml2conllu', "$test_tempdir/goe.tree_tagger.zip" ],
+    "Converts $test_tempdir/goe.tree_tagger.zip to CoNLL-U");
+script_stdout_is $expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly";
+
+done_testing;