Provide conllu2korapxml to convert from ConLL-U to KorAP-XML zip
Change-Id: I8913abac4713800bf38b38935004fd6ee416aab1
diff --git a/Changes b/Changes
index abdb3a6..c828668 100644
--- a/Changes
+++ b/Changes
@@ -1,3 +1,5 @@
+ - Provide conllu2korapxml to convert from ConLL-U to KorAP-XML zip
+
0.2 2021-02-12
- Convert also KorAP-XML base zips
diff --git a/Makefile.PL b/Makefile.PL
index b66381c..d807fae 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -7,18 +7,22 @@
WriteMakefile(
NAME => 'korapxml2conllu',
AUTHOR => 'Marc Kupietz',
- ABSTRACT => 'Conversion from KorAP-XML zip files to CoNLL-U format',
+ ABSTRACT => 'Conversion between KorAP-XML zip and CoNLL-U',
VERSION => '0.2',
LICENSE => 'freebsd',
BUILD_REQUIRES => {
+ 'Test::More' => "1.302177",
+ 'Test::Script' => "1.12",
+ 'Test::TempDir::Tiny' => 0,
+ 'File::Temp' => 0.2308,
+ 'IO::Compress::Zip' => '2.091',
},
PREREQ_PM => {
- 'POSIX' => 0,
- 'Getopt::Std' => 0,
- 'Encode' => "3.07",
- 'Test::More' => "1.302177",
- 'Test::Script' => "1.12"
+ 'POSIX' => 0,
+ 'Getopt::Std' => 0,
+ 'Encode' => "3.07",
+ 'IO::Compress::Zip' => '2.091'
},
MIN_PERL_VERSION => '5.016',
- EXE_FILES => [ './script/korapxml2conllu' ]
+ EXE_FILES => [ './script/korapxml2conllu', './script/conllu2korapxml' ]
);
diff --git a/Readme.md b/Readme.md
index 83eb315..6b9c11e 100644
--- a/Readme.md
+++ b/Readme.md
@@ -4,7 +4,11 @@
## Description
-The state of the package is very preliminary. Currently, only a script `korapxml2conllu` is provided. It converts KorAP XML zip "base" and "morpho" (with POS and lemma annotations) files to corresponding CoNLL-U files with foundry information, text ids and token offsets in comments.
+The state of the package is very preliminary. Currently, two scripts are provided:
+* `korapxml2conllu` converts KorAP XML zip "base" and "morpho" (with POS and lemma annotations) files to corresponding CoNLL-U files with foundry information, text ids and token offsets in comments
+* `conllu2korapxml` converts CoNLL-U files that follow KorAP-specific comment conventions
+ and contain morphosyntactic and/or dependency annotations to
+ corresponding KorAP-XML zip files
**! This software is in its early stages and not stable yet! Use it on your own risk! It is very likely to be changed or replaced.**
## Installation
@@ -17,7 +21,7 @@
```
## Command Line Invocation
-
+### `korapxml2conllu`
```
$ korapxml2conllu wpd17.tree_tagger.zip | head -42
@@ -89,6 +93,10 @@
5 Champagne _ _ _ _ _ _ _ _
```
+### `conllu2korapxml`
+```
+./script/conllu2korapxml < t/data/goe.morpho.conllu > goe.morpho.zip
+```
## Development and License
diff --git a/script/conllu2korapxml b/script/conllu2korapxml
new file mode 100755
index 0000000..b12886c
--- /dev/null
+++ b/script/conllu2korapxml
@@ -0,0 +1,185 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use POSIX;
+use Getopt::Std;
+use Encode;
+use IO::Compress::Zip qw(zip $ZipError :constants);
+use File::Basename;
+
+my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;
+my %opts;
+my %processedFilenames;
+
+my $usage=<<EOF;
+Usage: $0 [options] [CoNLL-U-FILE...]
+
+Options:
+ -d debug
+Description:
+ Converts CoNLL-U files that follow KorAP-specific comment conventions
+ and contain morphosyntactic and/or dependency annotations to
+ corresponding KorAP-XML zip files.
+
+Examples:
+ $0 zca20.spacy.conllu > zca20.spacy.zip
+
+ $0 < zca20.spacy.conllu > zca20.spacy.zip
+EOF
+
+
+getopts('hd', \%opts);
+die $usage if($opts{h});
+my $debug=($opts{d}? 1 : 0);
+
+my $docid="";
+my $zip = undef;
+my $outh = \*STDOUT;
+my $parser_file;
+my $parse;
+my $morpho_file;
+my $morpho;
+my @spansFrom;
+my @spansTo;
+my $current;
+my ($unknown, $known) = (0, 0);
+
+my ($write_morpho, $write_syntax, $base) = (1, 0, 0);
+my $filename;
+my $foundry_name;
+my $first=1;
+my @conllu_files = @ARGV;
+push @conllu_files, "-" if (@conllu_files == 0);
+my $fh;
+foreach my $conllu_file (@conllu_files) {
+ if ($conllu_file eq '-') {
+ $fh = \*STDIN;
+ } else {
+ open($fh, "<", $conllu_file) or die "Cannot open $conllu_file";
+ }
+ my $i=0; my $s=0; my $first_in_sentence=0;
+ my $lastDocSigle="";
+ while (<$fh>) {
+ if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
+ $filename=$1;
+ if(!$first) {
+ closeDoc(0);
+ } else {
+ $first=0;
+ }
+ if($processedFilenames{$filename}) {
+ print STDERR "WARNING: $filename is already processed\n";
+ }
+ $processedFilenames{$filename}=1;
+ $i=0;
+ } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) {
+ $foundry_name=$1;
+ print STDERR "Foundry: $foundry_name\n" if($debug);
+ } elsif(/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
+ $docid=$1;
+ my $docSigle = $docid;
+ $docSigle =~ s/\..*//;
+ if($docSigle ne $lastDocSigle) {
+ print STDERR "Analyzing $docSigle\n";
+ $lastDocSigle = $docSigle;
+ }
+ $known=$unknown=0;
+ $current="";
+ $parser_file = dirname($filename);
+ $parser_file =~ s@(.*)/[^/]+$@$1@;
+ $morpho_file = $parser_file;
+ $morpho_file .= "/$foundry_name/morpho.xml";
+ $parser_file .= "/$foundry_name/dependency.xml";
+ $parse = $morpho = layer_header($docid);
+ } elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
+ @spansFrom = split(/\s+/, $1);
+ } elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s+[:=]\s*(.*)/) {
+ @spansTo = split(/\s+/, $1);
+ } elsif (! /^\s*$/) {
+ my @parsed=split('\t');
+ chomp $parsed[9];
+ if(@parsed != 10) {
+ print STDERR "WARNING: skipping strange parser output line in $docid\n";
+ $i++;
+ next;
+ }
+ my $t=$parsed[0];
+ if($t == 1) {
+ $s++;
+ $first_in_sentence = $i;
+ }
+ if($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
+ $write_syntax=1;
+ my $from=$spansFrom[$parsed[6]];
+ my $to=$spansTo[$parsed[6]];
+ $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
+<rel label="$parsed[7]">
+<span from="$from" to="$to"/>
+</rel>
+</span>
+@;
+ }
+ $morpho .= qq( <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
+ <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
+ <f name="lex">
+ <fs>
+ <f name="lemma">$parsed[2]</f>
+ <f name="pos">$parsed[3]</f>
+);
+ $morpho .= qq( <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_");
+ if($parsed[9] ne "_") {
+ if ($parsed[9] =~ /[0-9.e]+/) {
+ $morpho .= qq( <f name="certainty">$parsed[9]</f>\n)
+ }
+ else {
+ $morpho .= qq( <f name="misc">$parsed[9]</f>\n)
+ }
+ }
+ $morpho .= qq( </fs>
+ </f>
+ </fs>
+ </span>
+);
+ $i++;
+ }
+ }
+ $current .= "\n";
+ closeDoc(1);
+ $zip->close();
+ close($fh);
+}
+exit;
+
+sub newZipStream {
+ my ($fname) = @_;
+ if (defined $zip) {
+ $zip->newStream(Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
+ Append => 1, Name => $fname)
+ or die "ERROR ('$fname'): zip failed: $ZipError\n";
+ } else {
+ $zip = new IO::Compress::Zip $outh, Zip64 => 1, TextFlag => 1,
+ Method => $_COMPRESSION_METHOD, Append => 1, Name => "$fname"
+ or die "ERROR ('$fname'): zip failed: $ZipError\n";
+ }
+}
+
+sub closeDoc {
+ if ($write_morpho) {
+ newZipStream($morpho_file);
+ $zip->print($morpho, qq( </spanList>\n</layer>\n));
+ }
+ if ($write_syntax) {
+ $write_syntax = 0;
+ newZipStream($parser_file);
+ $zip->print($parse, qq(</spanList>\n</layer>\n));
+ }
+}
+
+sub layer_header {
+ my ($docid) = @_;
+ return(qq(<?xml version="1.0" encoding="UTF-8"?>
+<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
+<layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
+<spanList>
+));
+}
\ No newline at end of file
diff --git a/t/test.t b/t/test.t
index 1077bc0..73c3515 100644
--- a/t/test.t
+++ b/t/test.t
@@ -2,6 +2,8 @@
use warnings;
use Test::More;
use Test::Script;
+use Test::TempDir::Tiny;
+use File::Copy;
script_runs([ 'script/korapxml2conllu', '-h' ], { exit => 255 });
script_stderr_like "Description", "Can print help message";
@@ -37,4 +39,25 @@
script_stdout_is $expected, "Converts $base_fname correctly to CoNLL-U";
}
-done_testing;
\ No newline at end of file
+my $test_tempdir = tempdir();
+my $expected;
+open(my $fh, '<', "t/data/goe.morpho.conllu"); {
+ local $/;
+ $expected = <$fh>;
+}
+close($fh);
+
+my $zipfile = "$test_tempdir/goe.tree_tagger.zip";
+my $zipcontent;
+script_runs([ 'script/conllu2korapxml', "t/data/goe.morpho.conllu" ], {stdout => \$zipcontent},
+ "Converts t/data/goe.morpho.conllu to KorAP-XML zip");
+open(my $fh, ">", $zipfile) or fail("cannot open file $zipfile for writing");
+print $fh $zipcontent;
+close($fh);
+
+copy("t/data/goe.zip", $test_tempdir);
+script_runs([ 'script/korapxml2conllu', "$test_tempdir/goe.tree_tagger.zip" ],
+ "Converts $test_tempdir/goe.tree_tagger.zip to CoNLL-U");
+script_stdout_is $expected, "Full round trip: Converts goe.morpho.conllu to KorAP-XML and back to CoNLL-U correctly";
+
+done_testing;