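Handle UDPipe comments and ignore non-interpretable comments

A "# generator = UDPipe ..." comment now sets the foundry name to "ud"
unless a foundry name has already been set; any foundry name seen after
the first one is only logged as ignored. Lines starting with "#" or
"0.<digit>" that match none of the known keys (filename, foundry,
generator, *id, start_offsets/from, end_offsets/to) are skipped instead
of being parsed as token rows.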

TODO:
* handle XPosTags (column 5)?
* convert more metadata (udpipe_model_licence, ...)

Resolves #1, #2

Change-Id: Ic29125bdcdf7ba9bb8d84c94757a72cea6bcf500
diff --git a/Changes b/Changes
index 78c0829..c376227 100644
--- a/Changes
+++ b/Changes
@@ -4,6 +4,9 @@
         - korapxml2conllu: use morpho.xml if present when run on base zips
         - korapxml2conllu: new option -c <columns>
         - conllu2korapxml: ignore _-lemmas
+        - conllu2korapxml: handle UDPipe comments
+        - conllu2korapxml: ignore non-interpretable comments
+
 
 0.4.1 2021-07-31
         - korapxml2conllu: fix patterns not extracted for last texts in archive
diff --git a/script/conllu2korapxml b/script/conllu2korapxml
index a9bb030..6de032f 100755
--- a/script/conllu2korapxml
+++ b/script/conllu2korapxml
@@ -75,43 +75,54 @@
   my $i=0; my $s=0; my $first_in_sentence=0;
   my $lastDocSigle="";
   while (<$fh>) {
-    if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
-      $filename=$1;
-      if(!$first) {
-        closeDoc(0);
-      } else {
-        $first=0;
+    if(/^\s*(?:#|0\.\d)/) {
+      if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
+        $filename=$1;
+        if(!$first) {
+          closeDoc(0);
+        } else {
+          $first=0;
+        }
+        if($processedFilenames{$filename}) {
+          $log->warn("WARNING: $filename is already processed");
+        }
+        $processedFilenames{$filename}=1;
+        $i=0;
+      } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) {
+        if(!$foundry_name) {
+          $foundry_name = $1;
+          $log->debug("Foundry: $foundry_name\n");
+        } else {
+          $log->debug("Ignored foundry name: $1\n");
+        }
+      } elsif(/^#\s*generator\s*[=]\s*udpipe/i) {
+        if(!$foundry_name) {
+          $foundry_name = "ud";
+          $log->debug("Foundry: $foundry_name\n");
+        } else {
+          $log->debug("Ignored foundry name: ud\n");
+        }
+      } elsif(/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
+        $docid=$1;
+        my $docSigle = $docid;
+        $docSigle =~ s/\..*//;
+        if($docSigle ne $lastDocSigle) {
+          $log->info("Analyzing $docSigle");
+          $lastDocSigle = $docSigle;
+        }
+        $known=$unknown=0;
+        $current="";
+        $parser_file = dirname($filename);
+        $parser_file =~ s@(.*)/[^/]+$@$1@;
+        $morpho_file = $parser_file;
+        $morpho_file .= "/$foundry_name/morpho.xml";
+        $parser_file .= "/$foundry_name/dependency.xml";
+        $parse = $morpho = layer_header($docid);
+      }  elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
+        @spansFrom = split(/\s+/, $1);
+      }  elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s+[:=]\s*(.*)/) {
+        @spansTo = split(/\s+/, $1);
       }
-      if($processedFilenames{$filename}) {
-        $log->warn("WARNING: $filename is already processed");
-      }
-      $processedFilenames{$filename}=1;
-      $i=0;
-    } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) {
-      if(!$foundry_name) {
-        $foundry_name = $1;
-        $log->debug("Foundry: $foundry_name\n");
-      }
-    } elsif(/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
-      $docid=$1;
-      my $docSigle = $docid;
-      $docSigle =~ s/\..*//;
-      if($docSigle ne $lastDocSigle) {
-        $log->info("Analyzing $docSigle");
-        $lastDocSigle = $docSigle;
-      }
-      $known=$unknown=0;
-      $current="";
-      $parser_file = dirname($filename);
-      $parser_file =~ s@(.*)/[^/]+$@$1@;
-      $morpho_file = $parser_file;
-      $morpho_file .= "/$foundry_name/morpho.xml";
-      $parser_file .= "/$foundry_name/dependency.xml";
-      $parse = $morpho = layer_header($docid);
-    }  elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
-      @spansFrom = split(/\s+/, $1);
-    }  elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s+[:=]\s*(.*)/) {
-      @spansTo = split(/\s+/, $1);
     } elsif (! /^\s*$/) {
       my @parsed=split('\t');
       chomp  $parsed[9];
diff --git a/t/data/goe.ud.conllu b/t/data/goe.ud.conllu
new file mode 100644
index 0000000..147071a
--- /dev/null
+++ b/t/data/goe.ud.conllu
@@ -0,0 +1,68 @@
+# generator = UDPipe 2, https://lindat.mff.cuni.cz/services/udpipe
+# this is a comment that should be ignored
+# udpipe_model = german-hdt-ud-2.6-200830
+# udpipe_model_licence = CC BY-NC-SA
+# foundry = base
+# filename = GOE/AGA/00000/base/tokens.xml  
+# text_id = GOE_AGA.00000
+# start_offsets = 0 0 9 12
+# end_offsets = 22 8 11 22
+1	Campagne	Campagne	NOUN	NN	Gender=Fem|Number=Sing|Person=3	0	root	_	_
+2	in	in	ADP	APPR	AdpType=Prep|Case=Dat	3	case	_	_
+3	Frankreich	Frankreich	PROPN	NE	Case=Dat|Number=Sing|Person=3	1	nmod	_	_
+
+# start_offsets = 23 23
+# end_offsets = 27 27
+1	1792	1792	NUM	CARD	Number=Plur|NumType=Card|Person=3	0	root	_	_
+
+# start_offsets = 28 28 33 37 40 44 53
+# end_offsets = 54 32 36 39 43 53 54
+1	auch	auch	ADV	ADV	_	2	advmod	_	_
+2	ich	ich	PRON	PPER	Case=Nom|Number=Sing|Person=1|PronType=Prs	0	root	_	_
+3	in	in	ADP	APPR	AdpType=Prep|Case=Dat	5	case	_	_
+4	der	der	DET	ART	Case=Dat|Gender=Fem|Number=Sing|PronType=Art	5	det	_	_
+5	Champagne	Champagne	NOUN	NN	Gender=Fem|Number=Sing|Person=3	2	nmod	_	_
+6	!	!	PUNCT	$.	PunctType=Peri	2	punct	_	_
+
+# start_offsets = 55 55 59 63 70 75 82 87 94 102 105 111 120 124 130 134 140 144 151 153 163 175 187 191 207 209 213 218 222 239 248 255 259 264 267 271 277 283 297 307 319
+# end_offsets = 320 58 62 69 74 81 86 93 101 104 110 119 123 129 133 139 143 151 152 162 174 186 190 207 208 212 217 221 238 247 254 258 263 266 270 276 282 296 306 319 320
+1	den	den	DET	ART	Case=Acc|Gender=Masc|Number=Sing|PronType=Art	3	det	_	_
+2	23.	23.	ADJ	ADJA	Person=3	3	amod	_	_
+3	August	August	NOUN	NN	Gender=Masc|Number=Sing|Person=3	11	obj	_	_
+4	1792	1792	NUM	CARD	Number=Plur|NumType=Card|Person=3	3	nmod	_	_
+5	gleich	gleich	ADJ	ADJD	Degree=Pos|Variant=Short	8	advmod	_	_
+6	nach	nach	ADP	APPR	AdpType=Prep|Case=Dat	8	case	_	_
+7	meiner	mein	PRON	PPOSAT	Case=Dat|Gender=Fem|Number=Sing|Person=1|Poss=Yes|PronType=Prs	8	det	_	_
+8	Ankunft	Ankunft	NOUN	NN	Gender=Fem|Number=Sing|Person=3	3	nmod	_	_
+9	in	in	ADP	APPR	AdpType=Prep|Case=Dat	10	case	_	_
+10	Mainz	Mainz	PROPN	NE	Case=Dat|Number=Sing|Person=3	8	nmod	_	_
+11	besuchte	besuchen	VERB	VVFIN	Mood=Ind|Number=Sing|Person=1|Tense=Past|VerbForm=Fin	0	root	_	_
+12	ich	ich	PRON	PPER	Case=Nom|Number=Sing|Person=1|PronType=Prs	11	nsubj	_	_
+13	Herrn	Herr	NOUN	NN	Gender=Masc|Number=Sing|Person=3	11	iobj	_	_
+14	von	von	ADP	APPR	AdpType=Prep|Case=Dat	15	case	_	_
+15	Stein	Stein	PROPN	NE	Gender=Masc|Number=Sing|Person=3	13	nmod	_	_
+16	den	den	DET	ART	Case=Acc|Gender=Masc|Number=Sing|PronType=Art	21	det	_	_
+17	älteren	alt	ADJ	ADJA	Case=Acc|Degree=Cmp|Gender=Masc|Number=Sing	21	amod	_	_
+18	,	,	PUNCT	$,	PunctType=Comm	20	punct	_	_
+19	königlich	königlich	ADJ	ADJD	Degree=Pos|Variant=Short	20	advmod	_	_
+20	preußischen	preußisch	ADJ	ADJA	Case=Acc|Degree=Pos|Gender=Masc|Number=Sing	21	amod	_	_
+21	Kammerherrn	Kammerherr	NOUN	NN	Gender=Masc|Number=Sing|Person=3	11	obj	_	_
+22	und	und	CCONJ	KON	_	23	cc	_	_
+23	Oberforstmeister	Meister	NOUN	NN	Gender=Masc|Number=Sing|Person=3	21	conj	_	_
+24	,	,	PUNCT	$,	PunctType=Comm	30	punct	_	_
+25	der	der	PRON	PRELS	Case=Nom|Gender=Masc|Number=Sing|Person=3|PronType=Rel	30	nsubj	_	_
+26	eine	eine	DET	ART	Case=Acc|Gender=Fem|Number=Sing|PronType=Art	27	det	_	_
+27	Art	Art	NOUN	NN	Gender=Fem|Number=Sing|Person=3	30	obj	_	_
+28	Residentenstelle	Stelle	NOUN	NN	Gender=Fem|Number=Sing|Person=3	27	appos	_	_
+29	daselbst	daselbst	ADV	ADV	_	30	advmod	_	_
+30	versah	versehen	VERB	VVFIN	Mood=Ind|Number=Sing|Person=3|Tense=Past|VerbForm=Fin	23	acl	_	_
+31	und	und	CCONJ	KON	_	39	cc	_	_
+32	sich	sich	PRON	PRF	Case=Acc|Person=3|PronType=Prs|Reflex=Yes	39	obj	_	_
+33	im	im	ADP	APPRART	AdpType=Prep|Case=Dat|PronType=Art	34	case	_	_
+34	Haß	Hass	NOUN	NN	Gender=Masc|Number=Sing|Person=3	39	obl	_	_
+35	gegen	gegen	ADP	APPR	AdpType=Prep|Case=Acc	37	case	_	_
+36	alles	all	DET	PIDAT	Case=Acc|Gender=Neut|Number=Sing|Person=3	37	det	_	_
+37	Revolutionäre	Revolutionär	NOUN	NN	Degree=Pos|Gender=Neut|Number=Sing|Person=3	34	nmod	_	_
+38	gewaltsam	gewaltsam	ADJ	ADJD	Degree=Pos|Variant=Short	39	advmod	_	_
+39	auszeichnete	auszeichnen	VERB	VVFIN	Mood=Ind|Number=Sing|Person=3|Tense=Pres|VerbForm=Fin	30	conj	_	_
+40	.	.	PUNCT	$.	PunctType=Peri	11	punct	_	_
diff --git a/t/test.t b/t/test.t
index 3a34be9..23ee224 100644
--- a/t/test.t
+++ b/t/test.t
@@ -1,6 +1,6 @@
 use strict;
 use warnings;
-use Test::More tests => 41;
+use Test::More tests => 46;
 use Test::Script;
 use Test::TempDir::Tiny;
 use File::Copy;
@@ -181,4 +181,19 @@
 $zipcontent = `$UNZIP -c $zipfile`;
 unlike($zipcontent, qr/.*name ="lemma".*/, "conllu2korapxml igores _ lemmas.");
 like($zipcontent, qr/.*<f name="pos">NN|NN<\/f>.*/, "conllu2korapxml does not ignore pos for _ lemmas.");
+
+script_runs([ 'script/conllu2korapxml', '-l', 'debug', 't/data/goe.ud.conllu' ], {stdout => \$zipcontent}, "Runs conllu2korap with UDPipe and unparsable comments");
+script_stderr_like "Foundry:\\s+ud", "Found generator based foundry";
+script_stderr_like "Ignored\\s+foundry\\s+name:\\s+base", "Ignore defined foundry";
+
+$zipfile = "$test_tempdir/goe.ud.zip";
+open($fh, ">", $zipfile) or fail("cannot open file $zipfile for writing");
+print $fh $zipcontent;
+close($fh);
+
+$zipcontent = `$UNZIP -l $zipfile`;
+like($zipcontent, qr@GOE/AGA/00000/ud/morpho\.xml@, "conllu2korapxml UDPipe input conversion contains morpho layer with foundry name 'ud'");
+like($zipcontent, qr@GOE/AGA/00000/ud/dependency\.xml@, "conllu2korapxml UDPipe input conversion contains dependency layer with foundry name 'ud'");
+
+
 done_testing;