#!/usr/bin/env perl
# Retrieved from blob b12886c8aed7027443c64d3cf9d24091df46d03f
# (last change: Marc Kupietz, commit 79ba1e5, 2021-02-12 17:26:54 +0100).
2use strict;
3use warnings;
4use POSIX;
5use Getopt::Std;
6use Encode;
7use IO::Compress::Zip qw(zip $ZipError :constants);
8use File::Basename;
9
# Compression method applied to every member written to the output archive.
my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;
my %opts;
# "# filename" values seen so far; used to warn about duplicates.
my %processedFilenames;

my $usage=<<EOF;
Usage: $0 [options] [CoNLL-U-FILE...]

Options:
 -d debug
 -h print this help message and exit
Description:
 Converts CoNLL-U files that follow KorAP-specific comment conventions
 and contain morphosyntactic and/or dependency annotations to
 corresponding KorAP-XML zip files.

Examples:
 $0 zca20.spacy.conllu > zca20.spacy.zip

 $0 < zca20.spacy.conllu > zca20.spacy.zip
EOF


# getopts() returns false when it meets an unknown switch; stop with the
# usage text instead of silently continuing with a mistyped option.
# The help text goes to STDERR (via die) because STDOUT carries the zip data.
getopts('hd', \%opts) or die $usage;
die $usage if($opts{h});
my $debug=($opts{d}? 1 : 0);

# --- State shared between the main loop and the helper subs below ---
my $docid="";
my $zip = undef;          # IO::Compress::Zip object, created on first write
my $outh = \*STDOUT;      # the archive is streamed to standard output
my $parser_file;          # archive member name of the dependency layer
my $parse;                # accumulated dependency layer XML
my $morpho_file;          # archive member name of the morpho layer
my $morpho;               # accumulated morpho layer XML
my @spansFrom;            # start offsets from the current offsets header
my @spansTo;              # end offsets from the current offsets header
my $current;
my ($unknown, $known) = (0, 0);

my ($write_morpho, $write_syntax, $base) = (1, 0, 0);
my $filename;
my $foundry_name;
my $first=1;
my @conllu_files = @ARGV;
# Without file arguments the CoNLL-U data is read from standard input.
push @conllu_files, "-" if (@conllu_files == 0);
my $fh;
# Main loop: read every CoNLL-U input, collect the morpho and dependency
# layers of each document and write them to the zip archive on STDOUT.
#
# Recognised header lines (KorAP comment conventions):
#   "# filename = ..."       starts a new document (the previous one is written)
#   "# foundry = ..."        name of the annotation foundry
#   "# ...id = ..."          document id; resets the layer buffers
#   "# start_offsets = ..."  / "# from = ..."   start offsets
#   "# end_offsets = ..."    / "# to = ..."     end offsets
# Every other non-empty, non-comment line must be a 10-column token line.
foreach my $conllu_file (@conllu_files) {
  if ($conllu_file eq '-') {
    $fh = \*STDIN;
  } else {
    open($fh, "<", $conllu_file) or die "Cannot open $conllu_file: $!";
  }
  my $s = 0;               # sentence counter, used in the span ids
  my $lastDocSigle = "";
  while (my $line = <$fh>) {
    # Strip the line ending up front (LF or CRLF) so that no header value
    # or column can carry a stray terminator.
    $line =~ s/\r?\n\z//;

    if ($line =~ /^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
      $filename = $1;
      # A new document begins: write out the one collected so far, if any.
      _flush_pending_doc(0);
      if ($processedFilenames{$filename}) {
        print STDERR "WARNING: $filename is already processed\n";
      }
      $processedFilenames{$filename} = 1;
    } elsif ($line =~ /^#\s*foundry\s*[:=]\s*(.*)/) {
      $foundry_name = $1;
      print STDERR "Foundry: $foundry_name\n" if ($debug);
    } elsif ($line =~ /^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
      $docid = $1;
      my $docSigle = $docid;
      $docSigle =~ s/\..*//;
      if ($docSigle ne $lastDocSigle) {
        print STDERR "Analyzing $docSigle\n";
        $lastDocSigle = $docSigle;
      }
      # Member names: the directory of the announced filename with its last
      # component replaced by "<foundry>/<layer>.xml".
      $parser_file = dirname($filename);
      $parser_file =~ s@(.*)/[^/]+$@$1@;
      $morpho_file = $parser_file;
      $morpho_file .= "/$foundry_name/morpho.xml";
      $parser_file .= "/$foundry_name/dependency.xml";
      $parse = $morpho = layer_header($docid);
    } elsif ($line =~ /^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
      @spansFrom = split(/\s+/, $1);
    } elsif ($line =~ /^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
      # Whitespace before the separator is optional, as in all other headers.
      @spansTo = split(/\s+/, $1);
    } elsif ($line =~ /^#/) {
      # Any other comment line carries no information for the conversion.
      print STDERR "Ignoring comment line: $line\n" if ($debug);
    } elsif ($line !~ /^\s*$/) {
      # Limit -1 keeps trailing empty columns so the count check is exact.
      # The check must run before any column is touched: modifying
      # $parsed[9] first would autovivify it and hide short lines.
      my @parsed = split(/\t/, $line, -1);
      if (@parsed != 10) {
        print STDERR "WARNING: skipping strange parser output line in $docid\n";
        next;
      }
      my $t = $parsed[0];
      $s++ if ($t == 1);     # token id 1 opens a new sentence

      my $span_id  = "s${s}_n$t";
      my $tok_from = $spansFrom[$t];
      my $tok_to   = $spansTo[$t];

      if ($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
        $write_syntax = 1;
        my $head_from = $spansFrom[$parsed[6]];
        my $head_to   = $spansTo[$parsed[6]];
        my $label     = _xml_escape($parsed[7], 1);
        $parse .= join("\n",
          qq(<span id="$span_id" from="$tok_from" to="$tok_to">),
          qq(<rel label="$label">),
          qq(<span from="$head_from" to="$head_to"/>),
          qq(</rel>),
          qq(</span>),
          '');
      }

      my $lemma = _xml_escape($parsed[2]);
      my $pos   = _xml_escape($parsed[3]);
      $morpho .= join("\n",
        qq( <span id="$span_id" from="$tok_from" to="$tok_to">),
        qq( <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">),
        qq( <f name="lex">),
        qq( <fs>),
        qq( <f name="lemma">$lemma</f>),
        qq( <f name="pos">$pos</f>),
        '');
      if ($parsed[5] ne "_") {
        my $msd = _xml_escape($parsed[5]);
        $morpho .= qq( <f name="msd">$msd</f>\n);
      }
      if ($parsed[9] ne "_") {
        my $misc = _xml_escape($parsed[9]);
        # Only a value that is numeric as a whole is a certainty; an
        # unanchored test would also accept e.g. "SpaceAfter=No".
        if ($parsed[9] =~ /^[0-9.e+-]+$/i && $parsed[9] =~ /[0-9]/) {
          $morpho .= qq( <f name="certainty">$misc</f>\n);
        } else {
          $morpho .= qq( <f name="misc">$misc</f>\n);
        }
      }
      $morpho .= join("\n",
        qq( </fs>),
        qq( </f>),
        qq( </fs>),
        qq( </span>),
        '');
    }
  }
  # Write the last document of this input file.
  _flush_pending_doc(1);
  close($fh);
}
# Finish the archive once, after all inputs; with empty input no archive
# object was ever created.
if (defined $zip) {
  $zip->close() or die "ERROR: closing zip failed: $ZipError\n";
}
exit;

# Writes the document collected so far via closeDoc() and marks it as
# written, so that it cannot be emitted a second time. Does nothing when no
# document id header has been seen since the last write.
sub _flush_pending_doc {
  my ($is_last) = @_;
  return unless (defined $morpho_file);
  closeDoc($is_last);
  $morpho_file = $parser_file = undef;
  return;
}

# Returns $text with XML markup characters replaced by entity references.
# An "&" that already starts a predefined or numeric entity reference is left
# alone so that pre-escaped input is not escaped twice. Double quotes are
# only replaced when $in_attribute is true.
sub _xml_escape {
  my ($text, $in_attribute) = @_;
  $text =~ s/&(?!(?:amp|lt|gt|quot|apos|#[0-9]+|#x[0-9A-Fa-f]+);)/&amp;/g;
  $text =~ s/</&lt;/g;
  $text =~ s/>/&gt;/g;
  $text =~ s/"/&quot;/g if ($in_attribute);
  return $text;
}
152
# newZipStream($fname)
#
# Starts a new archive member named $fname. The first call creates the
# IO::Compress::Zip object on the output handle $outh; every later call opens
# a further member on that same object. Dies with $ZipError on failure.
# Returns the zip object.
sub newZipStream {
  my ($fname) = @_;
  # One option list for both branches so they cannot drift apart.
  my @member_opts = (
    Zip64    => 1,
    TextFlag => 1,
    Method   => $_COMPRESSION_METHOD,
    Append   => 1,
    Name     => $fname,
  );
  if (defined $zip) {
    $zip->newStream(@member_opts)
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  } else {
    # Direct method call instead of the indirect "new CLASS ..." form.
    $zip = IO::Compress::Zip->new($outh, @member_opts)
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  }
  return $zip;
}
165
# closeDoc(...)
#
# Writes the buffered layers of the current document to the archive: the
# morpho layer whenever $write_morpho is set, and the dependency layer when
# $write_syntax is set (that flag is cleared again here). Each layer gets its
# own archive member and is terminated with the closing spanList/layer tags.
# The arguments passed by the callers are not used.
sub closeDoc {
  my @pending;
  if ($write_morpho) {
    push @pending, [$morpho_file, $morpho, qq( </spanList>\n</layer>\n)];
  }
  if ($write_syntax) {
    $write_syntax = 0;
    push @pending, [$parser_file, $parse, qq(</spanList>\n</layer>\n)];
  }
  foreach my $layer (@pending) {
    my ($member, $content, $footer) = @{$layer};
    newZipStream($member);
    $zip->print($content, $footer);
  }
}
177
# layer_header($docid)
#
# Returns the opening part of a layer document for $docid: the XML
# declaration, the xml-model processing instruction, the <layer> start tag
# carrying the document id, and the <spanList> start tag, each terminated by
# a newline.
sub layer_header {
  my ($docid) = @_;
  my @header_lines = (
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>',
    qq(<layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">),
    '<spanList>',
  );
  return join('', map { "$_\n" } @header_lines);
}
185}