#!/usr/bin/env perl
use strict;
use warnings;
use POSIX;
use Getopt::Std;
use Encode;

# Upper bound on tokens per sentence: the morpho branch of the main loop
# forces a sentence break once $known + $unknown reaches this value.
my $MAX_SENTENCE_LENGTH=10000;
# Prefix of every metadata/comment line written to the output.
my $COMMENT_START="#";

# Test mode switch: when true, the main loop stops after a few texts.
my $test=0;
my $text_no=0;
# Command line options as filled in by getopts().
my %opts;
# docid => plain text, filled by fetch_plaintext().
my %plain_texts;
# docid => { end offset => 1 }, filled by fetch_plaintext(); the key -1
# marks that a <layer> element has been seen for the document.
my %sentence_ends;

# Version information. These declarations must live outside the usage
# here-document: inside it they would be interpolated as text and, under
# "use strict", $VERSION would be an undeclared variable.
our $VERSION = '0.3.900';
our $VERSION_MSG = "\nkorapxml2conllu - v$VERSION\n";

# Help text printed (via die) for -h or when no input file is given.
my $usage=<<EOF;
Usage: $0 [options] ZIPFILE [ZIPFILE...]

Options:
  -p pattern

Description:
  Convert KorAP-XML base or morpho zips to CoNLL(-U) format with all information necessary
  for reconstruction in comment lines.

Examples:
  $0 /vol/corpora/DeReKo/current/KorAP/zip/zca20.zip

  $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip

  ZIPSIGLEPATTERN='-x "*15/FEB*" "*15/MAR*"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip

  Results will be written to stdout
EOF

# -d: debug output on STDERR, -h: print usage, -p PATTERN: path prefix
# that is inserted into the unzip member globs of the main loop.
getopts('dhp:', \%opts);
die $usage if($opts{h} || @ARGV == 0);
my $debug=($opts{d}? 1 : 0);

# State of the document / sentence currently being converted.
my $docid="";
my ($current_id, $current_from, $current_to, $token);
my $current;                      # CoNLL-U text of the sentence collected so far
my ($unknown, $known) = (0, 0);   # counters used to detect non-empty sentences
my @current_lines;                # [from, to] offset pair of each collected token
my %processedFilenames;           # archive members that have already been converted
# Extra arguments appended verbatim to the unzip command lines.
my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
my $baseOnly;                     # true when the input zip has no separate foundry zip

# Column positions within one CoNLL-U token line.
my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);

# Locate the unzip executable that the main loop pipes the archives through.
my $UNZIP = `sh -c 'command -v unzip'`;
# Backticks yield undef when the shell itself cannot be started.
$UNZIP = '' unless defined $UNZIP;
chomp $UNZIP;

# Abort right away: a "return" is not allowed at file scope, and without
# unzip none of the inputs can be read anyway.
if ($UNZIP eq '') {
  die('No unzip executable found in PATH.');
}

# Convert every input given on the command line and write the result to STDOUT.
foreach my $morpho_zip (@ARGV) {
  die "cannot open $morpho_zip" if(! -r $morpho_zip);
  my $data_zip = $morpho_zip;

  # Input that already is CoNLL(-U) is copied through unchanged.
  if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
    open(my $conll_fh, '<', $data_zip) or die "cannot open $data_zip";
    while (my $conll_line = <$conll_fh>) {
      print $conll_line;
    }
    close($conll_fh);
    next;
  }

  # "name.FOUNDRY.zip" belongs to the base archive "name.zip"; if there is
  # no foundry part, the input itself is the base archive.
  $data_zip =~ s/\.([^.]+)\.zip$/.zip/;
  my $foundry = $1;
  die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);

  my $first=1;
  my $pattern = (defined($opts{p})? $opts{p} : '');
  my @conll = ("_") x 10;
  my $filename;

  $baseOnly = $morpho_zip eq $data_zip;
  my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
  if(!$baseOnly) {
    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/morpho.xml' $zipsiglepattern |";
    $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/data.xml' $zipsiglepattern |";
  } else {
    $foundry = "base";
    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/tokens.xml' $zipsiglepattern |";
    $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
  }

  # Both pipes are bareword handles on purpose: fetch_plaintext() reads
  # PLAINTEXTPIPE by name.
  open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
  open (PLAINTEXTPIPE, $plaintextAndStructureCommand) or die "cannot unzip $data_zip";
  print "$COMMENT_START foundry = $foundry\n";
  while (<MORPHO_OR_TOKENPIPE>) {
    if (/^\s+inflating:\s+(.*)/) {
      # Line announcing the next archive member.
      $filename=$1;
      # Skip forward over members that were already converted.
      while($processedFilenames{$filename} && !eof(MORPHO_OR_TOKENPIPE)) {
        print STDERR "WARNING: $filename already processed\n";
        while (<MORPHO_OR_TOKENPIPE>) {
          last if(/\s+inflating:\s+(.*)/);
        }
        $filename=$1 if(!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
      }
    } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
      # Start of a new document: flush the previous one and reset the state.
      last if($test && $text_no++ > 3);
      if(!$first) {
        closeDoc(0);
      }
      $processedFilenames{$filename}=1;
      $docid=$1;
      @current_lines=();
      $known=$unknown=0;
      $current="";
      if ($first) {
        $first = 0;
      }
      if(!fetch_plaintext($docid)) { # skip this text
        while (<MORPHO_OR_TOKENPIPE>) {
          last if(m@</layer>@);
        }
      }
      print STDOUT "$COMMENT_START filename = $filename\n$COMMENT_START text_id = $docid\n";
      print STDERR "Analyzing $docid\n" if ($debug);
    } elsif (m@^\s*<f\s+.*name="([^"]+)">([^<]+)</f>@) {
      # One annotation feature of the current token.
      if ($1 eq "lemma") {
        $conll[$LEMMA_idx] = $2;
        $conll[$LEMMA_idx] =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
        if($conll[$LEMMA_idx] eq 'UNKNOWN') {
          $conll[$LEMMA_idx] = "--";
          $unknown++;
        } else {
          $known++;
        }
      } elsif ($1 eq 'pos' || $1 eq "ctag") {
        $unknown++;
        $conll[$XPOS_idx] = $conll[$UPOS_idx] = $2;
      } elsif ($1 eq 'msd') {
        $conll[$FEATS_idx] = $2;
      } elsif ($1 eq 'certainty') {
        $conll[$MISC_idx] = $2;
      }
    } elsif (/<span /) {
      # New token: cut its surface form out of the plain text by offsets.
      ($current_id) = /id="[^0-9]*([^\"]*)"/;
      ($current_from) = /from="([^\"]*)"/;
      ($current_to) = /to="([^\"]*)"/;
      print STDERR "found span: $current_id $current_from $current_to\n" if($debug);
      $token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
      if (!defined $token) {
        print STDERR "WARNING: could not retrieve token for $docid at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10), "\n";
        $token = "_";
      }
      $token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
      @conll = ("_") x 10;
      $conll[$FORM_idx] = encode("utf-8", $token);
      if($baseOnly) {
        # Base archives carry no <fs> annotations, so the token line is
        # emitted here and sentence ends come from the structure data.
        my @vals = ($current_from, $current_to);
        print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
        push @current_lines, \@vals;
        $known++;
        $conll[$ID_idx] = $#current_lines+1;
        $current .= join("\t", @conll) . "\n"; # conll columns
        fetch_plaintext($docid);
        if ($sentence_ends{$docid}{$current_to}) {
          $current .= "\n";
          printTokenRanges();
          print STDOUT $current;
          $current = "";
          $known = 0;
          $unknown = 0;
          @current_lines = ();
        }
      }
    } elsif (m@^\s*</fs>@) {
      my @vals = ($current_from, $current_to);
      print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
      push @current_lines, \@vals;
      # convert gathered information to CONLL
      $conll[$ID_idx] = $#current_lines+1;
      $current .= join("\t", @conll) . "\n"; # conll columns
      # A sentence ends on sentence-final punctuation tags or when it gets too long.
      if($conll[$XPOS_idx] eq '$.' || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.') || $known + $unknown >= $MAX_SENTENCE_LENGTH) {
        $current .= "\n";
        if($known + $unknown > 0) { # only print sentence if it contains some words
          printTokenRanges();
          print STDOUT $current;
        }
        $current=""; $known=0; $unknown=0;
        @current_lines = ();
      }
      while (<MORPHO_OR_TOKENPIPE>) {
        last if (m@</span>@); # only consider first interpretation
      }
    }
  }
  # Flush whatever is left of the last document of this archive.
  $current .= "\n";
  closeDoc(1);
  close(MORPHO_OR_TOKENPIPE);
  close(PLAINTEXTPIPE);
}
exit;

# Print the two offset comment lines that precede a sentence.
#
# Reads the [from, to] pairs collected in @current_lines and writes to STDOUT
#   "<COMMENT_START> start_offsets = <first from> <from of every token>"
#   "<COMMENT_START> end_offsets = <last to> <to of every token>"
# Takes no arguments. Prints nothing when no token has been collected.
sub printTokenRanges {
  # Indexing an empty list would autovivify a bogus entry in @current_lines
  # and print undefined values.
  return unless (@current_lines);

  my @starts = map { $_->[0] } @current_lines;
  my @ends   = map { $_->[1] } @current_lines;

  # The sentence-level offset comes first, followed by one offset per token.
  print STDOUT "$COMMENT_START start_offsets = ", join(" ", $starts[0], @starts), "\n";
  print STDOUT "$COMMENT_START end_offsets = ", join(" ", $ends[-1], @ends), "\n";
}

# Flush the sentence that is still pending when a document ends.
#
# The callers pass a single flag (0 or 1); it is not used here.
# If any token has been counted, the collected text in $current is trimmed
# of up to three trailing newlines, terminated by an empty line and written
# to STDOUT after its offset comment lines.
sub closeDoc {
  print STDERR "closing doc\n" if($debug);
  if(0 < $known + $unknown) { # only parse a sentence if it has some words
    chomp $current for (1 .. 3);
    $current = $current . "\n\n";
    printTokenRanges();
    print STDOUT $current;
  }
}

# read data.xml to figure out the tokens
# (ideally tokens should also be in morpho.xml, but they are not)
#
# Reads PLAINTEXTPIPE until the text of document $target_id is complete.
# Every text seen on the way is cached in %plain_texts; <layer>, <span> and
# sentence features seen on the way are recorded in %sentence_ends.
#
# Parameter: $target_id - docid of the text that is needed.
# Returns:   1 when reading is done (also if the pipe ran dry), or undef
#            when the environment variable PLAINTEXTFILTER is set and the
#            text does not match it; the cached text is then discarded.
sub fetch_plaintext {
  my ($target_id) = @_;
  my $docid;
  my $text_started=0;
  my $current_to;

  # Already cached; in base mode the structure layer must have been seen too.
  if($plain_texts{$target_id} && (!$baseOnly || $sentence_ends{$target_id}{-1})) {
    return 1;
  }

  # A lexical line variable keeps the caller's $_ intact.
  while(my $line = <PLAINTEXTPIPE>) {
    if($line =~ /<raw_text[^>]+docid="([^"]*)/) {
      $docid=$1;
      $text_started=0;
    } elsif($line =~ /<layer[^>]+docid="([^"]*)/) {
      $docid=$1;
      $sentence_ends{$docid}{-1}=1;
    } elsif($line =~ m@<span @) {
      ($current_to) = $line =~ /to="([^\"]*)"/;
    } elsif($line =~ m@<f\s[^>]*>s</f>@) {
      print STDERR "Found sentence end for $docid \@$current_to\n" if($debug);
      $sentence_ends{$docid}{$current_to}=1;
    } elsif ($line =~ m@<text>(.*)</text>@) {
      # whole text on a single line
      $plain_texts{$docid} = _normalize_plaintext($1);
      last if($docid eq $target_id);
    } elsif ($line =~ m@<text>(.*)@) {
      # first line of a multi-line text; the line break becomes a blank
      $plain_texts{$docid} = _normalize_plaintext($1) . ' ';
      $text_started=1;
    } elsif ($text_started && $line =~ m@(.*)</text>@) {
      # last line of a multi-line text
      $plain_texts{$docid} .= _normalize_plaintext($1);
      $text_started=0;
      last if($docid eq $target_id);
    } elsif ($text_started) {
      # inner line of a multi-line text
      chomp $line;
      $plain_texts{$docid} .= _normalize_plaintext($line) . ' ';
    }
  }

  # Test the requested document, not whichever one was read last: after a
  # read to the end of the pipe $docid names a different text (or nothing).
  if(defined($ENV{PLAINTEXTFILTER})) {
    my $text = $plain_texts{$target_id};
    if (!defined($text) || $text !~ $ENV{PLAINTEXTFILTER}) {
      $plain_texts{$target_id} = undef;
      print STDERR "Skipping $target_id\n";
      return(undef);
    } else {
      print STDERR "Using $target_id\n";
    }
  }
  return(1);
}

# Decode one chunk of UTF-8 octets, resolve the &lt; &gt; &amp; entities and
# map typographic dots, quotes and dashes to '.', '"', "'" and '-'.
# The mapping is one character to one character.
sub _normalize_plaintext {
  my ($octets) = @_;
  # The tr/// lists below are character literals. Without this lexically
  # scoped pragma they would be read as single bytes and the mapping would
  # hit unrelated characters of the decoded text.
  use utf8;
  my $text = decode("utf-8", $octets, Encode::FB_DEFAULT);
  $text =~ s/&lt;/</g;
  $text =~ s/&gt;/>/g;
  $text =~ s/&amp;/&/g;
  $text =~ tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣-/...""""""'''''''-/;
  return $text;
}