#!/usr/bin/env perl
use strict;
use warnings;
use POSIX;
use Getopt::Std;
use Encode;

# A sentence is force-terminated once it has collected this many tokens.
my $MAX_SENTENCE_LENGTH=10000;
# Prefix of all metadata/comment lines in the generated output.
my $COMMENT_START="#";

my $test=0;
my $text_no=0;
my %opts;
# docid => primary text (decoded and normalized), filled by fetch_plaintext()
my %plain_texts;
# docid => { end offset => 1 } for sentence spans; key -1 marks "structure layer seen"
my %sentence_ends;

my $usage=<<EOF;
Usage: $0 [options] ZIPFILE [ZIPFILE...]

Options:
  -p pattern  only extract zip entries matching */<pattern>*/...
  -d          print debugging output to stderr
  -h          print this usage message

Description:
  Convert KorAP-XML base or morpho zips to CoNLL(-U) format with all information necessary
  for reconstruction in comment lines.

Examples:
  $0 /vol/corpora/DeReKo/current/KorAP/zip/zca20.zip

  $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip

  ZIPSIGLEPATTERN='-x "*15/FEB*" "*15/MAR*"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip

  Results will be written to stdout
EOF

getopts('dhp:', \%opts);
die $usage if($opts{h} || @ARGV == 0);
my $debug=($opts{d}? 1 : 0);

# State of the document / sentence / token currently being converted.
# These file-scoped lexicals are shared with the subs defined further down.
my $docid="";
my ($current_id, $current_from, $current_to, $token);
my $current;                      # CoNLL lines of the sentence collected so far
my ($unknown, $known) = (0, 0);
my @current_lines;                # [from, to] offsets of the tokens in $current
my %processedFilenames;
# Extra arguments appended verbatim to the unzip command lines.
my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
my $baseOnly;

# Column positions of a CoNLL-U line.
my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);

my $UNZIP = `sh -c 'command -v unzip'`;
chomp $UNZIP;

if ($UNZIP eq '') {
  warn('No unzip executable found in PATH.');
  # "return" is not allowed at file level of a script that is run directly;
  # terminate with a non-zero status instead.
  exit 1;
}

# Main loop: convert every input file given on the command line.
#
# Files that look like CoNLL(-U) are copied to stdout unchanged. For zip files,
# the annotation (morpho.xml) or token (tokens.xml) entries are streamed through
# "unzip -c" and converted line by line; the primary text needed to recover the
# token forms is read in parallel from PLAINTEXTPIPE by fetch_plaintext().
#
# NOTE(review): the zip file names, -p pattern and ZIPSIGLEPATTERN are
# interpolated into a shell command line; names containing shell
# metacharacters will not work as expected.
foreach my $morpho_zip (@ARGV) {
  die "cannot open $morpho_zip" if(! -r $morpho_zip);
  my $data_zip = $morpho_zip;
  if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
    # already CoNLL(-U): pass through unchanged
    open(my $conll_fh, '<', $data_zip) or die "cannot open $data_zip: $!";
    while(<$conll_fh>) {
      print;
    }
    close($conll_fh);
    next;
  }
  # foo.<foundry>.zip is an annotation zip; its primary data lives in foo.zip
  my $foundry;
  if ($data_zip =~ s/\.([^.]+)\.zip$/.zip/) {
    $foundry = $1; # only trust the capture when the substitution matched
  }
  die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);

  my $first=1;
  my $pattern = (defined($opts{p})? $opts{p} : '');
  my @conll = ("_") x 10;
  my $filename;

  # no foundry suffix => the zip itself is the base zip (tokens only)
  $baseOnly = $morpho_zip eq $data_zip;
  my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
  if(!$baseOnly) {
    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/morpho.xml' $zipsiglepattern |";
    $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/data.xml' $zipsiglepattern |";
  } else {
    $foundry = "base";
    $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/tokens.xml' $zipsiglepattern |";
    $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*/${pattern}*/*/[sd][ta]*.xml' $zipsiglepattern |";
  }

  # Bareword handles are kept on purpose: fetch_plaintext() reads PLAINTEXTPIPE.
  open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
  open (PLAINTEXTPIPE, $plaintextAndStructureCommand) or die "cannot unzip $data_zip";
  print "$COMMENT_START foundry = $foundry\n";
  while (<MORPHO_OR_TOKENPIPE>) {
    # "unzip -c" announces each entry with an "inflating: <name>" header line;
    # matched with flexible whitespace, like the skip loop below
    if (/^\s+inflating:\s+(.*)/) {
      $filename=$1;
      # skip entries that were already converted
      while($processedFilenames{$filename} && !eof(MORPHO_OR_TOKENPIPE)) {
        print STDERR "WARNING: $filename already processed\n";
        while (<MORPHO_OR_TOKENPIPE>) {
          last if(/\s+inflating:\s+(.*)/);
        }
        $filename=$1 if(!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
      }
    } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
      # start of a new text: flush the previous one and reset the state
      last if($test && $text_no++ > 3);
      if(!$first) {
        closeDoc(0);
      }
      $processedFilenames{$filename}=1;
      $docid=$1;
      @current_lines=();
      $known=$unknown=0;
      $current="";
      if ($first) {
        $first = 0;
      }
      if(!fetch_plaintext($docid)) { # skip this text
        while (<MORPHO_OR_TOKENPIPE>) {
          last if(m@</layer>@);
        }
      }
      print STDOUT "$COMMENT_START filename = $filename\n$COMMENT_START text_id = $docid\n";
      print STDERR "Analyzing $docid\n" if ($debug);
    } elsif (m@^\s*<f\s+.*name="([^"]+)">([^<]+)</f>@) {
      # one annotation feature of the current token
      # NOTE(review): $unknown is also incremented for every pos/ctag feature;
      # $known + $unknown is only used below as a "sentence has content" /
      # sentence length measure.
      if ($1 eq "lemma") {
        $conll[$LEMMA_idx] = $2;
        $conll[$LEMMA_idx] =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
        if($conll[$LEMMA_idx] eq 'UNKNOWN') {
          $conll[$LEMMA_idx] = "--";
          $unknown++;
        } else {
          $known++;
        }
      } elsif ($1 eq 'pos' || $1 eq "ctag") {
        $unknown++;
        $conll[$XPOS_idx] = $conll[$UPOS_idx] = $2;
      } elsif ($1 eq 'msd') {
        $conll[$FEATS_idx] = $2;
      } elsif ($1 eq 'certainty') {
        $conll[$MISC_idx] = $2;
      }
    } elsif (/<span /) {
      # start of a token: cut its surface form out of the primary text
      ($current_id) = /id="[^0-9]*([^\"]*)"/;
      ($current_from) = /from="([^\"]*)"/;
      ($current_to) = /to="([^\"]*)"/;
      print STDERR "found span: $current_id $current_from $current_to\n" if($debug);
      $token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
      if (!defined $token) {
        print STDERR "WARNING: could not retrieve token for $docid at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10), "\n";
        $token = "_";
      }
      $token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
      @conll = ("_") x 10;
      $conll[$FORM_idx] = encode("utf-8", $token);
      if($baseOnly) {
        # base zips carry no <fs> annotation, so the token line is emitted here
        my @vals = ($current_from, $current_to);
        print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
        push @current_lines, \@vals;
        $known++;
        $conll[$ID_idx] = $#current_lines+1;
        $current .= join("\t", @conll) . "\n"; # conll columns
        fetch_plaintext($docid);
        # sentence boundaries come from the structure layer
        if ($sentence_ends{$docid}{$current_to}) {
          $current .= "\n";
          printTokenRanges();
          print STDOUT $current;
          $current = "";
          $known = 0;
          $unknown = 0;
          @current_lines = ();
        }
      }
    } elsif (m@^\s*</fs>@) {
      # end of the first interpretation of a token: emit its CoNLL line
      my @vals = ($current_from, $current_to);
      print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
      push @current_lines, \@vals;
      # convert gathered information to CONLL
      $conll[$ID_idx] = $#current_lines+1;
      $current .= join("\t", @conll) . "\n"; # conll columns
      # sentence ends on a sentence-final tag or when it grows too long
      if($conll[$XPOS_idx] eq '$.' || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.') || $known + $unknown >= $MAX_SENTENCE_LENGTH) {
        $current .= "\n";
        if($known + $unknown > 0) { # only print sentence if it contains some words
          printTokenRanges();
          print STDOUT $current;
        }
        $current=""; $known=0; $unknown=0;
        @current_lines = ();
      }
      while (<MORPHO_OR_TOKENPIPE>) {
        last if (m@</span>@); # only consider first interpretation
      }
    }
  }
  # flush whatever is left of the last text
  $current .= "\n";
  closeDoc(1);
  close(MORPHO_OR_TOKENPIPE);
  close(PLAINTEXTPIPE);
}
exit;

# Print the "start_offsets" and "end_offsets" comment lines for the tokens
# collected in @current_lines (one [from, to] array ref per token).
#
# The start line lists the start of the first token followed by the start of
# every token; the end line lists the end of the last token followed by the
# end of every token.
#
# Prints nothing when no tokens are pending: dereferencing the first element
# of an empty list would autovivify a bogus entry in @current_lines and emit
# malformed comment lines.
sub printTokenRanges {
  return if(!@current_lines);
  my @starts = map { $_->[0] } @current_lines;
  my @ends = map { $_->[1] } @current_lines;
  print STDOUT "$COMMENT_START start_offsets = ", join(" ", $starts[0], @starts), "\n";
  print STDOUT "$COMMENT_START end_offsets = ", join(" ", $ends[-1], @ends), "\n";
}

# Flush the sentence that is still pending in $current at the end of a text.
# Any argument passed by the caller is ignored.
# Nothing is written unless at least one token has been counted in
# $known / $unknown.
sub closeDoc {
  if ($debug) {
    print STDERR "closing doc\n";
  }
  return unless $known + $unknown > 0; # only parse a sentence if it has some words

  # strip up to three trailing newlines, then end the sentence with a blank line
  chomp $current for 1 .. 3;
  $current .= "\n\n";
  printTokenRanges();
  print STDOUT $current;
}

# read data.xml to figure out the tokens
# (ideally tokens should also be in morpho.xml, but they are not)
#
# Reads ahead in PLAINTEXTPIPE until the primary text of $target_id has been
# stored in %plain_texts. Texts of other documents met on the way are stored as
# well. Sentence end offsets found in a structure layer are recorded in
# %sentence_ends, where key -1 marks that a layer for the document was seen.
#
# Returns 1 immediately when the text is already cached (and, in base-only
# mode, its structure layer has been seen). Otherwise returns 1 after reading,
# or undef when the environment variable PLAINTEXTFILTER is set and the text
# read last does not match it.
#
# NOTE(review): if the pipe hits EOF before $target_id is found, the filter
# check below applies to whichever document was read last and the sub still
# returns 1 - confirm that this is intended.
sub fetch_plaintext {
  my ($target_id) = @_;
  my $docid;
  my $text_started=0;
  my $current_to;

  if($plain_texts{$target_id} && (!$baseOnly || $sentence_ends{$target_id}{-1})) {
    return 1;
  }
  # a named lexical keeps the caller's $_ (its own current input line) intact
  while(my $line = <PLAINTEXTPIPE>) {
    if($line =~ /<raw_text[^>]+docid="([^"]*)/) {
      $docid=$1;
      $text_started=0;
    } elsif($line =~ /<layer[^>]+docid="([^"]*)/) {
      $docid=$1;
      $sentence_ends{$docid}{-1}=1;
    } elsif($line =~ m@<span @) {
      ($current_to) = $line =~ /to="([^\"]*)"/;
    } elsif($line =~ m@<f\s[^>]*>s</f>@) {
      # the enclosing span is a sentence: remember where it ends
      print STDERR "Found sentence end for $docid \@$current_to\n" if($debug);
      $sentence_ends{$docid}{$current_to}=1;
    } elsif ($line =~ m@<text>(.*)</text>@) {
      # whole text on a single line
      $plain_texts{$docid} = _normalize_plaintext($1);
      last if($docid eq $target_id);
    } elsif ($line =~ m@<text>(.*)@) {
      # first line of a multi-line text; the line break becomes a blank
      $plain_texts{$docid} = _normalize_plaintext($1) . " ";
      $text_started=1;
    } elsif ($text_started && $line =~ m@(.*)</text>@) {
      # last line of a multi-line text
      $plain_texts{$docid} .= _normalize_plaintext($1);
      $text_started=0;
      last if($docid eq $target_id);
    } elsif ($text_started) {
      # inner line of a multi-line text; the line break becomes a blank
      chomp $line;
      $plain_texts{$docid} .= _normalize_plaintext($line) . ' ';
    }
  }
  if(defined($ENV{PLAINTEXTFILTER})) {
    if ($plain_texts{$docid} !~ $ENV{PLAINTEXTFILTER}) {
      $plain_texts{$docid} = undef;
      print STDERR "Skipping $docid\n";
      return(undef);
    } else {
      print STDERR "Using $docid\n";
    }
  }
  return(1);
}

# Decode one chunk of UTF-8 encoded primary text, resolve the XML entities
# &lt; &gt; &amp; and map typographic dots, quotes, apostrophes and dashes to
# their ASCII counterparts. Returns the resulting character string.
sub _normalize_plaintext {
  # The tr/// lists below contain UTF-8 literals. Without this (lexically
  # scoped) pragma they would be read as single bytes and applied to the
  # decoded character string, remapping unrelated Latin-1 characters.
  use utf8;
  my ($bytes) = @_;
  my $text = decode("utf-8", $bytes, Encode::FB_DEFAULT);
  $text =~ s/&lt;/</g;
  $text =~ s/&gt;/>/g;
  $text =~ s/&amp;/&/g;
  # one-to-one character mapping; the last replacement character is reused
  # for all remaining dash variants
  $text =~ tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣-/...""""""'''''''-/;
  return $text;
}