blob: 32d6ec8b83ffe14899eef86a1e1ccfe59857dffc [file] [log] [blame]
Marc Kupietz5e7f20a2020-02-17 18:17:11 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use POSIX;
5use Getopt::Std;
6use Encode;
7use List::Util qw[min max];
8
# Hard cap on the per-sentence annotation counter ($known + $unknown); when it
# is reached the main loop forces a sentence break.
my $MAX_SENTENCE_LENGTH = 10000;
# Prefix of every comment line written to the CoNLL-U output.
my $COMMENT_START = "#";

# When $test is true the main loop stops after a handful of texts
# ($text_no counts them).
my $test = 0;
my $text_no = 0;
my %opts;
# Plain text of every document read so far, keyed by document id.
my %plain_texts;
my $usage = <<EOF;
Usage: $0 [options] ZIPFILE [ZIPFILE...]

Options:
  -p pattern  only extract zip entries matching */PATTERN*/*/*/morpho.xml
  -d          print debugging information to stderr
  -h          print this help and exit

Environment:
  ZIPSIGLEPATTERN  additional arguments appended to the unzip calls
  PLAINTEXTFILTER  regular expression; texts whose plain text does not
                   match it are skipped

Description:
  Convert KorAP-XML morpho zip to CoNLL(-U) format with all information necessary
  for reconstruction in comment lines.

Examples:
  $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip

  ZIPSIGLEPATTERN='-x "*15/FEB*" "*15/MAR*"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip

  Results will be written to stdout
EOF

# getopts returns false on an unknown option; treat that as a usage error
# instead of silently continuing.
getopts('dhp:', \%opts) or die $usage;
if ($opts{h}) {
  # An explicit help request is not an error: stdout, exit status 0.
  print STDOUT $usage;
  exit 0;
}
die $usage if (@ARGV == 0);
my $debug = ($opts{d} ? 1 : 0);

# State of the text / sentence / token currently being converted.
my $docid = "";
my ($current_id, $current_from, $current_to, $token);
my $current = "";                 # CoNLL lines of the sentence being built
my ($unknown, $known) = (0, 0);   # annotation counters of the current sentence
my @current_lines;                # [from, to] offset pairs of the current sentence
my %processedFilenames;           # zip entries that have already been converted
my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN}) ? $ENV{ZIPSIGLEPATTERN} : "");

# Column positions of the ten CoNLL-U fields.
my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);
47
# Locate the unzip executable once; both extraction pipes in the main loop
# are built from this path.
my $UNZIP = `sh -c 'command -v unzip'`;
$UNZIP = '' unless defined $UNZIP;   # backticks yield undef if the shell could not be run
chomp $UNZIP;

# Without unzip nothing can be converted. A bare "return" is not allowed at
# file level (Perl aborts with "Can't return outside a subroutine"), so stop
# with a proper error message and a non-zero exit status instead.
if ($UNZIP eq '') {
  die "No unzip executable found in PATH.\n";
}
56
# Main loop: for every file given on the command line, stream the morpho.xml
# entries (annotations) and the data.xml entries (plain text) out of the zip
# archives through unzip and print one CoNLL line per annotated span.
Marc Kupietz5e7f20a2020-02-17 18:17:11 +010057foreach my $morpho_zip (@ARGV) {
58 die "cannot open $morpho_zip" if(! -r $morpho_zip);
59 my $data_zip = $morpho_zip;
# Inputs whose name does not contain ".zip" but does contain ".conll"/".conllu"
# are copied to stdout unchanged.
60 if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
61 open(CONLL, "<$data_zip") or die "cannot open $data_zip";
62 while(<CONLL>) {
63 print;
64 }
65 close(CONLL);
66 next;
67 }
# Drop the ".<foundry>" part in front of ".zip" to get the name of the archive
# that holds data.xml; the dropped part is used as the foundry name.
# NOTE(review): $1 is only meaningful if the substitution matched - confirm
# that every input is named NAME.FOUNDRY.zip.
68 $data_zip =~ s/\.([^.]+)\.zip$/.zip/;
69 my $foundry = $1;
70 die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);
71
72 my $first=1;
73 my $pattern = (defined($opts{p})? $opts{p} : '');
74 my @conll = ("_") x 10;
75 my $filename;
76
# Both pipes run "unzip -c" through the shell. $pattern narrows the entry
# paths that are extracted and $zipsiglepattern is appended verbatim.
# NOTE(review): the archive names are interpolated unquoted into the shell
# command line.
Marc Kupietzc7d1b932020-09-23 13:17:17 +020077 my $morphocommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/morpho.xml' $zipsiglepattern |";
Marc Kupietz7022cc12020-09-22 14:32:34 +020078 # print STDERR $morphocommand, "\n";
Marc Kupietz5e7f20a2020-02-17 18:17:11 +010079 open (MORPHOPIPE, $morphocommand) or die "cannot unzip $morpho_zip";
Marc Kupietzc7d1b932020-09-23 13:17:17 +020080 open (PLAINTEXTPIPE, "$UNZIP -c $data_zip '*/${pattern}*/*/data.xml' $zipsiglepattern |") or die "cannot unzip $data_zip";
Marc Kupietz5e7f20a2020-02-17 18:17:11 +010081 print "$COMMENT_START foundry = $foundry\n";
82 while (<MORPHOPIPE>) {
# An "inflating: NAME" line is taken as the name of the entry whose content
# follows on the pipe.
Marc Kupietz30c41b12020-09-22 14:32:34 +020083 if (/^ inflating: (.*)/) {
Marc Kupietz5e7f20a2020-02-17 18:17:11 +010084 $filename=$1;
# Entries that were already converted are skipped by reading ahead to the
# next "inflating:" line (or to the end of the pipe).
85 while($processedFilenames{$filename} && !eof(MORPHOPIPE)) {
86 print STDERR "WARNING: $filename already processed\n";
87 while (<MORPHOPIPE>) {
88 last if(/\s+inflating:\s+(.*)/);
89 }
90 $filename=$1 if(!eof(MORPHOPIPE) && /\s+inflating:\s+(.*)/);
91 }
# A <layer ... docid="..."> line starts a new text: flush the sentence left
# over from the previous text, reset the per-text state and load the plain text.
Marc Kupietz30c41b12020-09-22 14:32:34 +020092 } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
Marc Kupietz5e7f20a2020-02-17 18:17:11 +010093 last if($test && $text_no++ > 3);
94 if(!$first) {
95 closeDoc(0);
96 }
97 $processedFilenames{$filename}=1;
98 $docid=$1;
99 @current_lines=();
100 $known=$unknown=0;
101 $current="";
102 if ($first) {
103 $first = 0;
104 }
# A false return value means the text is to be left out: consume the rest of
# its <layer> element.
105 if(!fetch_plaintext($docid)) { # skip this text
106 while (<MORPHOPIPE>) {
107 last if(m@</layer>@);
108 }
109 }
# These header comments are printed whether or not the text was skipped above.
110 print STDOUT "$COMMENT_START filename = $filename\n$COMMENT_START text_id = $docid\n";
111 print STDERR "Analyzing $docid\n" if ($debug);
# <f name="...">value</f> lines fill the CoNLL columns of the current token.
Marc Kupietz30c41b12020-09-22 14:32:34 +0200112 } elsif (m@^\s*<f\s+.*name="([^"]+)">([^<]+)</f>@) {
Marc Kupietz5e7f20a2020-02-17 18:17:11 +0100113 if ($1 eq "lemma") {
114 $conll[$LEMMA_idx] = $2;
115 $conll[$LEMMA_idx] =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
116 if($conll[$LEMMA_idx] eq 'UNKNOWN') {
117 $conll[$LEMMA_idx] = "--";
118 $unknown++;
119 } else {
120 $known++;
121 }
# NOTE(review): a pos/ctag value increments $unknown as well; in this loop the
# two counters are only ever used as a sum - confirm that this is intended.
122 } elsif ($1 eq 'pos' || $1 eq "ctag") {
123 $unknown++;
124 $conll[$XPOS_idx] = $conll[$UPOS_idx] = $2;
125 } elsif ($1 eq 'msd') {
126 $conll[$FEATS_idx] = $2;
127 } elsif ($1 eq 'certainty') {
128 $conll[$MISC_idx] = $2;
129 }
# A <span> line opens a new token: read its id and character offsets, cut the
# surface form out of the plain text and start a fresh set of CoNLL columns.
130 } elsif (/<span /) {
131 ($current_id) = /id="[^0-9]*([^\"]*)"/;
132 ($current_from) = /from="([^\"]*)"/;
133 ($current_to) = /to="([^\"]*)"/;
134 print STDERR "found span: $current_id $current_from $current_to\n" if($debug);
Marc Kupietz7e71a822020-06-22 17:14:30 +0200135 $token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
Marc Kupietz5e7f20a2020-02-17 18:17:11 +0100136 if (!defined $token) {
137 print STDERR "WARNING: could not retrieve token for $docid at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10), "\n";
138 $token = "_";
139 }
140 $token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
141 @conll = ("_") x 10;
142 $conll[$FORM_idx] = encode("utf-8", $token);
# </fs> closes a feature set: remember the token offsets and append the CoNLL
# line to the sentence being built.
Marc Kupietz30c41b12020-09-22 14:32:34 +0200143 } elsif (m@^\s*</fs>@) {
Marc Kupietz5e7f20a2020-02-17 18:17:11 +0100144 my @vals = ($current_from, $current_to);
145 print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
146 push @current_lines, \@vals;
147 # convert gathered information to CONLL
148 $conll[$ID_idx] = $#current_lines+1;
149 $current .= join("\t", @conll) . "\n"; # conll columns
# Sentence boundary: XPOS is "$.", or the token is "." tagged SENT, or the
# counter sum has reached $MAX_SENTENCE_LENGTH.
150 if($conll[$XPOS_idx] eq '$.' || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.') || $known + $unknown >= $MAX_SENTENCE_LENGTH) {
151 $current .= "\n";
152 if($known + $unknown > 0) { # only print sentence if it contains some words
153 printTokenRanges();
154 print STDOUT $current;
155 }
156 $current=""; $known=0; $unknown=0;
157 @current_lines = ();
158 }
159 while (<MORPHOPIPE>) {
160 last if (m@</span>@); # only consider first interpretation
161 }
162 }
163 }
# End of this archive: flush whatever is left of the last text.
164 $current .= "\n";
165 closeDoc(1);
166 close(MORPHOPIPE);
167 close(PLAINTEXTPIPE);
168}
169exit;
170
# Print the two offset comment lines that precede a sentence.
#
# Reads the file-level @current_lines (one [from, to] pair per token) and
# $COMMENT_START. The start_offsets line begins with the start of the first
# token followed by the start of every token (so the first value appears
# twice); the end_offsets line begins with the end of the last token followed
# by the end of every token.
sub printTokenRanges {
  my $sentence_start = $current_lines[0]->[0];
  my $start_list = join(" ", $sentence_start, map { $_->[0] } @current_lines);

  my $sentence_end = $current_lines[$#current_lines]->[1];
  my $end_list = join(" ", $sentence_end, map { $_->[1] } @current_lines);

  print STDOUT "$COMMENT_START start_offsets = $start_list";
  print STDOUT "\n$COMMENT_START end_offsets = $end_list";
  print STDOUT "\n";
}
182
# Flush the sentence that is still pending when a text ends.
#
# $end is accepted from the callers but not consulted. Works on the
# file-level $current, $known and $unknown; nothing is printed when no
# annotation has been counted for the pending sentence.
sub closeDoc {
  my ($end) = @_;
  print STDERR "closing doc\n" if ($debug);
  my $counted = $known + $unknown;
  if ($counted > 0) { # only parse a sentence if it has some words
    # Strip up to three trailing newlines, then terminate the sentence with
    # exactly one empty line.
    chomp $current for (1 .. 3);
    $current .= "\n\n";
    printTokenRanges();
    print STDOUT $current;
  }
}
195
# Decode one chunk of raw data.xml text and normalise it.
#
# The chunk is decoded from UTF-8, the XML entities &lt; &gt; &amp; are
# resolved and typographic dots, quotes, apostrophes and dashes are replaced
# by their ASCII counterparts. The tr/// mapping is strictly one character to
# one character, so it does not change the length of the text.
sub _normalize_plaintext {
  my ($raw) = @_;
  # The tr/// lists below contain non-ASCII literals and are applied to a
  # decoded (character) string, so they have to be read as characters as well.
  # Without this pragma they would be taken as single bytes and hit unrelated
  # Latin-1 letters.
  use utf8;
  my $text = decode("utf-8", $raw, Encode::FB_DEFAULT);
  $text =~ s/&lt;/</g;
  $text =~ s/&gt;/>/g;
  $text =~ s/&amp;/&/g;   # must come last so that "&amp;lt;" is not resolved twice
  $text =~ tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣-/...""""""'''''''-/;
  return $text;
}

# Read data.xml to figure out the tokens
# (ideally tokens should also be in morpho.xml, but they are not).
#
# fetch_plaintext($target_id) makes sure that $plain_texts{$target_id} holds
# the plain text of the requested document. Unless the text is already cached,
# PLAINTEXTPIPE is read forward until the text with that id is complete; every
# other text passed on the way is cached in %plain_texts too.
#
# Returns 1 if the text is available and should be converted. Returns false if
# the text could not be found on the pipe, or if the PLAINTEXTFILTER
# environment variable is set and the text does not match that pattern (the
# cached text is dropped in that case).
sub fetch_plaintext {
  my ($target_id) = @_;
  my $docid;
  my $text_started = 0;

  if (!defined $plain_texts{$target_id}) {
    # A named line variable keeps the caller's $_ (the current morpho.xml
    # line) intact.
    while (my $line = <PLAINTEXTPIPE>) {
      if ($line =~ /<raw_text[^>]+docid="([^"]*)/) {
        $docid = $1;
        $text_started = 0;
      } elsif (!defined $docid) {
        # Nothing before the first <raw_text docid="..."> can be attributed
        # to a document.
        next;
      } elsif ($line =~ m@<text>(.*)</text>@) {
        # Whole text on a single line.
        $plain_texts{$docid} = _normalize_plaintext($1);
        last if ($docid eq $target_id);
      } elsif ($line =~ m@<text>(.*)@) {
        # First line of a multi-line text; the line break becomes a blank.
        $plain_texts{$docid} = _normalize_plaintext($1) . " ";
        $text_started = 1;
      } elsif ($text_started && $line =~ m@(.*)</text>@) {
        # Last line of a multi-line text.
        $plain_texts{$docid} .= _normalize_plaintext($1);
        $text_started = 0;
        last if ($docid eq $target_id);
      } elsif ($text_started) {
        # Line in the middle of a multi-line text.
        chomp $line;
        $plain_texts{$docid} .= _normalize_plaintext($line) . ' ';
      }
    }
  }

  # The pipe ran dry without delivering the requested text.
  if (!defined $plain_texts{$target_id}) {
    print STDERR "WARNING: no plain text available for $target_id\n";
    return;
  }

  if (defined($ENV{PLAINTEXTFILTER})) {
    if ($plain_texts{$target_id} !~ $ENV{PLAINTEXTFILTER}) {
      $plain_texts{$target_id} = undef;
      print STDERR "Skipping $target_id\n";
      return;
    } else {
      print STDERR "Using $target_id\n";
    }
  }
  return 1;
}