blob: aca61c3ac18a3b2236995e7304da15566a0a7be1 [file] [log] [blame]
Marc Kupietz5e7f20a2020-02-17 18:17:11 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use POSIX;
5use Getopt::Std;
6use Encode;
Marc Kupietz5e7f20a2020-02-17 18:17:11 +01007
# Once the per-sentence counter ($known + $unknown) reaches this value, the
# main loop forces a sentence break even without sentence-final punctuation.
my $MAX_SENTENCE_LENGTH=10000;
# Prefix of every comment (metadata) line written to the output.
my $COMMENT_START="#";

# Test switch: when true, the main loop stops after a few texts.
my $test=0;
# Number of texts seen so far; only advanced while $test is true.
my $text_no=0;
# Command line options, filled in by getopts() below.
my %opts;
# Plain texts read from data.xml, keyed by docid (see fetch_plaintext).
my %plain_texts;
# Help text; printed via die() for -h or when no archive is given.
my $usage=<<EOF;
Usage: $0 [options] ZIPFILE [ZIPFILE...]

Options:
 -p pattern

Description:
 Convert KorAP-XML morpho zip to CoNLL(-U) format with all information necessary
 for reconstruction in comment lines.

Examples:
 $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip

 ZIPSIGLEPATTERN='-x "*15/FEB*" "*15/MAR*"' $0 /vol/corpora/DeReKo/current/KorAP/zip/zca15.tree_tagger.zip

 Results will be written to stdout
EOF

# Accepted switches: -d (debug messages on STDERR), -h (print usage and stop),
# -p PATTERN (spliced into the member globs passed to unzip).
getopts('dhp:', \%opts);
die $usage if($opts{h} || @ARGV == 0);
my $debug=($opts{d}? 1 : 0);

# Id of the text currently being converted.
my $docid="";
# Id and character offsets of the most recent <span>, plus its surface form.
my ($current_id, $current_from, $current_to, $token);
# CoNLL lines of the sentence that is being assembled.
my $current;
# Per-sentence counters; their sum decides whether a sentence gets printed.
my ($unknown, $known) = (0, 0);
# One [from, to] pair per token of the sentence that is being assembled.
my @current_lines;
# Archive members that have already been converted (used to skip duplicates).
my %processedFilenames;
# Extra arguments appended verbatim to both unzip command lines.
my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");

# Positions of the ten CoNLL-U columns inside @conll.
my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);
46
# Locate the unzip executable; both archive readers in the main loop need it.
my $UNZIP = `sh -c 'command -v unzip'`;
chomp $UNZIP;

# Stop right away when unzip is missing. A `return` is not permitted at file
# level (perl aborts with "Can't return outside a subroutine"), so the script
# terminates explicitly with the diagnostic and a non-zero exit status.
if ($UNZIP eq '') {
  die('No unzip executable found in PATH.');
}
55
# Main loop: convert every file given on the command line to CoNLL-U on stdout.
# For each morpho archive foo.<foundry>.zip the primary data is read from the
# sibling archive foo.zip; both are streamed through `unzip -c`.
# NOTE(review): archive names are interpolated unquoted into a shell command,
# so names containing spaces or shell metacharacters will not work.
foreach my $morpho_zip (@ARGV) {
  die "cannot open $morpho_zip" if(! -r $morpho_zip);
  my $data_zip = $morpho_zip;

  # Arguments that already are CoNLL(-U) files are copied through unchanged.
  # Three-argument open with a lexical handle keeps odd file names from being
  # interpreted as an open mode.
  if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
    open(my $conll_fh, '<', $data_zip) or die "cannot open $data_zip";
    while(<$conll_fh>) {
      print;
    }
    close($conll_fh);
    next;
  }

  # Strip the foundry infix to get the name of the primary data archive.
  # $1 is only meaningful when the substitution actually matched.
  my $foundry = "";
  if ($data_zip =~ s/\.([^.]+)\.zip$/.zip/) {
    $foundry = $1;
  }
  die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);

  my $first=1;                      # true until the first <layer> was seen
  my $pattern = (defined($opts{p})? $opts{p} : '');
  my @conll = ("_") x 10;           # columns of the token being assembled
  my $filename;                     # archive member currently being read

  my $morphocommand = "$UNZIP -c $morpho_zip '*/${pattern}*/*/*/morpho.xml' $zipsiglepattern |";
  # print STDERR $morphocommand, "\n";
  open (MORPHOPIPE, $morphocommand) or die "cannot unzip $morpho_zip";
  open (PLAINTEXTPIPE, "$UNZIP -c $data_zip '*/${pattern}*/*/data.xml' $zipsiglepattern |") or die "cannot unzip $data_zip";
  print "$COMMENT_START foundry = $foundry\n";
  while (<MORPHOPIPE>) {
    # Member header written by unzip. Whitespace is matched flexibly, in the
    # same way as in the skip loop below, instead of relying on exact columns.
    if (/^\s+inflating:\s+(.*)/) {
      $filename=$1;
      # Skip members that were converted before (e.g. from an earlier archive).
      while($processedFilenames{$filename} && !eof(MORPHOPIPE)) {
        print STDERR "WARNING: $filename already processed\n";
        while (<MORPHOPIPE>) {
          last if(/\s+inflating:\s+(.*)/);
        }
        $filename=$1 if(!eof(MORPHOPIPE) && /\s+inflating:\s+(.*)/);
      }
    } elsif(m@^\s*<layer\s+.*docid="([^"]+)"@) {
      # A new text starts: flush the previous one and reset the sentence state.
      last if($test && $text_no++ > 3);
      if(!$first) {
        closeDoc(0);
      }
      $processedFilenames{$filename}=1;
      $docid=$1;
      @current_lines=();
      $known=$unknown=0;
      $current="";
      if ($first) {
        $first = 0;
      }
      if(!fetch_plaintext($docid)) { # skip this text
        while (<MORPHOPIPE>) {
          last if(m@</layer>@);
        }
      }
      print STDOUT "$COMMENT_START filename = $filename\n$COMMENT_START text_id = $docid\n";
      print STDERR "Analyzing $docid\n" if ($debug);
    } elsif (m@^\s*<f\s+.*name="([^"]+)">([^<]+)</f>@) {
      # One feature of the current token; map it onto its CoNLL column.
      if ($1 eq "lemma") {
        $conll[$LEMMA_idx] = $2;
        $conll[$LEMMA_idx] =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
        if($conll[$LEMMA_idx] eq 'UNKNOWN') {
          $conll[$LEMMA_idx] = "--";
          $unknown++;
        } else {
          $known++;
        }
      } elsif ($1 eq 'pos' || $1 eq "ctag") {
        $unknown++;
        $conll[$XPOS_idx] = $conll[$UPOS_idx] = $2;
      } elsif ($1 eq 'msd') {
        $conll[$FEATS_idx] = $2;
      } elsif ($1 eq 'certainty') {
        $conll[$MISC_idx] = $2;
      }
    } elsif (/<span /) {
      # A new token: cut its surface form out of the plain text by offsets.
      ($current_id) = /id="[^0-9]*([^\"]*)"/;
      ($current_from) = /from="([^\"]*)"/;
      ($current_to) = /to="([^\"]*)"/;
      print STDERR "found span: $current_id $current_from $current_to\n" if($debug);
      $token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
      if (!defined $token) {
        print STDERR "WARNING: could not retrieve token for $docid at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10), "\n";
        $token = "_";
      }
      $token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
      @conll = ("_") x 10;
      $conll[$FORM_idx] = encode("utf-8", $token);
    } elsif (m@^\s*</fs>@) {
      # End of the first interpretation: the token is complete.
      my @vals = ($current_from, $current_to);
      print STDERR "joining : ", join(" ", @vals), "\n" if($debug);
      push @current_lines, \@vals;
      # convert gathered information to CONLL
      $conll[$ID_idx] = $#current_lines+1;
      $current .= join("\t", @conll) . "\n"; # conll columns
      # Sentence boundary: sentence-final tag, or the length limit was reached.
      if($conll[$XPOS_idx] eq '$.' || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.') || $known + $unknown >= $MAX_SENTENCE_LENGTH) {
        $current .= "\n";
        if($known + $unknown > 0) { # only print sentence if it contains some words
          printTokenRanges();
          print STDOUT $current;
        }
        $current=""; $known=0; $unknown=0;
        @current_lines = ();
      }
      while (<MORPHOPIPE>) {
        last if (m@</span>@); # only consider first interpretation
      }
    }
  }
  # Flush whatever is left of the last text of this archive.
  $current .= "\n";
  closeDoc(1);
  close(MORPHOPIPE);
  close(PLAINTEXTPIPE);
}
exit;
169
# Write the two offset comment lines that precede a sentence:
#   <COMMENT_START> start_offsets = <start of first token> <start of every token>
#   <COMMENT_START> end_offsets = <end of last token> <end of every token>
# Takes no arguments; reads the [from, to] pairs stored in @current_lines.
sub printTokenRanges {
  my $sentence_start = $current_lines[0]->[0];
  my $token_starts = join("", map { " $_->[0]" } @current_lines);
  print STDOUT "$COMMENT_START start_offsets = ", $sentence_start, $token_starts;

  my $sentence_end = $current_lines[$#current_lines]->[1];
  my $token_ends = join("", map { " $_->[1]" } @current_lines);
  print STDOUT "\n$COMMENT_START end_offsets = ", $sentence_end, $token_ends, "\n";
}
181
# Flush the sentence that is still pending in $current when a text ends.
# Nothing is written unless the sentence counters show at least one word.
# Callers pass one argument, which this routine does not look at.
sub closeDoc {
  print STDERR "closing doc\n" if($debug);

  # only parse a sentence if it has some words
  return unless ($known + $unknown > 0);

  # Drop up to three trailing newlines, then end the sentence with exactly
  # one blank line.
  chomp $current for (1 .. 3);
  $current .= "\n\n";
  printTokenRanges();
  print STDOUT $current;
}
193
# read data.xml to figure out the tokens
# (ideally tokens should also be in morpho.xml, but they are not)
#
# fetch_plaintext($target_id)
#   Makes sure the plain text of $target_id is available in %plain_texts,
#   reading ahead on PLAINTEXTPIPE if necessary. Texts passed on the way are
#   cached under their own docid as well.
#   If the environment variable PLAINTEXTFILTER is set, it is used as a
#   regular expression that the text has to match.
#   Returns 1 if the text is to be used, and a false value if it was rejected
#   by the filter (the caller then skips the text).
sub fetch_plaintext {
  my ($target_id) = @_;
  my $docid;
  my $text_started=0;

  # A text cached by an earlier read-ahead is usable as it is; only read
  # from the pipe when the target is not cached yet. (The cached case used to
  # return a false value, which made the caller skip an available text.)
  if(!$plain_texts{$target_id}) {
    # A lexical line variable keeps the caller's $_ intact; the caller invokes
    # this routine from inside its own while(<...>) loop.
    while(my $line = <PLAINTEXTPIPE>) {
      if($line =~ /<raw_text[^>]+docid="([^"]*)/) {
        $docid=$1;
        $text_started=0;
      } elsif ($line =~ m@<text>(.*)</text>@) {
        # complete text on a single line
        $plain_texts{$docid} = _normalize_plaintext($1);
        last if($docid eq $target_id);
      } elsif ($line =~ m@<text>(.*)@) {
        # first line of a multi-line text; the line break becomes a blank
        $plain_texts{$docid} = _normalize_plaintext($1) . " ";
        $text_started=1;
      } elsif ($text_started && $line =~ m@(.*)</text>@) {
        # last line of a multi-line text
        $plain_texts{$docid} .= _normalize_plaintext($1);
        $text_started=0;
        last if($docid eq $target_id);
      } elsif ($text_started) {
        # inner line of a multi-line text; the line break becomes a blank
        chomp $line;
        $plain_texts{$docid} .= _normalize_plaintext($line) . ' ';
      }
    }
  }

  # Apply the optional filter to the requested text itself, not to whichever
  # docid happened to be read last (which is undefined on an empty pipe).
  if(defined($ENV{PLAINTEXTFILTER})) {
    my $text = $plain_texts{$target_id};
    if (!defined($text) || $text !~ $ENV{PLAINTEXTFILTER}) {
      $plain_texts{$target_id} = undef;
      print STDERR "Skipping $target_id\n";
      return;
    }
    print STDERR "Using $target_id\n";
  }
  return(1);
}

# Decode one raw fragment of data.xml and bring it into the form the token
# offsets refer to: UTF-8 decoded, the XML entities &lt; &gt; &amp; resolved,
# and typographic punctuation mapped one-to-one onto plain substitutes.
# NOTE(review): this file does not enable `use utf8`, so the non-ASCII search
# list of the tr below is compiled as bytes rather than characters - confirm
# that this is intended.
sub _normalize_plaintext {
  my ($raw) = @_;
  my $text = decode("utf-8", $raw, Encode::FB_DEFAULT);
  $text =~ s/&lt;/</g;
  $text =~ s/&gt;/>/g;
  $text =~ s/&amp;/&/g;
  $text =~ tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣-/...""""""'''''''-/;
  return $text;
}