#!/usr/bin/env perl
use strict;
use warnings;
use POSIX;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use Log::Any '$log';
use Log::Any::Adapter;
use Encode;
use IO::Compress::Zip qw(zip $ZipError :constants);
use File::Basename;
use Pod::Usage;

# Compression method applied to every member written to the output zip.
my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;
my %opts;
# Names seen in "# filename" comments, used to warn about duplicates.
my %processedFilenames;

our $VERSION = '0.6.3';
our $VERSION_MSG = "\nconllu2korapxml - v$VERSION\n";

use constant {
  # Set to 1 for minimal more debug output (no need to be parametrized)
  DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0
};

# Command line options:
#   --force-foundry|-f  foundry name overriding the names found in the input
#   --log|-l            log level passed to the Log::Any adapter
#   --output|-o         output file ('-' means standard output)
#   --help|-h           print the POD sections listed below and exit
#   --version|-v        print the version message and exit
# GetOptions returns false when it met an invalid option; in that case print
# the usage to STDERR and exit with status 2 instead of converting anyway.
GetOptions(
  'force-foundry|f=s' => \(my $foundry_name = ''),
  'log|l=s'           => \(my $log_level = 'warn'),
  'output|o=s'        => \(my $outh = '-'),

  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS|EXAMPLES',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
) or pod2usage(2);

# Establish logger: messages go to STDERR as UTF-8 at the requested level.
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
if (DEBUG) {
  $log->notice('Debugging is activated');
}

# Conversion state shared between the main loop and the subs below.
my $docid = "";            # text id of the document being read
my $zip   = undef;         # output archive, created on first write
my ($parser_file, $parse);   # zip member name / XML of the dependency layer
my ($morpho_file, $morpho);  # zip member name / XML of the morpho layer
my (@spansFrom, @spansTo);   # token start / end offsets, indexed by token id
my $current;
my $unknown = 0;
my $known   = 0;

my $write_morpho = 1;
my $write_syntax = 0;
my $base         = 0;
my $filename;
my $first = 1;

# Input files from the command line; read standard input when none is given.
my @conllu_files = @ARGV ? @ARGV : ("-");
my $fh;

# "<foundry> dependency:<other>" selects a separate foundry for the
# dependency layer; otherwise both layers share the same foundry name.
my $dependency_foundry_name = $foundry_name;
if ($foundry_name =~ /(.*) dependency:(.*)/) {
  ($foundry_name, $dependency_foundry_name) = ($1, $2);
}

# Main conversion loop: read every input file (or STDIN for '-'), collect the
# morpho and dependency layers of each document and hand them to closeDoc()
# whenever a new "# filename" comment starts the next document or the input
# ends. All inputs are written into the same output archive.
foreach my $conllu_file (@conllu_files) {
  if ($conllu_file eq '-') {
    $fh = \*STDIN;
  } else {
    open($fh, "<", $conllu_file) or die "Cannot open $conllu_file: $!";
  }

  # Every input file starts with a clean slate: the last document of the
  # previous file has already been flushed, so the first "# filename" line
  # must not flush again, and ids/offsets of the previous file must not
  # satisfy the validity check for this one.
  $first     = 1;
  $docid     = "";
  @spansFrom = ();
  @spansTo   = ();

  my $i=0; my $s=0; my $first_in_sentence=0;
  my $lastDocSigle="";
  MAIN: while (<$fh>) {
    if (/^\s*(?:#|0\.\d)/) {
      # Comment line carrying document metadata.
      if (/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
        $filename=$1;
        if (!$first) {
          closeDoc(0);
        } else {
          $first=0;
        }
        if ($processedFilenames{$filename}) {
          $log->warn("WARNING: $filename is already processed");
        }
        $processedFilenames{$filename}=1;
        $i=0;
      } elsif (/^#\s*foundry\s*[:=]\s*(.*)/) {
        if (!$foundry_name) {
          $dependency_foundry_name = $foundry_name = $1;
          if ($foundry_name =~ /(.*) dependency:(.*)/) {
            $foundry_name = $1;
            $dependency_foundry_name = $2;
          }
          $log->debug("Foundry: $foundry_name\n");
        } else {
          $log->debug("Ignored foundry name: $1\n");
        }
      } elsif (/^#\s*generator\s*[=]\s*udpipe/i) {
        if (!$foundry_name) {
          $dependency_foundry_name = $foundry_name = "ud";
          $log->debug("Foundry: $foundry_name\n");
        } else {
          $log->debug("Ignored foundry name: ud\n");
        }
      } elsif (/^(?:#|0\.2)\s+text_id\s*[:=]\s*(.*)/) {
        $docid=$1;
        $docid =~ s/\s+$//;
        my $docSigle = $docid;
        $docSigle =~ s/\..*//;
        if ($docSigle ne $lastDocSigle) {
          $log->info("Analyzing $docSigle");
          $lastDocSigle = $docSigle;
        }
        $known=$unknown=0;
        $current="";
        # Both layers live next to each other, one directory level above
        # the directory of the file named in the "# filename" comment.
        $parser_file = dirname($filename);
        $parser_file =~ s@(.*)/[^/]+$@$1@;
        $morpho_file = $parser_file;
        $morpho_file .= "/$foundry_name/morpho.xml";
        $parser_file .= "/$dependency_foundry_name/dependency.xml";
        $parse = $morpho = layer_header($docid);
      } elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
        @spansFrom = split(/\s+/, $1);
      } elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
        # Whitespace before the separator is optional, exactly as for the
        # start offsets above.
        @spansTo = split(/\s+/, $1);
      }
    } elsif ( !/^\s*$/ ) {
      # Token line.
      if ( !$docid || scalar @spansTo == 0 || scalar @spansFrom == 0 ) {
        if ( !$docid ) {
          $log->warn("WARNING: Invalid input in $conllu_file: text_id (e.g. '# text_id = GOE_AGA.00000') missing in line $. when writing to $outh");
        }
        if ( scalar @spansTo == 0 || scalar @spansFrom == 0 ) {
          $log->warn("WARNING: Invalid input in $conllu_file: token offsets missing in line $. when writing to $outh");
        }

        # Skip to next potentially valid document
        while (<$fh>) {
          next MAIN if m!^\s*$!s;
        }
        # End of input reached while skipping: there is no line left to
        # process, so leave the read loop instead of parsing an undefined $_.
        last MAIN;
      }

      # Remove the line end before splitting and keep trailing empty
      # columns (LIMIT -1), so that the column count below is reliable.
      chomp;
      my @parsed = map {
        my $field = $_;
        $field =~ s/&/&amp;/g;
        $field =~ s/</&lt;/g;
        $field =~ s/>/&gt;/g;
        $field;
      } split(/\t/, $_, -1);
      if (@parsed != 10) {
        $log->warn("WARNING: skipping strange parser output line in $docid");
        $i++;
        next;
      }
      my $t=$parsed[0];
      if ($t == 1) {
        $s++;
        $first_in_sentence = $i;
      }
      # Dependency relation: column 7 holds the head token id, column 8 the label.
      if ($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
        $write_syntax=1;
        my $from=$spansFrom[$parsed[6]];
        my $to=$spansTo[$parsed[6]];
        $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
<rel label="$parsed[7]">
<span from="$from" to="$to"/>
</rel>
</span>
@;
      }
      my $pos = $parsed[4];
      my $upos = $parsed[3];
      # Keep only the first of several '|'-separated alternatives.
      $pos =~ s/\|.*//;
      $morpho .= qq( <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
 <f name="lex">
 <fs>
);
      if ($pos ne "_") {
        $morpho .= qq( <f name="pos">$pos</f>\n);
      }
      if ($upos ne "_") {
        $morpho .= qq( <f name="upos">$upos</f>\n);
      }
      $morpho .= qq( <f name="lemma">$parsed[2]</f>\n) if($parsed[2] ne "_" || $parsed[1] eq '_');
      $morpho .= qq( <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_");
      if ($parsed[9] ne "_") {
        # The last column is a certainty only when it consists of nothing
        # but number characters; any other text (e.g. "SpaceAfter=No") is
        # MISC. '|' and whitespace are tolerated as separators.
        # NOTE(review): assumes several certainties are joined like the
        # '|'-separated alternatives of the pos column - confirm.
        if ($parsed[9] =~ /\d/ && $parsed[9] !~ /[^0-9.eE+\-|\s]/) {
          $morpho .= qq( <f name="certainty">$parsed[9]</f>\n)
        }
        else {
          $morpho .= qq( <f name="misc">$parsed[9]</f>\n)
        }
      }
      $morpho .= qq( </fs>
 </f>
 </fs>
 </span>
);
      $i++;
    }
  }
  $current .= "\n";
  closeDoc(1);
  # The last document of this file is written; forget its targets so that a
  # following input without documents cannot emit it a second time.
  undef $morpho_file;
  undef $parser_file;
  close($fh);
}
# Finish the archive only after all inputs were converted: a closed archive
# cannot take further members.
$zip->close() if $zip;
exit;

# Start a new member named $fname in the output archive.
#
# The archive object is created on the first call (writing to $outh); every
# later call opens a further stream in the same archive. Dies with the
# IO::Compress::Zip error text when the member cannot be started.
sub newZipStream {
  my ($fname) = @_;

  # Options shared by the first and all following members.
  my %member_opts = (
    Zip64    => 1,
    TextFlag => 1,
    Method   => $_COMPRESSION_METHOD,
    Name     => "$fname",
    ExtAttr  => 0100666 << 16,
  );

  if (defined $zip) {
    $zip->newStream(%member_opts, Append => 1)
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  } else {
    $zip = IO::Compress::Zip->new($outh, %member_opts, Append => 0)
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  }
}

# Write the layers collected for the current document to the output archive.
#
# The morpho layer is written when morpho output is enabled and a target
# name is known; the dependency layer only when the document contained
# dependency relations ($write_syntax), which is reset afterwards.
# The argument passed by the callers is not used.
# Dies when a layer cannot be written, so that a truncated archive does not
# go unnoticed.
sub closeDoc {
  if ($write_morpho && $morpho_file) {
    newZipStream($morpho_file);
    $zip->print($morpho, qq( </spanList>\n</layer>\n))
      or die "ERROR ('$morpho_file'): writing to zip failed: $ZipError\n";
  }
  if ($write_syntax && $parser_file) {
    $write_syntax = 0;
    newZipStream($parser_file);
    $zip->print($parse, qq(</spanList>\n</layer>\n))
      or die "ERROR ('$parser_file'): writing to zip failed: $ZipError\n";
  }
}

# Return the opening part of a KorAP-XML layer document for $docid: the XML
# declaration, the xml-model instruction, the <layer> start tag and the
# opening <spanList>.
#
# The id ends up in an attribute value, so XML special characters in it are
# replaced by entities; ids without such characters are emitted unchanged.
sub layer_header {
  my ($docid) = @_;

  my %entity_for = (
    '&' => '&amp;',
    '<' => '&lt;',
    '>' => '&gt;',
    '"' => '&quot;',
  );
  (my $escaped_docid = $docid) =~ s/([&<>"])/$entity_for{$1}/g;

  return(qq(<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<layer docid="$escaped_docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
<spanList>
));
}

=pod

=encoding utf8

=head1 NAME

conllu2korapxml - Conversion of KorAP-XML CoNLL-U to KorAP-XML zips

=head1 SYNOPSIS

  conllu2korapxml < zca15.tree_tagger.conllu > zca15.tree_tagger.zip

=head1 DESCRIPTION

C<conllu2korapxml> converts CoNLL-U files that follow KorAP-specific comment conventions
and contain morphosyntactic and/or dependency annotations to
corresponding KorAP-XML zip files.

=head1 INSTALLATION

  $ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git

=head1 OPTIONS

=over 2

=item B<--force-foundry|-f>

Set foundry name and ignore foundry names in the input.
To write the dependency annotations to a different foundry than the
morphosyntactic annotations, append C<dependency:> and the second foundry
name, separated by a space, e.g. C<-f "tree_tagger dependency:malt">.

=item B<--help|-h>

Print help information.

=item B<--version|-v>

Print version information.

=item B<--log|-l>

Loglevel for I<Log::Any>. Defaults to C<warn>.

=item B<--output|-o>

Output file. Defaults to C<-> (stdout).

=back

=head1 EXAMPLES

  conllu2korapxml -f tree_tagger < t/data/wdf19.morpho.conllu > wdf19.tree_tagger.zip

  conllu2korapxml -f "tree_tagger dependency:malt" < t/data/wdf19.tt-malt.conllu > wdf19.tree_tagger.zip

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2021-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>

Author: Marc Kupietz

Contributors: Nils Diewald

L<KorAP::XML::CoNLL-U> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.

This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.