blob: 1fdad0d7d802417391e45fbd5e754005b8cfa4f7 [file] [log] [blame]
Marc Kupietz79ba1e52021-02-12 17:26:54 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use POSIX;
Marc Kupietzaeb84a02021-10-11 17:57:29 +02005use Getopt::Long qw(GetOptions :config no_auto_abbrev);
6use Log::Any '$log';
7use Log::Any::Adapter;
Marc Kupietz79ba1e52021-02-12 17:26:54 +01008use Encode;
9use IO::Compress::Zip qw(zip $ZipError :constants);
10use File::Basename;
Marc Kupietzaeb84a02021-10-11 17:57:29 +020011use Pod::Usage;
Marc Kupietz79ba1e52021-02-12 17:26:54 +010012
# Compression method used for every member written to the output zip.
my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;
my %opts;
# Names seen in "filename" header lines so far; used to warn about duplicates.
my %processedFilenames;

our $VERSION = '0.4.1.9000';
our $VERSION_MSG = "\nconllu2korapxml - v$VERSION\n";

use constant {
  # Set to 1 for minimal more debug output (no need to be parametrized)
  DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0
};

# Parse the command line. GetOptions returns false when it encountered an
# unknown or malformed option (it has already printed a diagnostic by then);
# in that case stop with the short usage text and exit status 2 instead of
# silently starting a conversion with unintended settings.
GetOptions(
  'force-foundry|f=s' => \(my $foundry_name = ''),
  'log|l=s' => \(my $log_level = 'warn'),

  'help|h' => sub {
    pod2usage(
      -verbose => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS|EXAMPLES',
      -msg => $VERSION_MSG,
      -output => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG,
      -output => '-'
    );
  }
) or pod2usage(2);

# Establish logger
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;

# --- State shared between the main loop and the helper subs below ---

my $docid="";          # id of the document currently being read
my $zip = undef;       # IO::Compress::Zip object, created lazily by newZipStream
my $outh = \*STDOUT;   # handle the zip archive is written to
my $parser_file;       # zip member name of the dependency layer
my $parse;             # accumulated XML of the dependency layer
my $morpho_file;       # zip member name of the morpho layer
my $morpho;            # accumulated XML of the morpho layer
my @spansFrom;         # start offsets of the current sentence, indexed by token id
my @spansTo;           # end offsets of the current sentence, indexed by token id
my $current;
my ($unknown, $known) = (0, 0);

# $write_syntax is switched on as soon as a dependency relation is seen
# and switched off again by closeDoc.
my ($write_morpho, $write_syntax, $base) = (1, 0, 0);
my $filename;          # value of the most recent "filename" header line
my $first=1;           # true until the first "filename" header has been seen

# Input files come from the command line; without arguments read STDIN.
my @conllu_files = @ARGV;
push @conllu_files, "-" if (@conllu_files == 0);
my $fh;
# Main conversion loop.
#
# Reads every input file line by line. Header lines ("filename", "foundry",
# "...id", "start_offsets"/"from", "end_offsets"/"to") update the shared
# state; every other non-empty, non-comment line is expected to be a
# 10-column, tab-separated CoNLL-U token line and is appended to the morpho
# layer (and to the dependency layer when a head and relation are present).
# A finished document is flushed into the zip by closeDoc().
foreach my $conllu_file (@conllu_files) {
  if ($conllu_file eq '-') {
    $fh = \*STDIN;
  } else {
    # Start from a fresh handle so a STDIN glob left over from a previous
    # iteration is never reopened by accident.
    undef $fh;
    open($fh, "<", $conllu_file) or die "Cannot open $conllu_file: $!";
  }
  my $s=0;              # running sentence counter, used in span ids
  my $lastDocSigle="";
  while (<$fh>) {
    if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
      $filename=$1;
      # A new filename header terminates the previous document (if any).
      if(!$first) {
        closeDoc(0);
      } else {
        $first=0;
      }
      if($processedFilenames{$filename}) {
        $log->warn("WARNING: $filename is already processed");
      }
      $processedFilenames{$filename}=1;
    } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) {
      # A foundry given on the command line (or seen earlier) wins.
      if(!$foundry_name) {
        $foundry_name = $1;
        $log->debug("Foundry: $foundry_name\n");
      }
    } elsif(/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
      $docid=$1;
      my $docSigle = $docid;
      $docSigle =~ s/\..*//;
      if($docSigle ne $lastDocSigle) {
        $log->info("Analyzing $docSigle");
        $lastDocSigle = $docSigle;
      }
      $known=$unknown=0;
      $current="";
      # Zip member names: <dir of filename, one level up>/<foundry>/<layer>.xml
      $parser_file = dirname($filename);
      $parser_file =~ s@(.*)/[^/]+$@$1@;
      $morpho_file = $parser_file;
      $morpho_file .= "/$foundry_name/morpho.xml";
      $parser_file .= "/$foundry_name/dependency.xml";
      $parse = $morpho = layer_header($docid);
    } elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
      @spansFrom = split(/\s+/, $1);
    } elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
      # Whitespace before ':' / '=' is optional here, exactly as in the
      # start_offsets pattern above.
      @spansTo = split(/\s+/, $1);
    } elsif (/^#/) {
      # Any other comment line carries no token annotation.
      next;
    } elsif (! /^\s*$/) {
      # Strip the line terminator *before* splitting: chomping $parsed[9]
      # first would extend a short array to 10 elements and defeat the
      # column count check. LIMIT -1 keeps a trailing empty column.
      chomp;
      my @parsed=split(/\t/, $_, -1);
      if(@parsed != 10) {
        $log->warn("WARNING: skipping strange parser output line in $docid");
        next;
      }
      my $t=$parsed[0];
      # Token id 1 starts a new sentence.
      if($t == 1) {
        $s++;
      }
      # Columns 7/8 (HEAD/DEPREL): emit a dependency relation when present.
      if($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
        $write_syntax=1;
        my $from=$spansFrom[$parsed[6]];
        my $to=$spansTo[$parsed[6]];
        $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
<rel label="$parsed[7]">
<span from="$from" to="$to"/>
</rel>
</span>
@;
      }
      $morpho .= qq( <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
 <f name="lex">
 <fs>
 <f name="pos">$parsed[3]</f>
);
      $morpho .= qq( <f name="lemma">$parsed[2]</f>\n) if($parsed[2] ne "_" || $parsed[1] eq '_');
      $morpho .= qq( <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_");
      if($parsed[9] ne "_") {
        # Only a value made up entirely of numeric characters (optionally
        # several numbers separated by '|' or blanks) is a certainty; an
        # unanchored test would also classify e.g. "SpaceAfter=No" as one.
        if ($parsed[9] =~ /^[-+0-9.eE|\s]+$/) {
          $morpho .= qq( <f name="certainty">$parsed[9]</f>\n)
        }
        else {
          $morpho .= qq( <f name="misc">$parsed[9]</f>\n)
        }
      }
      $morpho .= qq( </fs>
 </f>
 </fs>
 </span>
);
    }
  }
  $current .= "\n";
  # Flush the last document of this file, if the file contained one, and
  # reset the per-document state so the next input file starts cleanly
  # (otherwise its first "filename" header would write this document again).
  if (defined $morpho) {
    closeDoc(1);
    $morpho = $parse = undef;
  }
  $first = 1;
  close($fh);
}
# Finish the archive only once, after all input files have been processed;
# closing it inside the loop made every further input file fail.
$zip->close() if defined $zip;
exit;
169
# Start a new member named $fname in the output zip.
#
# On the first call the IO::Compress::Zip object is created on $outh and
# stored in the file-level $zip; on later calls a further stream is opened
# on that same object. Subsequent $zip->print calls write into the member
# opened here.
#
# Parameters: $fname - name of the zip member to start.
# Dies with the $ZipError text if the member cannot be created.
sub newZipStream {
  my ($fname) = @_;
  if (defined $zip) {
    $zip->newStream(Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
                    Append => 1, Name => $fname)
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  } else {
    # Direct method call instead of indirect object syntax ("new Class ...").
    $zip = IO::Compress::Zip->new($outh, Zip64 => 1, TextFlag => 1,
                                  Method => $_COMPRESSION_METHOD,
                                  Append => 1, Name => "$fname")
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  }
  return;
}
182
# Write the layers collected for the current document into the zip.
#
# The morpho layer is written whenever $write_morpho is set; the dependency
# layer only when $write_syntax was switched on, and that flag is cleared
# again before the layer is written. Each layer is completed with the
# closing </spanList> and </layer> tags. Callers pass a 0/1 argument, which
# is not evaluated here.
sub closeDoc {
  if ($write_morpho) {
    my $morpho_tail = " </spanList>\n</layer>\n";
    newZipStream($morpho_file);
    $zip->print($morpho, $morpho_tail);
  }
  if ($write_syntax) {
    my $syntax_tail = "</spanList>\n</layer>\n";
    $write_syntax = 0;
    newZipStream($parser_file);
    $zip->print($parse, $syntax_tail);
  }
}
194
# Build the XML prologue shared by all annotation layers of a document.
#
# Parameters: $docid - document id placed in the docid attribute of <layer>.
# Returns the XML declaration, the xml-model processing instruction, the
# opening <layer> element and the opening <spanList> element, each followed
# by a newline.
sub layer_header {
  my ($docid) = @_;
  my @header_lines = (
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>',
    qq(<layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">),
    '<spanList>',
  );
  return join("\n", @header_lines) . "\n";
}
203
204=pod
205
206=encoding utf8
207
208=head1 NAME
209
210conllu2korapxml - Conversion of KorAP-XML CoNLL-U to KorAP-XML zips
211
212=head1 SYNOPSIS
213
214 conllu2korapxml < zca15.tree_tagger.conllu > zca15.tree_tagger.zip
215
216=head1 DESCRIPTION
217
218C<conllu2korapxml> converts CoNLL-U files that follow KorAP-specific comment conventions
219 and contain morphosyntactic and/or dependency annotations to
220 corresponding KorAP-XML zip files.
221
222=head1 INSTALLATION
223
224 $ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git
225
226=head1 OPTIONS
227
228=over 2
229
230=item B<--force-foundry|-f>
231
232Set foundry name and ignore foundry names in the input.
233
234=item B<--help|-h>
235
236Print help information.
237
238=item B<--version|-v>
239
240Print version information.
241
242
243=item B<--log|-l>
244
245Loglevel for I<Log::Any>. Defaults to C<warn>.
246
247=back
248
249=head1 EXAMPLES
250
251 conllu2korapxml -f tree_tagger < t/data/wdf19.morpho.conllu > wdf19.tree_tagger.zip
252
253=head1 COPYRIGHT AND LICENSE
254
255Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
256
257Author: Marc Kupietz
258
259Contributors: Nils Diewald
260
L<KorAP::XML::CoNLL-U> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
262Corpus Analysis Platform at the
263L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
264member of the
265L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
266
267This program is free software published under the
268L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.