Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 1 | #!/usr/bin/env perl |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | use POSIX; |
Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame] | 5 | use Getopt::Long qw(GetOptions :config no_auto_abbrev); |
| 6 | use Log::Any '$log'; |
| 7 | use Log::Any::Adapter; |
Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 8 | use Encode; |
| 9 | use IO::Compress::Zip qw(zip $ZipError :constants); |
| 10 | use File::Basename; |
Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame] | 11 | use Pod::Usage; |
Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 12 | |
# Compression method passed as "Method" for every member written to the
# output zip.
my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;

# NOTE(review): %opts is declared here but not referenced in the visible code.
my %opts;

# Filenames already announced by a "filename" comment line; consulted to
# warn when the same filename shows up again.
my %processedFilenames;

our $VERSION     = '0.6.3';
our $VERSION_MSG = "\nconllu2korapxml - v$VERSION\n";

# Export KORAPXMLCONLLU_DEBUG=1 for a little more debug output.  This is read
# from the environment only, there is no command line switch for it.
use constant DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0;
Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 24 | |
# Option targets with their defaults.  A forced foundry name overrides any
# foundry announced inside the input.
my $foundry_name = '';
my $log_level    = 'warn';

# --help: print the listed POD sections, preceded by the version banner.
my $print_help = sub {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS|EXAMPLES',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# --version: print the version banner with the terse usage text.
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

GetOptions(
  'force-foundry|f=s' => \$foundry_name,
  'log|l=s'           => \$log_level,
  'help|h'            => $print_help,
  'version|v'         => $print_version
);
Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 45 | |
# Route log output to STDERR as UTF-8, at the level chosen on the command line.
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;

# --- Conversion state shared between the main loop and the subs below ---

# Current document id (taken from the "text_id" comment line).
my $docid = "";

# Output archive: $zip is created lazily on the first write, $outh is the
# target handed to IO::Compress::Zip.
my $zip;
my $outh = '-';

# Zip member name and accumulated XML of the dependency layer ...
my $parser_file;
my $parse;

# ... and of the morphology layer.
my $morpho_file;
my $morpho;

# Token offsets of the current document, indexed by token id.
my @spansFrom;
my @spansTo;

my $current;
my ($unknown, $known) = (0, 0);

# Layer switches: $write_syntax is raised as soon as a dependency relation
# has been seen.
my ($write_morpho, $write_syntax, $base) = (1, 0, 0);

my $filename;

# True until the first "filename" line has been seen.
my $first = 1;

# Input files; read STDIN ("-") when none are given.
my @conllu_files = @ARGV ? @ARGV : ("-");
my $fh;

# A forced foundry of the form "<morpho> dependency:<dep>" names separate
# foundries for the morphology and the dependency layer; otherwise both
# layers share the same name.
my $dependency_foundry_name = $foundry_name;
if ($foundry_name =~ /(.*) dependency:(.*)/) {
  ($foundry_name, $dependency_foundry_name) = ($1, $2);
}
| 75 | |
# Characters that must not appear literally in XML text or attribute values,
# mapped to their entity names.  The entity text is assembled at run time so
# that the escaping survives tools that decode entities in the source text.
my %xml_entity_for = ('&' => 'amp', '<' => 'lt', '>' => 'gt');

# Convert every input file.  Comment lines (starting with "#" or a "0.N"
# pseudo id) carry document metadata; all other non-blank lines are token
# rows with ten tab separated CoNLL-U columns.
foreach my $conllu_file (@conllu_files) {
  if ($conllu_file eq '-') {
    $fh = \*STDIN;
  } else {
    # Start from a fresh handle so that a preceding "-" input does not make
    # open() re-open STDIN itself.
    $fh = undef;
    open($fh, "<", $conllu_file) or die "Cannot open $conllu_file: $!";
  }

  my $i = 0;                   # token rows seen in the current document
  my $s = 0;                   # sentence counter, used in span ids
  my $first_in_sentence = 0;
  my $lastDocSigle = "";

  MAIN: while (<$fh>) {
    if (/^\s*(?:#|0\.\d)/) {
      if (/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
        # A new filename starts a new document: flush the previous one.
        $filename = $1;
        if (!$first) {
          closeDoc(0);
        } else {
          $first = 0;
        }
        if ($processedFilenames{$filename}) {
          $log->warn("WARNING: $filename is already processed");
        }
        $processedFilenames{$filename} = 1;
        $i = 0;
      } elsif (/^#\s*foundry\s*[:=]\s*(.*)/) {
        # A foundry named in the input only counts if none was set before.
        if (!$foundry_name) {
          $dependency_foundry_name = $foundry_name = $1;
          if ($foundry_name =~ /(.*) dependency:(.*)/) {
            $foundry_name = $1;
            $dependency_foundry_name = $2;
          }
          $log->debug("Foundry: $foundry_name\n");
        } else {
          $log->debug("Ignored foundry name: $1\n");
        }
      } elsif (/^#\s*generator\s*[=]\s*udpipe/i) {
        if (!$foundry_name) {
          $dependency_foundry_name = $foundry_name = "ud";
          $log->debug("Foundry: $foundry_name\n");
        } else {
          $log->debug("Ignored foundry name: ud\n");
        }
      } elsif (/^(?:#|0\.2)\s+text_id\s*[:=]\s*(.*)/) {
        $docid = $1;
        my $docSigle = $docid;
        $docSigle =~ s/\..*//;
        if ($docSigle ne $lastDocSigle) {
          $log->info("Analyzing $docSigle");
          $lastDocSigle = $docSigle;
        }
        $known = $unknown = 0;
        $current = "";

        # Both layers are written next to the directory of the input member:
        # <dir>/<foundry>/morpho.xml and <dir>/<foundry>/dependency.xml.
        $parser_file = dirname($filename);
        $parser_file =~ s@(.*)/[^/]+$@$1@;
        $morpho_file = $parser_file;
        $morpho_file .= "/$foundry_name/morpho.xml";
        $parser_file .= "/$dependency_foundry_name/dependency.xml";
        $parse = $morpho = layer_header($docid);
      } elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
        @spansFrom = split(/\s+/, $1);
      } elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
        # Whitespace before the separator is optional, as for start_offsets.
        @spansTo = split(/\s+/, $1);
      }
    } elsif (!/^\s*$/) {
      if (!$docid || scalar @spansTo == 0 || scalar @spansFrom == 0) {
        if (!$docid) {
          $log->warn("WARNING: No valid input document: text_id (e.g. '# text_id = GOE_AGA.00000') missing");
        }
        if (scalar @spansTo == 0 || scalar @spansFrom == 0) {
          $log->warn("WARNING: No valid input document: token offsets missing");
        }

        # Skip to next potentially valid document
        while (<$fh>) {
          next MAIN if m!^\s*$!s;
        }

        # The input ended inside the invalid document; there is no line left
        # to process.
        last MAIN;
      }

      # Strip the line ending before splitting and keep trailing empty
      # columns (limit -1), so that the column count below reflects the row.
      chomp(my $line = $_);
      my @parsed = map {
        my $field = $_;
        $field =~ s/([&<>])/&$xml_entity_for{$1};/g;
        $field;
      } split(/\t/, $line, -1);

      if (@parsed != 10) {
        $log->warn("WARNING: skipping strange parser output line in $docid");
        $i++;
        next MAIN;
      }

      # Token id 1 opens a new sentence.
      my $t = $parsed[0];
      if ($t == 1) {
        $s++;
        $first_in_sentence = $i;
      }

      # Dependency relation: only written when a head id and a label without
      # underscore are present.
      if ($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
        $write_syntax = 1;
        my $from = $spansFrom[$parsed[6]];
        my $to   = $spansTo[$parsed[6]];
        $parse .= qq(<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">\n)
                . qq(<rel label="$parsed[7]">\n)
                . qq(<span from="$from" to="$to"/>\n)
                . qq(</rel>\n)
                . qq(</span>\n);
      }

      # Morphology: only the first alternative of the XPOS column is used.
      my $pos  = $parsed[4];
      my $upos = $parsed[3];
      $pos =~ s/\|.*//;
      $morpho .= qq( <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">\n)
               . qq(<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">\n)
               . qq(<f name="lex">\n)
               . qq(<fs>\n);
      if ($pos ne "_") {
        $morpho .= qq( <f name="pos">$pos</f>\n);
      }
      if ($upos ne "_") {
        $morpho .= qq( <f name="upos">$upos</f>\n);
      }
      # A lemma of "_" is only kept when the token itself is "_".
      $morpho .= qq( <f name="lemma">$parsed[2]</f>\n) if ($parsed[2] ne "_" || $parsed[1] eq '_');
      $morpho .= qq( <f name="msd">$parsed[5]</f>\n) if ($parsed[5] ne "_");
      if ($parsed[9] ne "_") {
        # NOTE(review): the pattern is unanchored, so any MISC value that
        # contains a digit, "." or "e" is written as certainty - confirm
        # that this is intended.
        if ($parsed[9] =~ /[0-9.e]+/) {
          $morpho .= qq( <f name="certainty">$parsed[9]</f>\n)
        }
        else {
          $morpho .= qq( <f name="misc">$parsed[9]</f>\n)
        }
      }
      $morpho .= qq( </fs>\n)
               . qq(</f>\n)
               . qq(</fs>\n)
               . qq(</span>\n);
      $i++;
    }
  }

  # Flush the last document of this input.
  $current .= "\n";
  closeDoc(1);
  close($fh);

  # Forget the flushed document, so that the next input neither writes it a
  # second time nor attaches its tokens to it.
  $first = 1;
  $docid = "";
  $morpho_file = $parser_file = undef;
  @spansFrom = ();
  @spansTo   = ();
}

# Finish the archive only after all inputs were converted; closing it per
# input would leave later inputs with a closed zip object.
if ($zip) {
  $zip->close() or die "ERROR: closing zip failed: $ZipError\n";
}
exit;
| 218 | |
# Start a new member named $fname in the output zip.
#
# The first call creates the archive on $outh; every later call opens a
# further member in the existing archive.  Dies when the zip layer reports
# a failure.  Returns the zip object.
sub newZipStream {
  my ($fname) = @_;

  # Options shared by the first and all following members.  ExtAttr is the
  # mode 0100666 shifted into the upper 16 bits.
  my %member_opts = (
    Zip64    => 1,
    TextFlag => 1,
    Method   => $_COMPRESSION_METHOD,
    Name     => "$fname",
    ExtAttr  => 0100666 << 16
  );

  if (defined $zip) {
    $zip->newStream(%member_opts, Append => 1)
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  } else {
    $zip = IO::Compress::Zip->new($outh, %member_opts, Append => 0)
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  }
  return $zip;
}
| 231 | |
# Write the layers collected for the current document to the output zip.
#
# The morphology layer is written whenever it is enabled and a member name
# is known.  The dependency layer is written only if a relation was seen,
# and the flag is cleared afterwards.  The argument passed by the callers is
# not used.  Dies when a write to the zip fails.
sub closeDoc {
  if ($write_morpho && $morpho_file) {
    newZipStream($morpho_file);
    $zip->print($morpho, qq( </spanList>\n</layer>\n))
      or die "ERROR ('$morpho_file'): zip failed: $ZipError\n";
  }
  if ($write_syntax && $parser_file) {
    $write_syntax = 0;
    newZipStream($parser_file);
    $zip->print($parse, qq(</spanList>\n</layer>\n))
      or die "ERROR ('$parser_file'): zip failed: $ZipError\n";
  }
  return;
}
| 243 | |
# Build the opening part of a KorAP-XML layer file for document $docid:
# XML declaration, schema reference, the <layer> element carrying the docid
# and the opening <spanList>.  The caller appends the spans and closes both
# elements.
sub layer_header {
  my ($docid) = @_;
  my @header_lines = (
    qq(<?xml version="1.0" encoding="UTF-8"?>),
    qq(<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>),
    qq(<layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">),
    qq(<spanList>),
  );
  # Every line, including the last one, ends with a newline.
  return join('', map { "$_\n" } @header_lines);
}
| 252 | |
| 253 | =pod |
| 254 | |
| 255 | =encoding utf8 |
| 256 | |
| 257 | =head1 NAME |
| 258 | |
| 259 | conllu2korapxml - Conversion of KorAP-XML CoNLL-U to KorAP-XML zips |
| 260 | |
| 261 | =head1 SYNOPSIS |
| 262 | |
| 263 | conllu2korapxml < zca15.tree_tagger.conllu > zca15.tree_tagger.zip |
| 264 | |
| 265 | =head1 DESCRIPTION |
| 266 | |
| 267 | C<conllu2korapxml> converts CoNLL-U files that follow KorAP-specific comment conventions |
| 268 | and contain morphosyntactic and/or dependency annotations to |
| 269 | corresponding KorAP-XML zip files. |
| 270 | |
| 271 | =head1 INSTALLATION |
| 272 | |
| 273 | $ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git |
| 274 | |
| 275 | =head1 OPTIONS |
| 276 | |
| 277 | =over 2 |
| 278 | |
| 279 | =item B<--force-foundry|-f> |
| 280 | |
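Set foundry name and ignore foundry names in the input.
To write the dependency layer to a different foundry than the
morphological layer, append C<dependency:NAME> after a space,
e.g. C<-f "tree_tagger dependency:malt">.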
| 282 | |
Marc Kupietz | dd546a8 | 2024-03-22 16:30:09 +0100 | [diff] [blame] | 283 | |
Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame] | 284 | =item B<--help|-h> |
| 285 | |
| 286 | Print help information. |
| 287 | |
| 288 | =item B<--version|-v> |
| 289 | |
| 290 | Print version information. |
| 291 | |
| 292 | |
| 293 | =item B<--log|-l> |
| 294 | |
| 295 | Loglevel for I<Log::Any>. Defaults to C<warn>. |
| 296 | |
| 297 | =back |
| 298 | |
| 299 | =head1 EXAMPLES |
| 300 | |
| 301 | conllu2korapxml -f tree_tagger < t/data/wdf19.morpho.conllu > wdf19.tree_tagger.zip |
| 302 | |
Marc Kupietz | dd546a8 | 2024-03-22 16:30:09 +0100 | [diff] [blame] | 303 | conllu2korapxml -f "tree_tagger dependency:malt" < t/data/wdf19.tt-malt.conllu > wdf19.tree_tagger.zip |
| 304 | |
Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame] | 305 | =head1 COPYRIGHT AND LICENSE |
| 306 | |
Akron | 249fc83 | 2024-06-04 16:36:44 +0200 | [diff] [blame^] | 307 | Copyright (C) 2021-2024, L<IDS Mannheim|https://www.ids-mannheim.de/> |
Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame] | 308 | |
| 309 | Author: Marc Kupietz |
| 310 | |
| 311 | Contributors: Nils Diewald |
| 312 | |
L<KorAP::XML::CoNLL-U> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
| 314 | Corpus Analysis Platform at the |
| 315 | L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>, |
| 316 | member of the |
| 317 | L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>. |
| 318 | |
| 319 | This program is free software published under the |
| 320 | L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>. |