| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
 | 2 | use strict; | 
 | 3 | use warnings; | 
 | 4 | use POSIX; | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 5 | use Getopt::Long qw(GetOptions :config no_auto_abbrev); | 
 | 6 | use Log::Any '$log'; | 
 | 7 | use Log::Any::Adapter; | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 8 | use Encode; | 
 | 9 | use IO::Compress::Zip qw(zip $ZipError :constants); | 
 | 10 | use File::Basename; | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 11 | use Pod::Usage; | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 12 |  | 
 | 13 | my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE; | 
 | 14 | my %opts; | 
 | 15 | my %processedFilenames; | 
 | 16 |  | 
| Marc Kupietz | 4cc243a | 2021-10-11 17:15:16 +0200 | [diff] [blame] | 17 | our $VERSION = '0.4.1.9000'; | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 18 | our $VERSION_MSG = "\nconllu2korapxml - v$VERSION\n"; | 
| Marc Kupietz | 4cc243a | 2021-10-11 17:15:16 +0200 | [diff] [blame] | 19 |  | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 20 | use constant { | 
 | 21 |     # Set to 1 for minimal more debug output (no need to be parametrized) | 
 | 22 |     DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0 | 
 | 23 | }; | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 24 |  | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 25 | GetOptions( | 
 | 26 |     'force-foundry|f=s'            => \(my $foundry_name = ''), | 
 | 27 |     'log|l=s'                      => \(my $log_level = 'warn'), | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 28 |  | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 29 |     'help|h'                       => sub { | 
 | 30 |       pod2usage( | 
 | 31 |           -verbose  => 99, | 
 | 32 |           -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS|EXAMPLES', | 
 | 33 |           -msg      => $VERSION_MSG, | 
 | 34 |           -output   => '-' | 
 | 35 |       ) | 
 | 36 |     }, | 
 | 37 |     'version|v'                    => sub { | 
 | 38 |       pod2usage( | 
 | 39 |           -verbose => 0, | 
 | 40 |           -msg     => $VERSION_MSG, | 
 | 41 |           -output  => '-' | 
 | 42 |       ); | 
 | 43 |     } | 
 | 44 | ); | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 45 |  | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 46 | # Establish logger | 
 | 47 | binmode(STDERR, ':encoding(UTF-8)'); | 
 | 48 | Log::Any::Adapter->set('Stderr', log_level => $log_level); | 
 | 49 | $log->notice('Debugging is activated') if DEBUG; | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 50 |  | 
 | 51 | my $docid=""; | 
 | 52 | my $zip = undef; | 
 | 53 | my $outh = \*STDOUT; | 
 | 54 | my $parser_file; | 
 | 55 | my $parse; | 
 | 56 | my $morpho_file; | 
 | 57 | my $morpho; | 
 | 58 | my @spansFrom; | 
 | 59 | my @spansTo; | 
 | 60 | my $current; | 
 | 61 | my ($unknown, $known) = (0, 0); | 
 | 62 |  | 
 | 63 | my ($write_morpho, $write_syntax, $base) = (1, 0, 0); | 
 | 64 | my $filename; | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 65 | my $first=1; | 
 | 66 | my @conllu_files = @ARGV; | 
 | 67 | push @conllu_files, "-" if (@conllu_files == 0); | 
 | 68 | my $fh; | 
 | 69 | foreach my $conllu_file (@conllu_files) { | 
 | 70 |   if ($conllu_file eq '-') { | 
 | 71 |     $fh = \*STDIN; | 
 | 72 |   } else { | 
 | 73 |     open($fh, "<", $conllu_file) or die "Cannot open $conllu_file"; | 
 | 74 |   } | 
 | 75 |   my $i=0; my $s=0; my $first_in_sentence=0; | 
 | 76 |   my $lastDocSigle=""; | 
 | 77 |   while (<$fh>) { | 
 | 78 |     if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) { | 
 | 79 |       $filename=$1; | 
 | 80 |       if(!$first) { | 
 | 81 |         closeDoc(0); | 
 | 82 |       } else { | 
 | 83 |         $first=0; | 
 | 84 |       } | 
 | 85 |       if($processedFilenames{$filename}) { | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 86 |         $log->warn("WARNING: $filename is already processed"); | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 87 |       } | 
 | 88 |       $processedFilenames{$filename}=1; | 
 | 89 |       $i=0; | 
 | 90 |     } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) { | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 91 |       if(!$foundry_name) { | 
 | 92 |         $foundry_name = $1; | 
 | 93 |         $log->debug("Foundry: $foundry_name\n"); | 
 | 94 |       } | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 95 |     } elsif(/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) { | 
 | 96 |       $docid=$1; | 
 | 97 |       my $docSigle = $docid; | 
 | 98 |       $docSigle =~ s/\..*//; | 
 | 99 |       if($docSigle ne $lastDocSigle) { | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 100 |         $log->info("Analyzing $docSigle"); | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 101 |         $lastDocSigle = $docSigle; | 
 | 102 |       } | 
 | 103 |       $known=$unknown=0; | 
 | 104 |       $current=""; | 
 | 105 |       $parser_file = dirname($filename); | 
 | 106 |       $parser_file =~ s@(.*)/[^/]+$@$1@; | 
 | 107 |       $morpho_file = $parser_file; | 
 | 108 |       $morpho_file .= "/$foundry_name/morpho.xml"; | 
 | 109 |       $parser_file .= "/$foundry_name/dependency.xml"; | 
 | 110 |       $parse = $morpho = layer_header($docid); | 
 | 111 |     }  elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) { | 
 | 112 |       @spansFrom = split(/\s+/, $1); | 
 | 113 |     }  elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s+[:=]\s*(.*)/) { | 
 | 114 |       @spansTo = split(/\s+/, $1); | 
 | 115 |     } elsif (! /^\s*$/) { | 
 | 116 |       my @parsed=split('\t'); | 
 | 117 |       chomp  $parsed[9]; | 
 | 118 |       if(@parsed != 10) { | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 119 |         $log->warn("WARNING: skipping strange parser output line in $docid"); | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 120 |         $i++; | 
 | 121 |         next; | 
 | 122 |       } | 
 | 123 |       my $t=$parsed[0]; | 
 | 124 |       if($t == 1) { | 
 | 125 |         $s++; | 
 | 126 |         $first_in_sentence = $i; | 
 | 127 |       } | 
 | 128 |       if($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) { | 
 | 129 |         $write_syntax=1; | 
 | 130 |         my $from=$spansFrom[$parsed[6]]; | 
 | 131 |         my $to=$spansTo[$parsed[6]]; | 
 | 132 |           $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]"> | 
 | 133 | <rel label="$parsed[7]"> | 
 | 134 | <span from="$from" to="$to"/> | 
 | 135 | </rel> | 
 | 136 | </span> | 
 | 137 | @; | 
 | 138 |         } | 
 | 139 |         $morpho .= qq(  <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]"> | 
 | 140 |    <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0"> | 
 | 141 |     <f name="lex"> | 
 | 142 |      <fs> | 
| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 143 |       <f name="pos">$parsed[3]</f> | 
 | 144 | ); | 
 | 145 |       $morpho .= qq(      <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_"); | 
 | 146 |       if($parsed[9] ne "_") { | 
 | 147 |         if ($parsed[9] =~ /[0-9.e]+/) { | 
 | 148 |           $morpho .= qq(      <f name="certainty">$parsed[9]</f>\n) | 
 | 149 |         } | 
 | 150 |         else { | 
 | 151 |           $morpho .= qq(      <f name="misc">$parsed[9]</f>\n) | 
 | 152 |         } | 
 | 153 |       } | 
 | 154 |       $morpho .= qq(     </fs> | 
 | 155 |     </f> | 
 | 156 |    </fs> | 
 | 157 |   </span> | 
 | 158 | ); | 
 | 159 |         $i++; | 
 | 160 |     } | 
 | 161 |   } | 
 | 162 |   $current .= "\n"; | 
 | 163 |   closeDoc(1); | 
 | 164 |   $zip->close(); | 
 | 165 |   close($fh); | 
 | 166 | } | 
 | 167 | exit; | 
 | 168 |  | 
 | 169 | sub newZipStream { | 
 | 170 |   my ($fname) = @_; | 
 | 171 |   if (defined $zip) { | 
 | 172 |     $zip->newStream(Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, | 
 | 173 |         Append            => 1, Name => $fname) | 
 | 174 |         or die "ERROR ('$fname'): zip failed: $ZipError\n"; | 
 | 175 |   } else { | 
 | 176 |     $zip = new IO::Compress::Zip $outh, Zip64 => 1, TextFlag => 1, | 
 | 177 |         Method => $_COMPRESSION_METHOD, Append => 1, Name => "$fname" | 
 | 178 |         or die "ERROR ('$fname'): zip failed: $ZipError\n"; | 
 | 179 |   } | 
 | 180 | } | 
 | 181 |  | 
 | 182 | sub closeDoc { | 
 | 183 |   if ($write_morpho) { | 
 | 184 |     newZipStream($morpho_file); | 
 | 185 |     $zip->print($morpho, qq( </spanList>\n</layer>\n)); | 
 | 186 |   } | 
 | 187 |   if ($write_syntax) { | 
 | 188 |     $write_syntax = 0; | 
 | 189 |     newZipStream($parser_file); | 
 | 190 |     $zip->print($parse, qq(</spanList>\n</layer>\n)); | 
 | 191 |   } | 
 | 192 | } | 
 | 193 |  | 
 | 194 | sub layer_header { | 
 | 195 |   my ($docid) = @_; | 
 | 196 |   return(qq(<?xml version="1.0" encoding="UTF-8"?> | 
 | 197 | <?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?> | 
 | 198 | <layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4"> | 
 | 199 | <spanList> | 
 | 200 | )); | 
| Marc Kupietz | aeb84a0 | 2021-10-11 17:57:29 +0200 | [diff] [blame^] | 201 | } | 
 | 202 |  | 
 | 203 | =pod | 
 | 204 |  | 
 | 205 | =encoding utf8 | 
 | 206 |  | 
 | 207 | =head1 NAME | 
 | 208 |  | 
 | 209 | conllu2korapxml - Conversion of KorAP-XML CoNLL-U to KorAP-XML zips | 
 | 210 |  | 
 | 211 | =head1 SYNOPSIS | 
 | 212 |  | 
 | 213 |   conllu2korapxml < zca15.tree_tagger.conllu > zca15.tree_tagger.zip | 
 | 214 |  | 
 | 215 | =head1 DESCRIPTION | 
 | 216 |  | 
C<conllu2korapxml> converts CoNLL-U files that follow KorAP-specific comment
conventions and contain morphosyntactic and/or dependency annotations into the
corresponding KorAP-XML zip files.
 | 220 |  | 
 | 221 | =head1 INSTALLATION | 
 | 222 |  | 
 | 223 |   $ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git | 
 | 224 |  | 
 | 225 | =head1 OPTIONS | 
 | 226 |  | 
 | 227 | =over 2 | 
 | 228 |  | 
=item B<--force-foundry|-f> I<name>

Use I<name> as the foundry name and ignore any foundry names given in the input.
 | 232 |  | 
 | 233 | =item B<--help|-h> | 
 | 234 |  | 
 | 235 | Print help information. | 
 | 236 |  | 
 | 237 | =item B<--version|-v> | 
 | 238 |  | 
 | 239 | Print version information. | 
 | 240 |  | 
 | 241 |  | 
=item B<--log|-l> I<level>

Log level for I<Log::Any>. Defaults to C<warn>.
 | 245 |  | 
 | 246 | =back | 
 | 247 |  | 
 | 248 | =head1 EXAMPLES | 
 | 249 |  | 
 | 250 |  conllu2korapxml -f tree_tagger < t/data/wdf19.morpho.conllu > wdf19.tree_tagger.zip | 
 | 251 |  | 
 | 252 | =head1 COPYRIGHT AND LICENSE | 
 | 253 |  | 
 | 254 | Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/> | 
 | 255 |  | 
 | 256 | Author: Marc Kupietz | 
 | 257 |  | 
 | 258 | Contributors: Nils Diewald | 
 | 259 |  | 
L<KorAP::XML::CoNLL-U> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
 | 261 | Corpus Analysis Platform at the | 
 | 262 | L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>, | 
 | 263 | member of the | 
 | 264 | L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>. | 
 | 265 |  | 
 | 266 | This program is free software published under the | 
 | 267 | L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>. |