| Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame] | 1 | #!/usr/bin/env perl | 
 | 2 | use strict; | 
 | 3 | use warnings; | 
 | 4 | use POSIX; | 
 | 5 | use Getopt::Std; | 
 | 6 | use Encode; | 
 | 7 | use IO::Compress::Zip qw(zip $ZipError :constants); | 
 | 8 | use File::Basename; | 
 | 9 |  | 
 | 10 | my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE; | 
 | 11 | my %opts; | 
 | 12 | my %processedFilenames; | 
 | 13 |  | 
 | 14 | my $usage=<<EOF; | 
 | 15 | Usage: $0 [options] [CoNLL-U-FILE...] | 
 | 16 |  | 
 | 17 | Options: | 
 | 18 |  -d        debug | 
 | 19 | Description: | 
 | 20 |  Converts CoNLL-U files that follow KorAP-specific comment conventions | 
 | 21 |  and contain morphosyntactic and/or dependency annotations to | 
 | 22 |  corresponding KorAP-XML zip files. | 
 | 23 |  | 
 | 24 | Examples: | 
 | 25 |  $0 zca20.spacy.conllu > zca20.spacy.zip | 
 | 26 |  | 
 | 27 |  $0 < zca20.spacy.conllu > zca20.spacy.zip | 
 | 28 | EOF | 
 | 29 |  | 
 | 30 |  | 
 | 31 | getopts('hd', \%opts); | 
 | 32 | die $usage if($opts{h}); | 
 | 33 | my $debug=($opts{d}? 1 : 0); | 
 | 34 |  | 
 | 35 | my $docid=""; | 
 | 36 | my $zip = undef; | 
 | 37 | my $outh = \*STDOUT; | 
 | 38 | my $parser_file; | 
 | 39 | my $parse; | 
 | 40 | my $morpho_file; | 
 | 41 | my $morpho; | 
 | 42 | my @spansFrom; | 
 | 43 | my @spansTo; | 
 | 44 | my $current; | 
 | 45 | my ($unknown, $known) = (0, 0); | 
 | 46 |  | 
 | 47 | my ($write_morpho, $write_syntax, $base) = (1, 0, 0); | 
 | 48 | my $filename; | 
 | 49 | my $foundry_name; | 
 | 50 | my $first=1; | 
 | 51 | my @conllu_files = @ARGV; | 
 | 52 | push @conllu_files, "-" if (@conllu_files == 0); | 
 | 53 | my $fh; | 
 | 54 | foreach my $conllu_file (@conllu_files) { | 
 | 55 |   if ($conllu_file eq '-') { | 
 | 56 |     $fh = \*STDIN; | 
 | 57 |   } else { | 
 | 58 |     open($fh, "<", $conllu_file) or die "Cannot open $conllu_file"; | 
 | 59 |   } | 
 | 60 |   my $i=0; my $s=0; my $first_in_sentence=0; | 
 | 61 |   my $lastDocSigle=""; | 
 | 62 |   while (<$fh>) { | 
 | 63 |     if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) { | 
 | 64 |       $filename=$1; | 
 | 65 |       if(!$first) { | 
 | 66 |         closeDoc(0); | 
 | 67 |       } else { | 
 | 68 |         $first=0; | 
 | 69 |       } | 
 | 70 |       if($processedFilenames{$filename}) { | 
 | 71 |         print STDERR "WARNING: $filename is already processed\n"; | 
 | 72 |       } | 
 | 73 |       $processedFilenames{$filename}=1; | 
 | 74 |       $i=0; | 
 | 75 |     } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) { | 
 | 76 |       $foundry_name=$1; | 
 | 77 |       print STDERR "Foundry: $foundry_name\n" if($debug); | 
 | 78 |     } elsif(/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) { | 
 | 79 |       $docid=$1; | 
 | 80 |       my $docSigle = $docid; | 
 | 81 |       $docSigle =~ s/\..*//; | 
 | 82 |       if($docSigle ne $lastDocSigle) { | 
 | 83 |         print STDERR "Analyzing $docSigle\n"; | 
 | 84 |         $lastDocSigle = $docSigle; | 
 | 85 |       } | 
 | 86 |       $known=$unknown=0; | 
 | 87 |       $current=""; | 
 | 88 |       $parser_file = dirname($filename); | 
 | 89 |       $parser_file =~ s@(.*)/[^/]+$@$1@; | 
 | 90 |       $morpho_file = $parser_file; | 
 | 91 |       $morpho_file .= "/$foundry_name/morpho.xml"; | 
 | 92 |       $parser_file .= "/$foundry_name/dependency.xml"; | 
 | 93 |       $parse = $morpho = layer_header($docid); | 
 | 94 |     }  elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) { | 
 | 95 |       @spansFrom = split(/\s+/, $1); | 
 | 96 |     }  elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s+[:=]\s*(.*)/) { | 
 | 97 |       @spansTo = split(/\s+/, $1); | 
 | 98 |     } elsif (! /^\s*$/) { | 
 | 99 |       my @parsed=split('\t'); | 
 | 100 |       chomp  $parsed[9]; | 
 | 101 |       if(@parsed != 10) { | 
 | 102 |         print STDERR "WARNING: skipping strange parser output line in $docid\n"; | 
 | 103 |         $i++; | 
 | 104 |         next; | 
 | 105 |       } | 
 | 106 |       my $t=$parsed[0]; | 
 | 107 |       if($t == 1) { | 
 | 108 |         $s++; | 
 | 109 |         $first_in_sentence = $i; | 
 | 110 |       } | 
 | 111 |       if($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) { | 
 | 112 |         $write_syntax=1; | 
 | 113 |         my $from=$spansFrom[$parsed[6]]; | 
 | 114 |         my $to=$spansTo[$parsed[6]]; | 
 | 115 |           $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]"> | 
 | 116 | <rel label="$parsed[7]"> | 
 | 117 | <span from="$from" to="$to"/> | 
 | 118 | </rel> | 
 | 119 | </span> | 
 | 120 | @; | 
 | 121 |         } | 
 | 122 |         $morpho .= qq(  <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]"> | 
 | 123 |    <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0"> | 
 | 124 |     <f name="lex"> | 
 | 125 |      <fs> | 
 | 126 |       <f name="lemma">$parsed[2]</f> | 
 | 127 |       <f name="pos">$parsed[3]</f> | 
 | 128 | ); | 
 | 129 |       $morpho .= qq(      <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_"); | 
 | 130 |       if($parsed[9] ne "_") { | 
 | 131 |         if ($parsed[9] =~ /[0-9.e]+/) { | 
 | 132 |           $morpho .= qq(      <f name="certainty">$parsed[9]</f>\n) | 
 | 133 |         } | 
 | 134 |         else { | 
 | 135 |           $morpho .= qq(      <f name="misc">$parsed[9]</f>\n) | 
 | 136 |         } | 
 | 137 |       } | 
 | 138 |       $morpho .= qq(     </fs> | 
 | 139 |     </f> | 
 | 140 |    </fs> | 
 | 141 |   </span> | 
 | 142 | ); | 
 | 143 |         $i++; | 
 | 144 |     } | 
 | 145 |   } | 
 | 146 |   $current .= "\n"; | 
 | 147 |   closeDoc(1); | 
 | 148 |   $zip->close(); | 
 | 149 |   close($fh); | 
 | 150 | } | 
 | 151 | exit; | 
 | 152 |  | 
 | 153 | sub newZipStream { | 
 | 154 |   my ($fname) = @_; | 
 | 155 |   if (defined $zip) { | 
 | 156 |     $zip->newStream(Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, | 
 | 157 |         Append            => 1, Name => $fname) | 
 | 158 |         or die "ERROR ('$fname'): zip failed: $ZipError\n"; | 
 | 159 |   } else { | 
 | 160 |     $zip = new IO::Compress::Zip $outh, Zip64 => 1, TextFlag => 1, | 
 | 161 |         Method => $_COMPRESSION_METHOD, Append => 1, Name => "$fname" | 
 | 162 |         or die "ERROR ('$fname'): zip failed: $ZipError\n"; | 
 | 163 |   } | 
 | 164 | } | 
 | 165 |  | 
 | 166 | sub closeDoc { | 
 | 167 |   if ($write_morpho) { | 
 | 168 |     newZipStream($morpho_file); | 
 | 169 |     $zip->print($morpho, qq( </spanList>\n</layer>\n)); | 
 | 170 |   } | 
 | 171 |   if ($write_syntax) { | 
 | 172 |     $write_syntax = 0; | 
 | 173 |     newZipStream($parser_file); | 
 | 174 |     $zip->print($parse, qq(</spanList>\n</layer>\n)); | 
 | 175 |   } | 
 | 176 | } | 
 | 177 |  | 
 | 178 | sub layer_header { | 
 | 179 |   my ($docid) = @_; | 
 | 180 |   return(qq(<?xml version="1.0" encoding="UTF-8"?> | 
 | 181 | <?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?> | 
 | 182 | <layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4"> | 
 | 183 | <spanList> | 
 | 184 | )); | 
 | 185 | } |