#!/usr/bin/env perl
#
# Converts CoNLL-U input carrying KorAP-specific comment headers into a
# KorAP-XML zip archive that is written to STDOUT.
#
# Header lines recognised by the main loop (either "# key = value" or the
# numeric "0.N key = value" form):
#   foundry                 directory name used for the output layer files
#   filename                path of the source file; starts a new document
#   <key ending in "id">    document id, written into each layer header
#   start_offsets | from    whitespace separated offsets, indexed by token ID
#   end_offsets   | to      whitespace separated offsets, indexed by token ID
# Index 0 of the offset lists is what a HEAD value of 0 refers to.
use strict;
use warnings;
use POSIX;
use Getopt::Std;
use Encode;
use IO::Compress::Zip qw(zip $ZipError :constants);
use File::Basename;

my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;
my %opts;
my %processedFilenames;

our $VERSION = '0.4.1.9000';

my $usage=<<EOF;
Usage: $0 [options] [CoNLL-U-FILE...]

Options:
 -d        debug
 -h        print this help
Description:
 Converts CoNLL-U files that follow KorAP-specific comment conventions
 and contain morphosyntactic and/or dependency annotations to
 corresponding KorAP-XML zip files.

Examples:
 $0 zca20.spacy.conllu > zca20.spacy.zip

 $0 < zca20.spacy.conllu > zca20.spacy.zip
EOF


getopts('hd', \%opts);
die $usage if($opts{h});
my $debug=($opts{d}? 1 : 0);

# State shared with newZipStream() and closeDoc().
my $docid="";
my $zip = undef;          # IO::Compress::Zip object, created on first member
my $outh = \*STDOUT;
my $parser_file;          # member name of the dependency layer
my $parse;                # accumulated dependency layer XML
my $morpho_file;          # member name of the morpho layer
my $morpho;               # accumulated morpho layer XML
my @spansFrom;
my @spansTo;

my ($write_morpho, $write_syntax) = (1, 0);
my $filename;
my $foundry_name;

# True from the id header (which initialises $morpho/$parse and the member
# names) until closeDoc() has written the document.  Guards against writing
# a document twice or writing one that was never started.
my $doc_open = 0;

# Matches a plain decimal or exponent-notation number, nothing else.
my $number_re = qr/^[+-]?(?:\d+\.?\d*|\.\d+)(?:[eE][+-]?\d+)?$/;

my @conllu_files = @ARGV;
push @conllu_files, "-" if (@conllu_files == 0);

foreach my $conllu_file (@conllu_files) {
  my $fh;
  if ($conllu_file eq '-') {
    $fh = \*STDIN;
  } else {
    open($fh, "<", $conllu_file) or die "Cannot open $conllu_file: $!";
  }
  my $s = 0;                # sentence counter, incremented at every token ID 1
  my $lastDocSigle = "";
  while (my $line = <$fh>) {
    # Strip the line terminator up front (also CRLF) so that it can never end
    # up inside a captured header value or the last CoNLL-U column.
    $line =~ s/\r?\n\z//;

    if ($line =~ /^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
      $filename = $1;
      if ($doc_open) {
        closeDoc(0);
        $doc_open = 0;
      }
      if ($processedFilenames{$filename}) {
        print STDERR "WARNING: $filename is already processed\n";
      }
      $processedFilenames{$filename} = 1;
    } elsif ($line =~ /^#\s*foundry\s*[:=]\s*(.*)/) {
      $foundry_name = $1;
      print STDERR "Foundry: $foundry_name\n" if($debug);
    } elsif ($line =~ /^(?:#|0\.2)\s+\S*id\s*[:=]\s*(.*)/) {
      # \S*id (instead of .*id) keeps the key a single word, so free text
      # such as "# text = he said: ..." is not mistaken for an id header.
      $docid = $1;
      (my $docSigle = $docid) =~ s/\..*//;
      if ($docSigle ne $lastDocSigle) {
        print STDERR "Analyzing $docSigle\n";
        $lastDocSigle = $docSigle;
      }
      # Output members live next to the directory of the source file:
      # <dirname(dirname(filename))>/<foundry>/{morpho,dependency}.xml
      $parser_file = dirname($filename);
      $parser_file =~ s@(.*)/[^/]+$@$1@;
      $morpho_file = $parser_file;
      $morpho_file .= "/$foundry_name/morpho.xml";
      $parser_file .= "/$foundry_name/dependency.xml";
      $parse = $morpho = layer_header($docid);
      $doc_open = 1;
    } elsif ($line =~ /^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
      @spansFrom = split(/\s+/, $1);
    } elsif ($line =~ /^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
      @spansTo = split(/\s+/, $1);
    } elsif ($line =~ /^#/) {
      # Any other comment line carries no token data.
      next;
    } elsif ($line !~ /^\s*$/) {
      # Limit -1 keeps trailing empty columns, so the count check is exact.
      my @parsed = split(/\t/, $line, -1);
      if (@parsed != 10 || $parsed[0] !~ /^\d+$/) {
        print STDERR "WARNING: skipping strange parser output line in $docid\n";
        next;
      }
      my $t = $parsed[0];
      $s++ if ($t == 1);

      if (!defined $spansFrom[$t] || !defined $spansTo[$t]) {
        print STDERR "WARNING: no offsets for token $t in $docid, skipping it\n";
        next;
      }

      # NOTE(review): a relation label containing "_" anywhere is treated as
      # "no relation" - confirm that this is intended rather than ne "_".
      if ($parsed[6] =~ /^\d+$/ && $parsed[7] !~ /_/
          && defined $spansFrom[$parsed[6]] && defined $spansTo[$parsed[6]]) {
        $write_syntax = 1;
        my $from = $spansFrom[$parsed[6]];
        my $to = $spansTo[$parsed[6]];
        $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
<rel label="$parsed[7]">
<span from="$from" to="$to"/>
</rel>
</span>
@;
      }

      # NOTE(review): column values are inserted verbatim; this assumes they
      # are already XML-safe - TODO confirm against the producing tool.
      $morpho .= qq(  <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
   <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
    <f name="lex">
     <fs>
      <f name="lemma">$parsed[2]</f>
      <f name="pos">$parsed[3]</f>
);
      $morpho .= qq(      <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_");
      if ($parsed[9] ne "_") {
        # Only a value that is a number as a whole counts as a certainty.
        if ($parsed[9] =~ $number_re) {
          $morpho .= qq(      <f name="certainty">$parsed[9]</f>\n)
        }
        else {
          $morpho .= qq(      <f name="misc">$parsed[9]</f>\n)
        }
      }
      $morpho .= qq(     </fs>
    </f>
   </fs>
  </span>
);
    }
  }
  # Flush the last document of this input exactly once.
  if ($doc_open) {
    closeDoc(1);
    $doc_open = 0;
  }
  close($fh);
}

# The archive is shared by all inputs, so it is finished only after the last.
if (defined $zip) {
  $zip->close() or die "ERROR: closing zip failed: $ZipError\n";
} else {
  print STDERR "WARNING: no document found in input, nothing written\n";
}
exit;

 # Starts a new archive member named $fname on the shared zip stream.
 #
 # The first call creates the IO::Compress::Zip object on $outh and stores
 # it in $zip; every later call opens a further member on that object.
 # Dies with $ZipError if the member cannot be started.
 # Returns the zip object.
 sub newZipStream {
   my ($fname) = @_;

   # Identical settings for the first and all following members.
   my @member_opts = (
     Zip64    => 1,
     TextFlag => 1,
     Method   => $_COMPRESSION_METHOD,
     Append   => 1,
     Name     => $fname,
   );

   if (defined $zip) {
     $zip->newStream(@member_opts)
       or die "ERROR ('$fname'): zip failed: $ZipError\n";
   } else {
     # Direct method call instead of the indirect-object form "new CLASS ...".
     $zip = IO::Compress::Zip->new($outh, @member_opts)
       or die "ERROR ('$fname'): zip failed: $ZipError\n";
   }
   return $zip;
 }

 # Writes the layers collected for the current document into the archive.
 #
 # The morpho layer is written whenever $write_morpho is set; the dependency
 # layer only if $write_syntax was set while reading the document, and the
 # flag is cleared again for the next document.
 # Callers pass a 0/1 argument; it is not read here.
 # Dies if a member cannot be started or written.
 sub closeDoc {
   if ($write_morpho) {
     _write_layer($morpho_file, $morpho, qq( </spanList>\n</layer>\n));
   }
   if ($write_syntax) {
     $write_syntax = 0;
     _write_layer($parser_file, $parse, qq(</spanList>\n</layer>\n));
   }
   return;
 }

 # Opens the member $member and writes $content followed by $closing.
 # A failed write is fatal; otherwise the archive would be truncated silently.
 sub _write_layer {
   my ($member, $content, $closing) = @_;
   newZipStream($member);
   $zip->print($content, $closing)
     or die "ERROR ('$member'): writing to zip failed: $ZipError\n";
   return;
 }

 # Returns the opening part of a KorAP-XML layer file for document $docid:
 # XML declaration, xml-model instruction, the <layer> start tag carrying the
 # document id, and the <spanList> start tag, each terminated by a newline.
 sub layer_header {
   my ($docid) = @_;
   my @header_lines = (
     q(<?xml version="1.0" encoding="UTF-8"?>),
     q(<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>),
     qq(<layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">),
     q(<spanList>),
   );
   return join("\n", @header_lines) . "\n";
 }