Marc Kupietz | 79ba1e5 | 2021-02-12 17:26:54 +0100 | [diff] [blame^] | 1 | #!/usr/bin/env perl |
| 2 | use strict; |
| 3 | use warnings; |
| 4 | use POSIX; |
| 5 | use Getopt::Std; |
| 6 | use Encode; |
| 7 | use IO::Compress::Zip qw(zip $ZipError :constants); |
| 8 | use File::Basename; |
| 9 | |
# Compression method applied to every member written to the output zip.
my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;

# Command-line switches as filled in by getopts().
my %opts;

# Filenames already announced by a "filename" comment line; used to warn
# about duplicates.
my %processedFilenames;

my $usage = <<EOF;
Usage: $0 [options] [CoNLL-U-FILE...]

Options:
-d debug
-h print this help text
Description:
Converts CoNLL-U files that follow KorAP-specific comment conventions
and contain morphosyntactic and/or dependency annotations to
corresponding KorAP-XML zip files.

Examples:
$0 zca20.spacy.conllu > zca20.spacy.zip

$0 < zca20.spacy.conllu > zca20.spacy.zip
EOF

# getopts() returns false after reporting an unknown switch; stop with the
# usage text instead of silently continuing with a half-parsed command line.
getopts('hd', \%opts) or die $usage;
die $usage if ($opts{h});
my $debug = ($opts{d} ? 1 : 0);
| 34 | |
# --- Output side --------------------------------------------------------
my $zip;                 # IO::Compress::Zip object, created on first use
my $outh = \*STDOUT;     # handle the zip archive is written to

# --- State of the document currently being converted --------------------
my $docid = "";          # value of the most recent "...id" comment line
my $parser_file;         # zip member name for the dependency layer
my $parse;               # accumulated dependency layer XML
my $morpho_file;         # zip member name for the morpho layer
my $morpho;              # accumulated morpho layer XML
my @spansFrom;           # start offsets, indexed by the token ID column
my @spansTo;             # end offsets, indexed by the token ID column
my $current;
my $unknown = 0;
my $known   = 0;

# --- Layer switches -----------------------------------------------------
my $write_morpho = 1;    # morpho layer is always written
my $write_syntax = 0;    # set once a dependency relation has been seen
my $base         = 0;    # NOTE(review): not referenced in the visible code

my $filename;            # value of the most recent "filename" comment line
my $foundry_name;        # value of the most recent "foundry" comment line
my $first = 1;           # true until the first "filename" line is seen

# --- Inputs: files named on the command line, or STDIN ("-") if none ----
my @conllu_files = @ARGV ? @ARGV : ("-");
my $fh;
# Main conversion loop.
#
# Reads every input in @conllu_files ("-" means STDIN) line by line.
# Comment lines carry the document metadata (filename, foundry, id, token
# offsets); every other non-blank line is expected to be a 10-column
# CoNLL-U token line, which is appended to the morpho layer and, when it
# carries a head and a relation label, to the dependency layer.
# A document is flushed to the zip archive when the next "filename" line is
# met or when its input file ends. All inputs share one archive, which is
# closed once after the last input.
foreach my $conllu_file (@conllu_files) {
    if ($conllu_file eq '-') {
        $fh = \*STDIN;
    }
    else {
        open($fh, "<", $conllu_file) or die "Cannot open $conllu_file: $!";
    }
    my $i = 0;
    my $s = 0;
    my $first_in_sentence = 0;
    my $lastDocSigle = "";
    while (<$fh>) {
        if (/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
            $filename = $1;
            # A new filename starts a new document: flush the previous one.
            if (!$first) {
                closeDoc(0);
            }
            else {
                $first = 0;
            }
            if ($processedFilenames{$filename}) {
                print STDERR "WARNING: $filename is already processed\n";
            }
            $processedFilenames{$filename} = 1;
            $i = 0;
        }
        elsif (/^#\s*foundry\s*[:=]\s*(.*)/) {
            $foundry_name = $1;
            print STDERR "Foundry: $foundry_name\n" if ($debug);
        }
        elsif (/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
            # Any key ending in "id" (e.g. "text_id") sets the document id.
            $docid = $1;
            my $docSigle = $docid;
            $docSigle =~ s/\..*//;
            if ($docSigle ne $lastDocSigle) {
                print STDERR "Analyzing $docSigle\n";
                $lastDocSigle = $docSigle;
            }
            $known = $unknown = 0;
            $current = "";
            # Member names: two directory levels above the announced
            # filename, then <foundry>/<layer>.xml.
            $parser_file = dirname($filename);
            $parser_file =~ s@(.*)/[^/]+$@$1@;
            $morpho_file = $parser_file;
            $morpho_file .= "/$foundry_name/morpho.xml";
            $parser_file .= "/$foundry_name/dependency.xml";
            $parse = $morpho = layer_header($docid);
        }
        elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
            @spansFrom = split(/\s+/, $1);
        }
        elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
            # Whitespace before the separator is optional, as for the
            # start offsets above.
            @spansTo = split(/\s+/, $1);
        }
        elsif (/^#/) {
            # Any other comment line carries nothing this converter uses.
            next;
        }
        elsif (!/^\s*$/) {
            # Strip the newline BEFORE splitting: chomping $parsed[9] on a
            # short line would create that element and defeat the column
            # count check below. Limit -1 keeps trailing empty columns.
            chomp;
            my @parsed = split(/\t/, $_, -1);
            if (@parsed != 10) {
                print STDERR "WARNING: skipping strange parser output line in $docid\n";
                $i++;
                next;
            }
            my $t = $parsed[0];
            if ($t == 1) {
                $s++;
                $first_in_sentence = $i;
            }
            my $span_open = qq(<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">\n);
            if ($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
                $write_syntax = 1;
                my $from  = $spansFrom[$parsed[6]];
                my $to    = $spansTo[$parsed[6]];
                my $label = _xml_escape($parsed[7]);
                $parse .= $span_open
                    . qq(<rel label="$label">\n)
                    . qq(<span from="$from" to="$to"/>\n)
                    . qq(</rel>\n)
                    . qq(</span>\n);
            }
            my $lemma = _xml_escape($parsed[2]);
            my $pos   = _xml_escape($parsed[3]);
            $morpho .= qq( ) . $span_open
                . qq(<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">\n)
                . qq(<f name="lex">\n)
                . qq(<fs>\n)
                . qq(<f name="lemma">$lemma</f>\n)
                . qq(<f name="pos">$pos</f>\n);
            if ($parsed[5] ne "_") {
                my $msd = _xml_escape($parsed[5]);
                $morpho .= qq( <f name="msd">$msd</f>\n);
            }
            if ($parsed[9] ne "_") {
                my $misc = _xml_escape($parsed[9]);
                # Only a value made up entirely of numbers is a certainty;
                # an unanchored test would also accept e.g. "SpaceAfter=No".
                # NOTE(review): "|"- or space-separated number lists are
                # accepted as well - confirm against the producing tagger.
                my $num = qr/[-+]?(?:\d+\.?\d*|\.\d+)(?:[eE][-+]?\d+)?/;
                if ($parsed[9] =~ /\A$num(?:[|\s]+$num)*\z/) {
                    $morpho .= qq( <f name="certainty">$misc</f>\n);
                }
                else {
                    $morpho .= qq( <f name="misc">$misc</f>\n);
                }
            }
            $morpho .= qq( </fs>\n)
                . qq(</f>\n)
                . qq(</fs>\n)
                . qq(</span>\n);
            $i++;
        }
    }
    $current .= "\n";
    # Flush the last document of this input, if it contained one, and start
    # the next input with no open document so nothing is written twice.
    closeDoc(1) unless $first;
    $first = 1;
    close($fh);
}

# The archive is shared by all inputs, so it is closed only once, here.
if (defined $zip) {
    $zip->close() or die "ERROR: closing zip failed: $ZipError\n";
}
else {
    print STDERR "WARNING: no documents found in input\n";
}
exit;

# Escapes a field value for use in XML text or a double-quoted attribute.
# Ampersands that already start a character or predefined entity reference
# are left alone so pre-escaped input is not escaped twice.
sub _xml_escape {
    my ($text) = @_;
    return '' unless defined $text;
    $text =~ s/&(?!(?:amp|lt|gt|quot|apos|#\d+|#x[0-9A-Fa-f]+);)/&amp;/g;
    $text =~ s/</&lt;/g;
    $text =~ s/>/&gt;/g;
    $text =~ s/"/&quot;/g;
    return $text;
}
| 152 | |
# Starts a new member called $fname in the output zip archive.
#
# The first call creates the archive object on $outh and stores it in $zip;
# every later call opens a further member on that same object.
# Dies with $ZipError if the member cannot be started.
sub newZipStream {
    my ($fname) = @_;

    # Identical settings for the first and all following members.
    my @member_opts = (
        Zip64    => 1,
        TextFlag => 1,
        Method   => $_COMPRESSION_METHOD,
        Append   => 1,
        Name     => "$fname",
    );

    if (defined $zip) {
        $zip->newStream(@member_opts)
            or die "ERROR ('$fname'): zip failed: $ZipError\n";
    }
    else {
        # Direct method call instead of indirect object syntax
        # ("new IO::Compress::Zip ..."), which is ambiguous to parse.
        $zip = IO::Compress::Zip->new($outh, @member_opts)
            or die "ERROR ('$fname'): zip failed: $ZipError\n";
    }
    return;
}
| 165 | |
# Writes the layers accumulated for the current document to the zip archive.
#
# The morpho layer ($morpho -> $morpho_file) is written whenever
# $write_morpho is set; the dependency layer ($parse -> $parser_file) only
# if $write_syntax was set for this document, and the flag is cleared
# afterwards. Each layer is completed with its closing tags.
# Callers pass one argument (0 or 1); it is not used.
# Dies if writing to the archive fails.
sub closeDoc {
    if ($write_morpho) {
        newZipStream($morpho_file);
        # print() returns false on a write error (e.g. full disk or closed
        # pipe); fail loudly rather than leave a truncated archive.
        $zip->print($morpho, qq( </spanList>\n</layer>\n))
            or die "ERROR ('$morpho_file'): zip write failed: $ZipError\n";
    }
    if ($write_syntax) {
        $write_syntax = 0;
        newZipStream($parser_file);
        $zip->print($parse, qq(</spanList>\n</layer>\n))
            or die "ERROR ('$parser_file'): zip write failed: $ZipError\n";
    }
    return;
}
| 177 | |
# Returns the opening part of a KorAP-XML layer file for document $docid:
# the XML declaration, the xml-model processing instruction, the <layer>
# start tag carrying the docid, and the opening <spanList> tag, each
# followed by a newline.
sub layer_header {
    my ($docid) = @_;
    my @header_lines = (
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>',
        '<layer docid="' . $docid . '" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">',
        '<spanList>',
    );
    return join("\n", @header_lines) . "\n";
}