blob: 005861888925d69355da4a83a606feea13f02b8f [file] [log] [blame]
Marc Kupietz79ba1e52021-02-12 17:26:54 +01001#!/usr/bin/env perl
2use strict;
3use warnings;
4use POSIX;
5use Getopt::Std;
6use Encode;
7use IO::Compress::Zip qw(zip $ZipError :constants);
8use File::Basename;
9
# Compression method handed to IO::Compress::Zip for every archive member.
my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;
# Command line switches filled in by getopts() below.
my %opts;
# Names seen in "filename" header lines; used to warn about duplicates.
my %processedFilenames;

our $VERSION = '0.4.1.9000';

# Help text; printed (via die) when -h is given.
my $usage=<<EOF;
Usage: $0 [options] [CoNLL-U-FILE...]

Options:
 -h print this help text and exit
 -d debug
Description:
 Converts CoNLL-U files that follow KorAP-specific comment conventions
 and contain morphosyntactic and/or dependency annotations to
 corresponding KorAP-XML zip files.

Examples:
 $0 zca20.spacy.conllu > zca20.spacy.zip

 $0 < zca20.spacy.conllu > zca20.spacy.zip
EOF


getopts('hd', \%opts);
die $usage if($opts{h});
my $debug=($opts{d}? 1 : 0);

# --- State shared between the main loop and the subs further down ---------

my $docid="";            # value of the most recent id header line
my $zip = undef;         # IO::Compress::Zip object, created lazily by newZipStream()
my $outh = \*STDOUT;     # handle the archive is written to
my $parser_file;         # archive member name for the dependency layer
my $parse;               # accumulated dependency layer XML
my $morpho_file;         # archive member name for the morpho layer
my $morpho;              # accumulated morpho layer XML
my @spansFrom;           # offsets from the latest start_offsets/from header, indexed by token id
my @spansTo;             # offsets from the latest end_offsets/to header, indexed by token id
my $current;             # only ever assigned in the visible code, never read
my ($unknown, $known) = (0, 0);   # reset per document, never read in the visible code

# $write_morpho / $write_syntax decide which layers closeDoc() emits;
# $base is not referenced anywhere in the visible code.
my ($write_morpho, $write_syntax, $base) = (1, 0, 0);
my $filename;            # value of the most recent filename header line
my $foundry_name;        # value of the most recent foundry header line
my $first=1;             # true until the first filename header has been seen
my @conllu_files = @ARGV;
# Without file arguments, read from STDIN (represented as "-").
push @conllu_files, "-" if (@conllu_files == 0);
my $fh;
# True between an id header line and the closeDoc() call that flushes that
# document. Guards against flushing the same document twice (end of one
# input file followed by the first filename header of the next) and against
# flushing when no document has been started at all.
my $doc_open = 0;

# Read every input, split it into documents at the header comment lines and
# collect morpho / dependency XML for each document; closeDoc() writes the
# collected layers into the zip archive.
foreach my $conllu_file (@conllu_files) {
  if ($conllu_file eq '-') {
    $fh = \*STDIN;
  } else {
    open($fh, "<", $conllu_file) or die "Cannot open $conllu_file: $!";
  }
  my $i=0; my $s=0; my $first_in_sentence=0;
  my $lastDocSigle="";
  while (<$fh>) {
    if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
      # A filename header starts a new document: flush the previous one.
      $filename=$1;
      if(!$first) {
        if($doc_open) {
          closeDoc(0);
          $doc_open = 0;
        }
      } else {
        $first=0;
      }
      if($processedFilenames{$filename}) {
        print STDERR "WARNING: $filename is already processed\n";
      }
      $processedFilenames{$filename}=1;
      $i=0;
    } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) {
      $foundry_name=$1;
      print STDERR "Foundry: $foundry_name\n" if($debug);
    } elsif(/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
      $docid=$1;
      # Everything before the first dot of the id is reported as progress,
      # once per distinct value.
      my $docSigle = $docid;
      $docSigle =~ s/\..*//;
      if($docSigle ne $lastDocSigle) {
        print STDERR "Analyzing $docSigle\n";
        $lastDocSigle = $docSigle;
      }
      $known=$unknown=0;
      $current="";
      # Member names: directory of $filename with its last path component
      # removed, followed by the foundry name and the layer file name.
      $parser_file = dirname($filename);
      $parser_file =~ s@(.*)/[^/]+$@$1@;
      $morpho_file = $parser_file;
      $morpho_file .= "/$foundry_name/morpho.xml";
      $parser_file .= "/$foundry_name/dependency.xml";
      $parse = $morpho = layer_header($docid);
      $doc_open = 1;
    } elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
      @spansFrom = split(/\s+/, $1);
    } elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
      # \s* before the separator, consistent with the other header patterns.
      @spansTo = split(/\s+/, $1);
    } elsif (! /^\s*$/) {
      # Token line. Strip the line ending BEFORE splitting: chomp on
      # $parsed[9] would autovivify that element on short lines and make
      # the column count check below pass for malformed input. LIMIT -1
      # keeps an empty last column.
      chomp;
      my @parsed=split(/\t/, $_, -1);
      if(@parsed != 10) {
        print STDERR "WARNING: skipping strange parser output line in $docid\n";
        $i++;
        next;
      }
      my $t=$parsed[0];
      if($t == 1) {
        # Token id 1 marks the start of a new sentence.
        $s++;
        $first_in_sentence = $i;
      }
      # NOTE(review): column values are interpolated into the XML below
      # without escaping; values containing & or < would yield malformed
      # XML unless the input is already entity-encoded - confirm upstream.
      if($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
        # HEAD and DEPREL present: emit a dependency relation and remember
        # that this document needs a dependency layer.
        $write_syntax=1;
        my $from=$spansFrom[$parsed[6]];
        my $to=$spansTo[$parsed[6]];
        $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
<rel label="$parsed[7]">
<span from="$from" to="$to"/>
</rel>
</span>
@;
      }
      $morpho .= qq( <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
 <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
 <f name="lex">
 <fs>
 <f name="lemma">$parsed[2]</f>
 <f name="pos">$parsed[3]</f>
);
      $morpho .= qq( <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_");
      if($parsed[9] ne "_") {
        # NOTE(review): this pattern is unanchored, so any value containing
        # a digit, '.' or 'e' is written as "certainty" - confirm intent.
        if ($parsed[9] =~ /[0-9.e]+/) {
          $morpho .= qq( <f name="certainty">$parsed[9]</f>\n)
        }
        else {
          $morpho .= qq( <f name="misc">$parsed[9]</f>\n)
        }
      }
      $morpho .= qq( </fs>
 </f>
 </fs>
 </span>
);
      $i++;
    }
  }
  $current .= "\n";
  # Flush the last document of this input, if there is one.
  if($doc_open) {
    closeDoc(1);
    $doc_open = 0;
  }
  close($fh);
}
# Finish the archive once, after ALL inputs: closing it per input file would
# make newStream() fail for every file after the first. $zip is undefined
# when no document was found at all.
if (defined $zip) {
  $zip->close() or die "ERROR: closing zip failed: $ZipError\n";
}
exit;
154
# Start a new archive member named $fname in the output zip.
#
# On the first call the file-level $zip object is created, writing to
# $outh; on every later call a further member is opened on that same
# object with newStream(). All members use $_COMPRESSION_METHOD.
#
# Parameters:
#   $fname - name of the archive member to start
# Returns: the $zip object.
# Dies with $ZipError if the member cannot be started.
sub newZipStream {
  my ($fname) = @_;
  if (defined $zip) {
    $zip->newStream(Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
                    Append => 1, Name => $fname)
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  } else {
    # Direct method call instead of indirect object syntax
    # ("new IO::Compress::Zip ..."), which is ambiguous to parse.
    $zip = IO::Compress::Zip->new($outh, Zip64 => 1, TextFlag => 1,
                                  Method => $_COMPRESSION_METHOD, Append => 1,
                                  Name => "$fname")
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  }
  return $zip;
}
167
# Write the layers collected for the current document into the zip archive.
#
# If $write_morpho is set, $morpho plus the closing tags is written to the
# member $morpho_file. If $write_syntax is set, $parse plus the closing tags
# is written to the member $parser_file and $write_syntax is cleared again.
#
# Callers pass a 0/1 argument; it is not used here.
# Dies with $ZipError if writing to the archive fails, matching the error
# style of newZipStream(), so a failed write cannot silently produce a
# truncated archive.
sub closeDoc {
  if ($write_morpho) {
    newZipStream($morpho_file);
    $zip->print($morpho, qq( </spanList>\n</layer>\n))
      or die "ERROR ('$morpho_file'): zip failed: $ZipError\n";
  }
  if ($write_syntax) {
    $write_syntax = 0;
    newZipStream($parser_file);
    $zip->print($parse, qq(</spanList>\n</layer>\n))
      or die "ERROR ('$parser_file'): zip failed: $ZipError\n";
  }
  return;
}
179
# Build the opening part of a KorAP-XML layer document.
#
# Parameters:
#   $docid - value placed in the docid attribute of the <layer> element
# Returns: XML declaration, xml-model processing instruction, opening
#          <layer> tag and opening <spanList> tag, one per line, each
#          terminated by a newline.
sub layer_header {
  my ($docid) = @_;
  my @header_lines = (
    '<?xml version="1.0" encoding="UTF-8"?>',
    '<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>',
    '<layer docid="' . $docid . '" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">',
    '<spanList>',
  );
  return join("\n", @header_lines) . "\n";
}
187}