blob: 6de032f63e39794bcc2343561152b2766e4f133d [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use POSIX;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use Log::Any '$log';
use Log::Any::Adapter;
use Encode;
use IO::Compress::Zip qw(zip $ZipError :constants);
use File::Basename;
use Pod::Usage;
# Compression method applied to every member written to the output zip.
my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;

# Not referenced by the code visible in this file.
my %opts;

# Names seen in "# filename" headers so far; used to warn about repeats.
my %processedFilenames;

our $VERSION     = '0.4.1.9000';
our $VERSION_MSG = "\nconllu2korapxml - v" . $VERSION . "\n";

# Export KORAPXMLCONLLU_DEBUG=1 to get a little extra debug output
# (deliberately an environment switch rather than a command line option).
use constant DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0;
# Command line handling.
#   --force-foundry|-f NAME  foundry name that overrides names found in the input
#   --log|-l LEVEL           log level handed to the Log::Any adapter (default: warn)
#   --help|-h                print the selected POD sections
#   --version|-v             print the version banner
my $foundry_name = '';
my $log_level    = 'warn';

GetOptions(
  'force-foundry|f=s' => \$foundry_name,
  'log|l=s'           => \$log_level,
  'help|h'            => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS|EXAMPLES',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
)
  # GetOptions returns false when it met an unknown or malformed option.
  # Stop with a usage message instead of converting with settings the
  # user did not ask for.
  or pod2usage(-verbose => 1, -exitval => 2);
# Send log output to STDERR as UTF-8, filtered by the requested log level.
binmode STDERR, ':encoding(UTF-8)';
Log::Any::Adapter->set('Stderr', log_level => $log_level);
if (DEBUG) {
  $log->notice('Debugging is activated');
}
# ---- State shared between the main loop and the helper subs ----

# Id of the document currently being read (taken from the "id" comment line).
my $docid = "";

# Output archive; stays undef until the first member is written.
my $zip;
my $outh = \*STDOUT;

# Member name and buffered XML of the dependency layer.
my ($parser_file, $parse);

# Member name and buffered XML of the morpho layer.
my ($morpho_file, $morpho);

# Offsets from the "start_offsets"/"end_offsets" comment lines,
# looked up by the token id in column 1.
my (@spansFrom, @spansTo);

my $current;
my $unknown = 0;
my $known   = 0;

# Which layers closeDoc writes; $write_syntax is switched on by the main
# loop once a dependency relation was seen.
my $write_morpho = 1;
my $write_syntax = 0;
my $base         = 0;

my $filename;

# True until the first "# filename" header has been read.
my $first = 1;

# Inputs named on the command line; "-" (STDIN) when there are none.
my @conllu_files = @ARGV ? @ARGV : ("-");
my $fh;
# Main conversion loop.
#
# Reads every input file ("-" means STDIN) line by line.
#   * Comment lines (starting with "#" or with a "0.N" pseudo id) carry the
#     metadata: filename, foundry, generator, text id, start and end offsets.
#   * All other non-empty lines are tab separated token lines with exactly
#     ten columns. They are appended to the morpho buffer and, if they carry
#     a head and relation label, to the dependency buffer.
# A "# filename" header flushes the previous document via closeDoc. The last
# document of each input file is flushed at the end of that file, and the
# archive is finalized once, after all inputs were processed.
foreach my $conllu_file (@conllu_files) {
  if ($conllu_file eq '-') {
    $fh = \*STDIN;
  } else {
    # Report the OS error as well, so "not found" and "permission denied"
    # can be told apart.
    open($fh, "<", $conllu_file) or die "Cannot open $conllu_file: $!";
  }

  my $s = 0;               # sentence counter used in span ids (per input file)
  my $lastDocSigle = "";   # document sigle that was last announced in the log

  while (<$fh>) {
    if (/^\s*(?:#|0\.\d)/) {
      if (/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
        $filename = $1;
        if (!$first) {
          # A previous document is still buffered: write it out first.
          closeDoc(0);
        } else {
          $first = 0;
        }
        if ($processedFilenames{$filename}) {
          $log->warn("WARNING: $filename is already processed");
        }
        $processedFilenames{$filename} = 1;
      } elsif (/^#\s*foundry\s*[:=]\s*(.*)/) {
        # A foundry given with --force-foundry (or seen earlier) wins.
        if (!$foundry_name) {
          $foundry_name = $1;
          $log->debug("Foundry: $foundry_name\n");
        } else {
          $log->debug("Ignored foundry name: $1\n");
        }
      } elsif (/^#\s*generator\s*[=]\s*udpipe/i) {
        if (!$foundry_name) {
          $foundry_name = "ud";
          $log->debug("Foundry: $foundry_name\n");
        } else {
          $log->debug("Ignored foundry name: ud\n");
        }
      } elsif (/^(?:#|0\.2)\s+.*id\s*[:=]\s*(.*)/) {
        $docid = $1;
        my $docSigle = $docid;
        $docSigle =~ s/\..*//;
        if ($docSigle ne $lastDocSigle) {
          $log->info("Analyzing $docSigle");
          $lastDocSigle = $docSigle;
        }
        $known = $unknown = 0;
        $current = "";
        # Member names: two directory levels above the announced filename,
        # followed by "<foundry>/morpho.xml" and "<foundry>/dependency.xml".
        $parser_file = dirname($filename);
        $parser_file =~ s@(.*)/[^/]+$@$1@;
        $morpho_file = $parser_file;
        $morpho_file .= "/$foundry_name/morpho.xml";
        $parser_file .= "/$foundry_name/dependency.xml";
        $parse = $morpho = layer_header($docid);
      } elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
        @spansFrom = split(/\s+/, $1);
      } elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
        # Whitespace before ":" / "=" is optional, as for the start offsets.
        @spansTo = split(/\s+/, $1);
      }
    } elsif (! /^\s*$/) {
      # Strip the newline before splitting and keep trailing empty columns
      # (limit -1), so that the column count below reflects the real line.
      # Chomping $parsed[9] first would silently extend short lines to ten
      # elements and defeat the check.
      chomp;
      my @parsed = split(/\t/, $_, -1);
      if (@parsed != 10) {
        $log->warn("WARNING: skipping strange parser output line in $docid");
        next;
      }
      my $t = $parsed[0];
      if ($t == 1) {
        $s++;
      }
      # Dependency relation: column 7 holds the head id, column 8 the label.
      if ($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
        $write_syntax = 1;
        my $from = $spansFrom[$parsed[6]];
        my $to = $spansTo[$parsed[6]];
        $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
<rel label="$parsed[7]">
<span from="$from" to="$to"/>
</rel>
</span>
@;
      }
      # Only the first alternative of a "|" separated tag list is kept.
      my $pos = $parsed[3];
      $pos =~ s/\|.*//;
      # NOTE(review): column values are interpolated into the XML without
      # escaping; presumably the input is already XML safe - confirm.
      $morpho .= qq( <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">
<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">
<f name="lex">
<fs>
<f name="pos">$pos</f>
);
      $morpho .= qq( <f name="lemma">$parsed[2]</f>\n) if ($parsed[2] ne "_" || $parsed[1] eq '_');
      $morpho .= qq( <f name="msd">$parsed[5]</f>\n) if ($parsed[5] ne "_");
      if ($parsed[9] ne "_") {
        # NOTE(review): this pattern is unanchored, so any value containing
        # a digit, "." or "e" is written as certainty - confirm intent.
        if ($parsed[9] =~ /[0-9.e]+/) {
          $morpho .= qq( <f name="certainty">$parsed[9]</f>\n)
        }
        else {
          $morpho .= qq( <f name="misc">$parsed[9]</f>\n)
        }
      }
      $morpho .= qq( </fs>
</f>
</fs>
</span>
);
    }
  }
  $current .= "\n";

  # Flush the last document of this input, but only if a "# filename"
  # header opened one. Resetting $first keeps the next input file from
  # flushing the same document a second time.
  if (!$first) {
    closeDoc(1);
    $first = 1;
  }
  close($fh);
}

# Finalize the archive once, after all inputs have been added. $zip is
# still undef when no document was written at all.
if (defined $zip) {
  $zip->close() or die "ERROR: closing zip failed: $ZipError\n";
}
exit;
# Start a new member named $fname in the output archive.
#
# The first call creates the IO::Compress::Zip object on $outh; every later
# call opens a further member on that same object via newStream.
#
# Parameters: $fname - member name inside the zip archive.
# Returns:    the zip object.
# Dies with $ZipError when the member cannot be started.
sub newZipStream {
  my ($fname) = @_;
  if (defined $zip) {
    $zip->newStream(Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD,
                    Append => 1, Name => $fname)
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  } else {
    # Direct method call instead of indirect object syntax ("new CLASS ARGS"),
    # which is discouraged and can be mis-parsed.
    $zip = IO::Compress::Zip->new($outh, Zip64 => 1, TextFlag => 1,
                                  Method => $_COMPRESSION_METHOD, Append => 1,
                                  Name => "$fname")
      or die "ERROR ('$fname'): zip failed: $ZipError\n";
  }
  return $zip;
}
# Write the buffered layers of the current document to the zip archive.
#
# The morpho layer is written whenever $write_morpho is set. The dependency
# layer is written only if $write_syntax was switched on for this document,
# and the flag is cleared again for the next one. Arguments passed by the
# callers are not used.
sub closeDoc {
  my $morpho_tail = qq( </spanList>\n</layer>\n);
  my $syntax_tail = qq(</spanList>\n</layer>\n);

  if ($write_morpho) {
    newZipStream($morpho_file);
    $zip->print($morpho, $morpho_tail);
  }

  if ($write_syntax) {
    $write_syntax = 0;
    newZipStream($parser_file);
    $zip->print($parse, $syntax_tail);
  }
}
# Build the opening part of a KorAP-XML span layer document.
#
# Parameters: $docid - value placed in the docid attribute of <layer>.
# Returns:    XML declaration, xml-model instruction, opening <layer> tag
#             and opening <spanList> tag, each terminated by a newline.
sub layer_header {
  my ($docid) = @_;
  my $header = <<"END_OF_HEADER";
<?xml version="1.0" encoding="UTF-8"?>
<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>
<layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">
<spanList>
END_OF_HEADER
  return $header;
}
=pod
=encoding utf8
=head1 NAME
conllu2korapxml - Conversion of KorAP-XML CoNLL-U to KorAP-XML zips
=head1 SYNOPSIS
conllu2korapxml < zca15.tree_tagger.conllu > zca15.tree_tagger.zip
=head1 DESCRIPTION
C<conllu2korapxml> converts CoNLL-U files that follow KorAP-specific comment conventions
and contain morphosyntactic and/or dependency annotations to
corresponding KorAP-XML zip files.
=head1 INSTALLATION
$ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git
=head1 OPTIONS
=over 2
=item B<--force-foundry|-f>
Set foundry name and ignore foundry names in the input.
=item B<--help|-h>
Print help information.
=item B<--version|-v>
Print version information.
=item B<--log|-l>
Loglevel for I<Log::Any>. Defaults to C<warn>.
=back
=head1 EXAMPLES
conllu2korapxml -f tree_tagger < t/data/wdf19.morpho.conllu > wdf19.tree_tagger.zip
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: Marc Kupietz
Contributors: Nils Diewald
L<KorAP::XML::CoNLL-U> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.