| #!/usr/bin/env perl |
| use strict; |
| use warnings; |
| use POSIX; |
| use Getopt::Long qw(GetOptions :config no_auto_abbrev); |
| use Log::Any '$log'; |
| use Log::Any::Adapter; |
| use Encode; |
| use IO::Compress::Zip qw(zip $ZipError :constants); |
| use File::Basename; |
| use Pod::Usage; |
| |
# Version of this script and the banner printed by --help and --version.
our $VERSION     = '0.6.2';
our $VERSION_MSG = join '', "\n", 'conllu2korapxml - v', $VERSION, "\n";

# Compression method handed to IO::Compress::Zip for every zip member.
my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE;

# Declared for option storage; not referenced by the visible code.
my %opts;

# Names already seen in "# filename" comments (used to warn about repeats).
my %processedFilenames;

# Debug switch, read once at compile time from the environment:
# run with KORAPXMLCONLLU_DEBUG=1 to get a little more output.
use constant DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0;
| |
# Parse the command line.
#
#   --force-foundry|-f <name>  foundry name that overrides any name found in
#                              the input (stored in $foundry_name)
#   --log|-l <level>           Log::Any log level (stored in $log_level)
#   --help|-h                  print the documentation and exit
#   --version|-v               print the version banner and exit
#
# GetOptions returns false when the command line is invalid (unknown option,
# missing value). In that case the usage synopsis is printed and the script
# exits with status 2 instead of converting with a half-understood command
# line. With that exit status Pod::Usage writes to STDERR, so the zip stream
# on STDOUT is not polluted.
GetOptions(
    'force-foundry|f=s' => \(my $foundry_name = ''),
    'log|l=s'           => \(my $log_level    = 'warn'),

    'help|h' => sub {
        pod2usage(
            -verbose  => 99,
            -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS|EXAMPLES',
            -msg      => $VERSION_MSG,
            -output   => '-'
        );
    },
    'version|v' => sub {
        pod2usage(
            -verbose => 0,
            -msg     => $VERSION_MSG,
            -output  => '-'
        );
    },
) or pod2usage(-verbose => 0, -exitval => 2);
| |
# Send log output to STDERR as UTF-8, filtered by the requested log level.
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
if (DEBUG) {
    $log->notice('Debugging is activated');
}

# --- State shared between the main loop and the subs below ----------------

my $docid = "";                # value of the current "# text_id" comment
my $zip;                       # IO::Compress::Zip object; undef until the first member is opened
my $outh = '-';                # output target handed to IO::Compress::Zip ('-' is standard output)
my ($parser_file, $parse);     # zip member name and XML text of the dependency layer
my ($morpho_file, $morpho);    # zip member name and XML text of the morpho layer
my (@spansFrom, @spansTo);     # token start / end offsets from the offset comments
my $current;
my $unknown = 0;
my $known   = 0;

my $write_morpho = 1;          # the morpho layer is written for every document
my $write_syntax = 0;          # set once a dependency relation has been seen
my $base         = 0;
my $filename;                  # value of the current "# filename" comment
my $first = 1;                 # true until the first "# filename" comment was read

# Inputs are the remaining command line arguments; read STDIN if there are none.
my @conllu_files = @ARGV ? @ARGV : ('-');
my $fh;

# A foundry given as "<name> dependency:<other>" is split into the name used
# for the morpho layer and the name used for the dependency layer; otherwise
# both layers share the same foundry name.
my $dependency_foundry_name = $foundry_name;
if (my ($morpho_name, $dependency_name) = $foundry_name =~ /(.*) dependency:(.*)/) {
    $foundry_name            = $morpho_name;
    $dependency_foundry_name = $dependency_name;
}
| |
# Main conversion loop.
#
# Reads every input (a file name, or '-' for STDIN) line by line:
#   * comment lines ("# key = value", or the "0.N key = value" variant) carry
#     the document metadata: filename, foundry, text_id and token offsets;
#   * every other non-blank line is a 10-column, tab-separated token line
#     that is appended to the morpho layer and, if it has a head and a
#     relation label, to the dependency layer.
# A document is flushed to the zip via closeDoc() when the next
# "# filename" comment is read and at the end of each input.
foreach my $conllu_file (@conllu_files) {
    if ($conllu_file eq '-') {
        $fh = \*STDIN;
    }
    else {
        open($fh, "<", $conllu_file)
            or die "Cannot open $conllu_file: $!";
    }

    my $s            = 0;     # sentence counter, part of every span id
    my $lastDocSigle = "";

  MAIN:
    while (my $line = <$fh>) {
        if ($line =~ /^\s*(?:#|0\.\d)/) {

            # --- metadata comments ---
            if ($line =~ /^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) {
                $filename = $1;

                # A new filename ends the previous document, if there is one.
                if ($first) {
                    $first = 0;
                }
                else {
                    closeDoc(0);
                }
                if ($processedFilenames{$filename}) {
                    $log->warn("WARNING: $filename is already processed");
                }
                $processedFilenames{$filename} = 1;
            }
            elsif ($line =~ /^#\s*foundry\s*[:=]\s*(.*)/) {
                my $declared = $1;

                # A foundry that is already set (via --force-foundry or an
                # earlier comment) wins over the one declared here.
                if (!$foundry_name) {
                    $dependency_foundry_name = $foundry_name = $declared;
                    if ($foundry_name =~ /(.*) dependency:(.*)/) {
                        $foundry_name            = $1;
                        $dependency_foundry_name = $2;
                    }
                    $log->debug("Foundry: $foundry_name\n");
                }
                else {
                    $log->debug("Ignored foundry name: $declared\n");
                }
            }
            elsif ($line =~ /^#\s*generator\s*[=]\s*udpipe/i) {
                if (!$foundry_name) {
                    $dependency_foundry_name = $foundry_name = "ud";
                    $log->debug("Foundry: $foundry_name\n");
                }
                else {
                    $log->debug("Ignored foundry name: ud\n");
                }
            }
            elsif ($line =~ /^(?:#|0\.2)\s+text_id\s*[:=]\s*(.*)/) {
                $docid = $1;

                # The document sigle is the text id up to its first dot.
                (my $docSigle = $docid) =~ s/\..*//;
                if ($docSigle ne $lastDocSigle) {
                    $log->info("Analyzing $docSigle");
                    $lastDocSigle = $docSigle;
                }
                $known   = $unknown = 0;
                $current = "";

                # Layer files live two directory levels above the file named
                # in the "# filename" comment, below the foundry directory.
                $parser_file = dirname($filename);
                $parser_file =~ s@(.*)/[^/]+$@$1@;
                $morpho_file = $parser_file;
                $morpho_file .= "/$foundry_name/morpho.xml";
                $parser_file .= "/$dependency_foundry_name/dependency.xml";
                $parse = $morpho = layer_header($docid);
            }
            elsif ($line =~ /^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) {
                @spansFrom = split(/\s+/, $1);
            }
            elsif ($line =~ /^(?:#|0\.4)\s+(?:end_offsets|to)\s*[:=]\s*(.*)/) {
                # Whitespace before ':' / '=' is optional, as for the start offsets.
                @spansTo = split(/\s+/, $1);
            }
        }
        elsif ($line !~ /^\s*$/) {

            # --- token line ---
            if (!$docid || scalar @spansTo == 0 || scalar @spansFrom == 0) {
                if (!$docid) {
                    $log->warn("WARNING: No valid input document: text_id (e.g. '# text_id = GOE_AGA.00000') missing");
                }
                if (scalar @spansTo == 0 || scalar @spansFrom == 0) {
                    $log->warn("WARNING: No valid input document: token offsets missing");
                }

                # Skip to the next blank line, i.e. to the next potentially
                # valid sentence or document.
                while (defined(my $skipped = <$fh>)) {
                    next MAIN if $skipped =~ m!^\s*$!s;
                }

                # Input ended while skipping: there is nothing left to process.
                last MAIN;
            }

            # Strip the line ending before splitting so that the number of
            # columns can be checked reliably; the limit of -1 keeps empty
            # trailing columns.
            chomp(my $record = $line);
            my @parsed = map {
                my $field = $_;

                # Escape XML special characters; the values are embedded in
                # the generated XML below.
                $field =~ s/&/&amp;/g;
                $field =~ s/</&lt;/g;
                $field =~ s/>/&gt;/g;
                $field;
            } split(/\t/, $record, -1);

            if (@parsed != 10) {
                $log->warn("WARNING: skipping strange parser output line in $docid");
                next;
            }

            # Column 1 is the token id; id 1 starts a new sentence.
            my $t = $parsed[0];
            if ($t == 1) {
                $s++;
            }

            # Dependency layer: column 7 is the head id, column 8 the relation
            # label. Offsets are looked up by token id / head id.
            if ($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) {
                $write_syntax = 1;
                my $from = $spansFrom[ $parsed[6] ];
                my $to   = $spansTo[ $parsed[6] ];
                $parse .= qq(<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">\n)
                        . qq(<rel label="$parsed[7]">\n)
                        . qq(<span from="$from" to="$to"/>\n)
                        . qq(</rel>\n)
                        . qq(</span>\n);
            }

            # Morpho layer: only the first of several '|'-separated tags in
            # column 5 is used as pos.
            my $pos  = $parsed[4];
            my $upos = $parsed[3];
            $pos =~ s/\|.*//;
            $morpho .= qq( <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]">\n)
                     . qq(<fs type="lex" xmlns="http://www.tei-c.org/ns/1.0">\n)
                     . qq(<f name="lex">\n)
                     . qq(<fs>\n);
            if ($pos ne "_") {
                $morpho .= qq( <f name="pos">$pos</f>\n);
            }
            if ($upos ne "_") {
                $morpho .= qq( <f name="upos">$upos</f>\n);
            }

            # A lemma of "_" is only kept when the token itself is "_".
            $morpho .= qq( <f name="lemma">$parsed[2]</f>\n)
                if ($parsed[2] ne "_" || $parsed[1] eq '_');
            $morpho .= qq( <f name="msd">$parsed[5]</f>\n) if ($parsed[5] ne "_");
            if ($parsed[9] ne "_") {

                # NOTE(review): this pattern is unanchored, so any value that
                # contains a digit, '.' or 'e' is written as certainty.
                # Confirm the intended MISC formats before tightening it.
                if ($parsed[9] =~ /[0-9.e]+/) {
                    $morpho .= qq( <f name="certainty">$parsed[9]</f>\n);
                }
                else {
                    $morpho .= qq( <f name="misc">$parsed[9]</f>\n);
                }
            }
            $morpho .= qq( </fs>\n)
                     . qq(</f>\n)
                     . qq(</fs>\n)
                     . qq(</span>\n);
        }
    }
    $current .= "\n";

    # Flush the last document of this input, if one was started, and make
    # sure the next input does not flush it a second time.
    closeDoc(1) unless $first;
    $first = 1;
    close($fh);
}

# All inputs share one archive, so it is finalized only after the last one.
$zip->close() if $zip;
exit;
| |
# Start a new member in the output zip.
#
# The first call creates the archive on $outh with $fname as its first
# member; every later call opens a further member in the same archive.
#
# Parameters:
#   $fname - name (path inside the archive) of the member to start
# Returns the IO::Compress::Zip object that subsequent prints go to.
# Dies with $ZipError if the archive or the member cannot be created.
sub newZipStream {
    my ($fname) = @_;

    # Settings that are identical for the first and all following members.
    my %member_opts = (
        Zip64    => 1,
        TextFlag => 1,
        Method   => $_COMPRESSION_METHOD,
        Name     => "$fname",
        ExtAttr  => 0100666 << 16,
    );

    if (defined $zip) {
        $zip->newStream(%member_opts, Append => 1)
            or die "ERROR ('$fname'): zip failed: $ZipError\n";
    }
    else {
        $zip = IO::Compress::Zip->new($outh, %member_opts, Append => 0)
            or die "ERROR ('$fname'): zip failed: $ZipError\n";
    }
    return $zip;
}
| |
# Write the layers collected for the current document to the output zip.
#
# The morpho layer is written whenever $write_morpho is set and a member
# name is known. The dependency layer is written only if $write_syntax is
# set, and that flag is cleared again afterwards. Each layer is completed
# with its closing </spanList> and </layer> tags.
#
# Callers pass one argument (0 or 1); it is not used here.
# Dies if a zip member cannot be created or written.
sub closeDoc {
    if ($write_morpho && $morpho_file) {
        newZipStream($morpho_file);
        $zip->print($morpho, qq( </spanList>\n</layer>\n))
            or die "ERROR ('$morpho_file'): zip write failed: $ZipError\n";
    }
    if ($write_syntax && $parser_file) {
        $write_syntax = 0;
        newZipStream($parser_file);
        $zip->print($parse, qq(</spanList>\n</layer>\n))
            or die "ERROR ('$parser_file'): zip write failed: $ZipError\n";
    }
    return;
}
| |
# Build the opening part of a KorAP-XML layer file: the XML declaration,
# the xml-model processing instruction, the <layer> element for the given
# document and the opening <spanList> tag.
#
# Parameters:
#   $docid - document id written into the docid attribute
# Returns the header as a string that ends with a newline.
sub layer_header {
    my ($docid) = @_;

    # The id is taken from the input, so characters that are special inside
    # an XML attribute value are escaped before it is embedded.
    my %entity_for = (
        '&' => '&amp;',
        '<' => '&lt;',
        '>' => '&gt;',
        '"' => '&quot;',
    );
    (my $escaped_docid = $docid) =~ s/([&<>"])/$entity_for{$1}/g;

    return qq(<?xml version="1.0" encoding="UTF-8"?>\n)
         . qq(<?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?>\n)
         . qq(<layer docid="$escaped_docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4">\n)
         . qq(<spanList>\n);
}
| |
=pod

=encoding utf8

=head1 NAME

conllu2korapxml - Conversion of KorAP-XML CoNLL-U to KorAP-XML zips

=head1 SYNOPSIS

  conllu2korapxml < zca15.tree_tagger.conllu > zca15.tree_tagger.zip

=head1 DESCRIPTION

C<conllu2korapxml> converts CoNLL-U files that follow KorAP-specific comment conventions
and contain morphosyntactic and/or dependency annotations to
corresponding KorAP-XML zip files.

=head1 INSTALLATION

  $ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git

=head1 OPTIONS

=over 2

=item B<--force-foundry|-f>

Set foundry name and ignore foundry names in the input.
A value of the form C<"morpho_foundry dependency:dependency_foundry">
sets different foundry names for the morphological and the dependency layer.

=item B<--help|-h>

Print help information.

=item B<--version|-v>

Print version information.

=item B<--log|-l>

Loglevel for I<Log::Any>. Defaults to C<warn>.

=back

=head1 EXAMPLES

  conllu2korapxml -f tree_tagger < t/data/wdf19.morpho.conllu > wdf19.tree_tagger.zip

  conllu2korapxml -f "tree_tagger dependency:malt" < t/data/wdf19.tt-malt.conllu > wdf19.tree_tagger.zip

=head1 COPYRIGHT AND LICENSE

Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>

Author: Marc Kupietz

Contributors: Nils Diewald

L<KorAP::XML::CoNLL-U> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.

This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.