| #!/usr/bin/env perl | 
 | use strict; | 
 | use warnings; | 
 | use POSIX; | 
 | use Getopt::Long qw(GetOptions :config no_auto_abbrev); | 
 | use Log::Any '$log'; | 
 | use Log::Any::Adapter; | 
 | use Encode; | 
 | use IO::Compress::Zip qw(zip $ZipError :constants); | 
 | use File::Basename; | 
 | use Pod::Usage; | 
 |  | 
 | my $_COMPRESSION_METHOD = ZIP_CM_DEFLATE; | 
 | my %opts; | 
 | my %processedFilenames; | 
 |  | 
 | our $VERSION = '0.6.2'; | 
 | our $VERSION_MSG = "\nconllu2korapxml - v$VERSION\n"; | 
 |  | 
 | use constant { | 
 |     # Set to 1 for minimal more debug output (no need to be parametrized) | 
 |     DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0 | 
 | }; | 
 |  | 
 | GetOptions( | 
 |     'force-foundry|f=s'            => \(my $foundry_name = ''), | 
 |     'log|l=s'                      => \(my $log_level = 'warn'), | 
 |  | 
 |     'help|h'                       => sub { | 
 |       pod2usage( | 
 |           -verbose  => 99, | 
 |           -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS|EXAMPLES', | 
 |           -msg      => $VERSION_MSG, | 
 |           -output   => '-' | 
 |       ) | 
 |     }, | 
 |     'version|v'                    => sub { | 
 |       pod2usage( | 
 |           -verbose => 0, | 
 |           -msg     => $VERSION_MSG, | 
 |           -output  => '-' | 
 |       ); | 
 |     } | 
 | ); | 
 |  | 
 | # Establish logger | 
 | binmode(STDERR, ':encoding(UTF-8)'); | 
 | Log::Any::Adapter->set('Stderr', log_level => $log_level); | 
 | $log->notice('Debugging is activated') if DEBUG; | 
 |  | 
 | my $docid=""; | 
 | my $zip = undef; | 
 | my $outh = '-'; | 
 | my $parser_file; | 
 | my $parse; | 
 | my $morpho_file; | 
 | my $morpho; | 
 | my @spansFrom; | 
 | my @spansTo; | 
 | my $current; | 
 | my ($unknown, $known) = (0, 0); | 
 |  | 
 | my ($write_morpho, $write_syntax, $base) = (1, 0, 0); | 
 | my $filename; | 
 | my $first=1; | 
 | my @conllu_files = @ARGV; | 
 | push @conllu_files, "-" if (@conllu_files == 0); | 
 | my $fh; | 
 | foreach my $conllu_file (@conllu_files) { | 
 |   if ($conllu_file eq '-') { | 
 |     $fh = \*STDIN; | 
 |   } else { | 
 |     open($fh, "<", $conllu_file) or die "Cannot open $conllu_file"; | 
 |   } | 
 |   my $i=0; my $s=0; my $first_in_sentence=0; | 
 |   my $lastDocSigle=""; | 
 |   MAIN: while (<$fh>) { | 
 |     if(/^\s*(?:#|0\.\d)/) { | 
 |       if(/^(?:#|0\.1)\s+filename\s*[:=]\s*(.*)/) { | 
 |         $filename=$1; | 
 |         if(!$first) { | 
 |           closeDoc(0); | 
 |         } else { | 
 |           $first=0; | 
 |         } | 
 |         if($processedFilenames{$filename}) { | 
 |           $log->warn("WARNING: $filename is already processed"); | 
 |         } | 
 |         $processedFilenames{$filename}=1; | 
 |         $i=0; | 
 |       } elsif(/^#\s*foundry\s*[:=]\s*(.*)/) { | 
 |         if(!$foundry_name) { | 
 |           $foundry_name = $1; | 
 |           $log->debug("Foundry: $foundry_name\n"); | 
 |         } else { | 
 |           $log->debug("Ignored foundry name: $1\n"); | 
 |         } | 
 |       } elsif(/^#\s*generator\s*[=]\s*udpipe/i) { | 
 |         if(!$foundry_name) { | 
 |           $foundry_name = "ud"; | 
 |           $log->debug("Foundry: $foundry_name\n"); | 
 |         } else { | 
 |           $log->debug("Ignored foundry name: ud\n"); | 
 |         } | 
 |       } elsif(/^(?:#|0\.2)\s+text_id\s*[:=]\s*(.*)/) { | 
 |         $docid=$1; | 
 |         my $docSigle = $docid; | 
 |         $docSigle =~ s/\..*//; | 
 |         if($docSigle ne $lastDocSigle) { | 
 |           $log->info("Analyzing $docSigle"); | 
 |           $lastDocSigle = $docSigle; | 
 |         } | 
 |         $known=$unknown=0; | 
 |         $current=""; | 
 |         $parser_file = dirname($filename); | 
 |         $parser_file =~ s@(.*)/[^/]+$@$1@; | 
 |         $morpho_file = $parser_file; | 
 |         $morpho_file .= "/$foundry_name/morpho.xml"; | 
 |         $parser_file .= "/$foundry_name/dependency.xml"; | 
 |         $parse = $morpho = layer_header($docid); | 
 |       }  elsif (/^(?:#|0\.3)\s+(?:start_offsets|from)\s*[:=]\s*(.*)/) { | 
 |         @spansFrom = split(/\s+/, $1); | 
 |       }  elsif (/^(?:#|0\.4)\s+(?:end_offsets|to)\s+[:=]\s*(.*)/) { | 
 |         @spansTo = split(/\s+/, $1); | 
 |       } | 
 |     } elsif ( !/^\s*$/ ) { | 
 |       if ( !$docid || scalar @spansTo == 0 || scalar @spansFrom == 0 ) { | 
 |         if ( !$docid ) { | 
 |           $log->warn("WARNING: No valid input document: text_id (e.g. '# text_id = GOE_AGA.00000') missing"); | 
 |         } | 
 |         if ( scalar @spansTo == 0 || scalar @spansFrom == 0 ) { | 
 |           $log->warn("WARNING: No valid input document: token offsets missing"); | 
 |         } | 
 |  | 
 |         # Skip to next potentially valid document | 
 |         while (<$fh>) { | 
 |           next MAIN if m!^\s*$!s; | 
 |         } | 
 |       }; | 
 |       my @parsed=split('\t'); | 
 |       chomp  $parsed[9]; | 
 |       if (@parsed != 10) { | 
 |         $log->warn("WARNING: skipping strange parser output line in $docid"); | 
 |         $i++; | 
 |         next; | 
 |       } | 
 |       my $t=$parsed[0]; | 
 |       if($t == 1) { | 
 |         $s++; | 
 |         $first_in_sentence = $i; | 
 |       } | 
 |       if($parsed[6] =~ /\d+/ && $parsed[7] !~ /_/) { | 
 |         $write_syntax=1; | 
 |         my $from=$spansFrom[$parsed[6]]; | 
 |         my $to=$spansTo[$parsed[6]]; | 
 |           $parse .= qq@<span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]"> | 
 | <rel label="$parsed[7]"> | 
 | <span from="$from" to="$to"/> | 
 | </rel> | 
 | </span> | 
 | @; | 
 |       } | 
 |       my $pos = $parsed[3]; | 
 |       $pos =~ s/\|.*//; | 
 |       $morpho .= qq(  <span id="s${s}_n$t" from="$spansFrom[$t]" to="$spansTo[$t]"> | 
 |    <fs type="lex" xmlns="http://www.tei-c.org/ns/1.0"> | 
 |     <f name="lex"> | 
 |      <fs> | 
 |       <f name="pos">$pos</f> | 
 | ); | 
 |       $morpho .= qq(      <f name="lemma">$parsed[2]</f>\n) if($parsed[2] ne "_" || $parsed[1] eq '_'); | 
 |       $morpho .= qq(      <f name="msd">$parsed[5]</f>\n) if($parsed[5] ne "_"); | 
 |       if($parsed[9] ne "_") { | 
 |         if ($parsed[9] =~ /[0-9.e]+/) { | 
 |           $morpho .= qq(      <f name="certainty">$parsed[9]</f>\n) | 
 |         } | 
 |         else { | 
 |           $morpho .= qq(      <f name="misc">$parsed[9]</f>\n) | 
 |         } | 
 |       } | 
 |       $morpho .= qq(     </fs> | 
 |     </f> | 
 |    </fs> | 
 |   </span> | 
 | ); | 
 |         $i++; | 
 |     } | 
 |   } | 
 |   $current .= "\n"; | 
 |   closeDoc(1); | 
 |   $zip->close() if $zip; | 
 |   close($fh); | 
 | } | 
 | exit; | 
 |  | 
 | sub newZipStream { | 
 |   my ($fname) = @_; | 
 |   if (defined $zip) { | 
 |     $zip->newStream(Zip64 => 1, TextFlag => 1, Method => $_COMPRESSION_METHOD, | 
 |         Append            => 1, Name => $fname) | 
 |         or die "ERROR ('$fname'): zip failed: $ZipError\n"; | 
 |   } else { | 
 |     $zip = new IO::Compress::Zip $outh, Zip64 => 1, TextFlag => 1, | 
 |         Method => $_COMPRESSION_METHOD, Append => 0, Name => "$fname" | 
 |         or die "ERROR ('$fname'): zip failed: $ZipError\n"; | 
 |   } | 
 | } | 
 |  | 
 | sub closeDoc { | 
 |   if ($write_morpho && $morpho_file) { | 
 |     newZipStream($morpho_file); | 
 |     $zip->print($morpho, qq( </spanList>\n</layer>\n)); | 
 |   } | 
 |   if ($write_syntax && $parser_file) { | 
 |     $write_syntax = 0; | 
 |     newZipStream($parser_file); | 
 |     $zip->print($parse, qq(</spanList>\n</layer>\n)); | 
 |   } | 
 | } | 
 |  | 
 | sub layer_header { | 
 |   my ($docid) = @_; | 
 |   return(qq(<?xml version="1.0" encoding="UTF-8"?> | 
 | <?xml-model href="span.rng" type="application/xml" schematypens="http://relaxng.org/ns/structure/1.0"?> | 
 | <layer docid="$docid" xmlns="http://ids-mannheim.de/ns/KorAP" version="KorAP-0.4"> | 
 | <spanList> | 
 | )); | 
 | } | 
 |  | 
 | =pod | 
 |  | 
 | =encoding utf8 | 
 |  | 
 | =head1 NAME | 
 |  | 
 | conllu2korapxml - Conversion of KorAP-XML CoNLL-U to KorAP-XML zips | 
 |  | 
 | =head1 SYNOPSIS | 
 |  | 
 |   conllu2korapxml < zca15.tree_tagger.conllu > zca15.tree_tagger.zip | 
 |  | 
 | =head1 DESCRIPTION | 
 |  | 
 | C<conllu2korapxml> converts CoNLL-U files that follow KorAP-specific comment conventions | 
 |  and contain morphosyntactic and/or dependency annotations to | 
 |  corresponding KorAP-XML zip files. | 
 |  | 
 | =head1 INSTALLATION | 
 |  | 
 |   $ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git | 
 |  | 
 | =head1 OPTIONS | 
 |  | 
 | =over 2 | 
 |  | 
=item B<--force-foundry|-f> I<name>

Use I<name> as the foundry name and ignore any foundry names given in the input.
 |  | 
 | =item B<--help|-h> | 
 |  | 
 | Print help information. | 
 |  | 
 | =item B<--version|-v> | 
 |  | 
 | Print version information. | 
 |  | 
 |  | 
=item B<--log|-l> I<level>

Log level for I<Log::Any>. Defaults to C<warn>.
 |  | 
 | =back | 
 |  | 
 | =head1 EXAMPLES | 
 |  | 
 |  conllu2korapxml -f tree_tagger < t/data/wdf19.morpho.conllu > wdf19.tree_tagger.zip | 
 |  | 
 | =head1 COPYRIGHT AND LICENSE | 
 |  | 
 | Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/> | 
 |  | 
 | Author: Marc Kupietz | 
 |  | 
 | Contributors: Nils Diewald | 
 |  | 
L<KorAP::XML::CoNLL-U> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
 | Corpus Analysis Platform at the | 
 | L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>, | 
 | member of the | 
 | L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>. | 
 |  | 
 | This program is free software published under the | 
 | L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>. |