script/korapxml2conllu - KorAP/KorAP-XML-CoNLL-U - Gitiles

 #!/usr/bin/env perl
 use strict;
 use warnings;
 use POSIX;
 use Log::Any '$log';
 use Log::Any::Adapter;
 use Pod::Usage;
 use Getopt::Long qw(GetOptions :config no_auto_abbrev);

 my $MAX_SENTENCE_LENGTH=10000;
 my $COMMENT_START="#";
 my $COMMENT_END="";

 my $test=0;
 my $text_no=0;
 my %opts;
 my %plain_texts;
 my %sentence_ends;
 my %metadata;

 my $offsets = 1;
 my $comments = 1;
 my $extract_metadata = 0;
 my @extract_metadata_regex;

 my $lm_training_data = 0;

 our $VERSION = '0.6.2';

 our $VERSION_MSG = "\nkorapxml2conllu - v$VERSION\n";

 use constant {
   # Set to 1 for minimal more debug output (no need to be parametrized)
   DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0
 };

 use open ':std', ':encoding(UTF-8)';

 GetOptions(
   'sigle-pattern|p=s'            => \(my $sigle_pattern = ''),
   'extract-attributes-regex|e=s' => \(my $extract_attributes_regex = ''),
   's-bounds-from-morpho'         => \(my $s_bounds_from_morpho = 0),
   'log|l=s'                      => \(my $log_level = 'warn'),
   'columns|c=n'                  => \(my $columns = 10),
   'word2vec|lm-training-data!'   => \$lm_training_data,
   'token-separator|s=s'          => \(my $token_separator = "\n"),
   'offsets!'                     => \$offsets,
   'comments!'                    => \$comments,
   'extract-metadata-regex|m=s@'  => \@extract_metadata_regex,
   'help|h'                       => sub {
     pod2usage(
       -verbose  => 99,
       -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS|EXAMPLES',
       -msg      => $VERSION_MSG,
       -output   => '-'
     )
   },
   'version|v'                    => sub {
     pod2usage(
       -verbose => 0,
       -msg     => $VERSION_MSG,
       -output  => '-'
     );
   }
 );

 if (@extract_metadata_regex) {
   $extract_metadata = 1;
 }

 # Establish logger
 binmode(STDERR, ':encoding(UTF-8)');
 Log::Any::Adapter->set('Stderr', log_level => $log_level);
 $log->notice('Debugging is activated') if DEBUG;

 if ($lm_training_data) {
   $columns = 1;
   $comments = 0;
   $offsets = 0;
   $token_separator = " ";
 }

 my $docid="";
 my ($current_id, $current_from, $current_to, $token);
 my $current;
 my ($unknown, $known) = (0, 0);
 my @current_lines;
 my %processedFilenames;
 my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
 my $baseOnly;
 my %extras;

 my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);

 my $UNZIP = `sh -c 'command -v unzip'`;
 chomp $UNZIP;


 if ($UNZIP eq '') {
   warn('No unzip executable found in PATH.');
   return 0;
 };

 foreach my $morpho_zip (@ARGV) {
   die "cannot open $morpho_zip" if(! -r $morpho_zip);
   my $data_zip = $morpho_zip;
   if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
     open(CONLL, "<$data_zip") or die "cannot open $data_zip";
     while(<CONLL>) {
       print;
     }
     close(CONLL);
     next;
   }
   $data_zip =~ s/\.([^\/.]+)\.zip$/.zip/;
   my $foundry = $1;
   die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);

   my $first=1;
   my @conll = ("_") x 10;
   my $filename;

   $baseOnly = $morpho_zip eq $data_zip;
   my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
   my $zip_content_pattern = "[sd][ta]*";
   if ($extract_metadata) {
     $zip_content_pattern = "[sdh][tae]*";
   }
   if (!$baseOnly) {
     $morphoOrTokenCommand = "$UNZIP -c $morpho_zip '*${sigle_pattern}*/morpho.xml' $zipsiglepattern |";
     if ($extract_attributes_regex || !$s_bounds_from_morpho) {
       $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*${sigle_pattern}*/$zip_content_pattern.xml' $zipsiglepattern |";
     } else {
       $log->debug("Not reading structure information.");
       $plaintextAndStructureCommand = "$UNZIP -c $data_zip '*${sigle_pattern}*/data.xml' $zipsiglepattern |";
     }
   } else {
     $foundry = "base";
     $morphoOrTokenCommand = "$UNZIP -l $morpho_zip '*${sigle_pattern}*/morpho.xml' $zipsiglepattern";
     if (`$morphoOrTokenCommand` !~ /morpho\.xml/) {
       $morphoOrTokenCommand =~ s/morpho\.xml/tokens.xml/;
     } else {
       $baseOnly = 0;
     }
     $morphoOrTokenCommand =~ s/-l/-c/;
     $morphoOrTokenCommand .= ' |';
     $plaintextAndStructureCommand = "$UNZIP -c $data_zip " . "'*${sigle_pattern}*/$zip_content_pattern.xml' " . "$zipsiglepattern |";
   }

   $log->debug("command to extract annotation and/or tokenization: $morphoOrTokenCommand");
   $log->debug("command to extract plain text and structure: $plaintextAndStructureCommand");

   open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
   open (PLAINTEXTPIPE, $plaintextAndStructureCommand) or die "cannot unzip $data_zip";
   print "$COMMENT_START foundry = $foundry$COMMENT_END\n" if ($comments);
   while (<MORPHO_OR_TOKENPIPE>) {
     if (/^  inflating: (.*)/) {
       $filename=$1;
       while($processedFilenames{$filename} && !eof(MORPHO_OR_TOKENPIPE)) {
         $log->warn("$filename already processed");
         while (<MORPHO_OR_TOKENPIPE>) {
           last if(/\s+inflating:\s+(.*)/);
         }
         $filename=$1 if(!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
       }
     } elsif(m@(?:^|\s)docid="([^"]+)"@) {
       last if($test && $text_no++ > 3);
       if(!$first) {
         closeDoc(0);
       }
       $processedFilenames{$filename}=1;
       $docid=$1;
       @current_lines=();
       $known=$unknown=0;
       $current="";
       if ($first) {
           $first = 0;
       }
       if(!fetch_plaintext($docid)) { # skip this text
         while (<MORPHO_OR_TOKENPIPE>) {
           last if(m@</layer>@);
         }
       }
       print STDOUT "$COMMENT_START filename = $filename$COMMENT_END\n$COMMENT_START text_id = $docid$COMMENT_END\n" if($comments);
       $log->debug("Analyzing $docid");
     } elsif (m@^\s*<f\s+.*name="([^"]+)">([^<]+)</f>@) {
       if ($1 eq "lemma") {
         $conll[$LEMMA_idx] = $2;
         $conll[$LEMMA_idx] =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
         if($conll[$LEMMA_idx] eq 'UNKNOWN') {
           $conll[$LEMMA_idx] = "--";
           $unknown++;
         } else {
           $known++;
         }
       } elsif ($1 eq 'pos' || $1 eq "ctag") {
         $unknown++;
         $conll[$XPOS_idx] = $conll[$UPOS_idx] = $2;
       } elsif ($1 eq 'msd') {
         $conll[$FEATS_idx] = $2;
       } elsif ($1 eq 'certainty') {
         $conll[$MISC_idx] = $2;
       }
     } elsif (/<span /) {
       my $last_from = $current_from // -1;
       ($current_id) = /id="[^0-9]*([^\"]*)"/;
       ($current_from) = /from="([^\"]*)"/;
       ($current_to) = /to="([^\"]*)"/;
       if($extract_attributes_regex) {
         for (my $i = $last_from + 1; $i <= $current_from; $i++) {
           if ($extras{$docid}{$i}) {
             $current .= $extras{$docid}{$i};
             undef $extras{$docid}{$i};
           }
         }
       }
 #      $log->debug("found span: $current_id $current_from $current_to");
       $token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
       if (!defined $token || length($token) == 0) {
         $log->warn("$docid: could not retrieve token at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10));
         $token = "_";
       }
       $token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
       @conll = ("_") x 10;
       $conll[$FORM_idx] = $token;
       if($baseOnly) {
         my @vals = ($current_from, $current_to);
 #        $log->debug("joining : ", join(" ", @vals));
         push @current_lines, \@vals;
         $known++;
         $conll[$ID_idx] = $#current_lines+1;
         if ($columns == 1) {
           $current .= "$conll[1]$token_separator" ;
         } else {
           $current .= join("\t", @conll[0..$columns-1]) . "\n"; # conll columns
         }
         if ($sentence_ends{$docid}{$current_to}) {
           $log->debug("Using sentence end for $docid \@$current_to");
           $current .= "\n";
           printTokenRanges();
           if ($extract_metadata) {
             for (my $i = 0; $i < @extract_metadata_regex; $i++) {
               if(!defined($metadata{$docid}[$i])) {
                 $log->warn("$docid: metadata matching /$extract_metadata_regex[$i]/ was not found, using empty string instead");
                 $metadata{$docid}[$i]="";
               }
               print "$metadata{$docid}[$i]\t";
             }
           }
           $current =~ s/ $//;
           print STDOUT $current;
           $current = "";
           $known = 0;
           $unknown = 0;
           @current_lines = ();
         }
       }
     } elsif (m@^\s*</fs>@) {
       my @vals = ($current_from, $current_to);
 #      $log->debug("joining : ", join(" ", @vals));
       push @current_lines, \@vals;
       # convert gathered information to CONLL
       $conll[$ID_idx] = $#current_lines+1;
       if ($columns == 1) {
         $current .= "$conll[1]\n";
       } else {
         $current .= join("\t", @conll[0..$columns-1]) . "\n"; # conll columns
       }
       if($sentence_ends{$docid}{$current_to} || ($s_bounds_from_morpho && $conll[$XPOS_idx] eq '$.' || ($conll[$XPOS_idx] eq 'SENT' && $token eq '.')) || $known + $unknown >= $MAX_SENTENCE_LENGTH) {
         $log->debug("Using sentence end for $docid \@$current_to");
         $current .= "\n";
         if($known + $unknown > 0) { # only print sentence if it contains some words
           printTokenRanges();
           print STDOUT $current;
         }
         $current=""; $known=0; $unknown=0;
         @current_lines = ();
       }
       while (<MORPHO_OR_TOKENPIPE>) {
         last if (m@</span>@);    # only consider first interpretation
       }
     }
   }
   $current .= "\n";
   closeDoc(1);
   close(MORPHO_OR_TOKENPIPE);
   close(PLAINTEXTPIPE);
 }
 exit;

 sub printTokenRanges {
   return if(!$offsets);
   print "$COMMENT_START start_offsets = ", $current_lines[0]->[0];
   foreach my $t (@current_lines) {
     print " $t->[0]";
   }
   print "$COMMENT_END\n$COMMENT_START end_offsets = ", $current_lines[$#current_lines]->[1] if($comments);
   foreach my $t (@current_lines) {
     print " $t->[1]";
   }
   print "$COMMENT_END\n";
 }

 sub closeDoc {
   $log->debug("closing doc");
   if($known + $unknown > 0) { # only parse a sentence if it has some words
     chomp $current;
     chomp $current;
     chomp $current;
     $current .= "\n\n";
     printTokenRanges();
     print $current;
   }
 }

 # read data.xml to figure out the tokens
 # (ideally tokens should also be in in morpho.xml, but they are not)
 sub fetch_plaintext {
   my ($target_id) = @_;
   my $docid;
   my $text_started=0;
   my $text_count = 0;
   my ($current_id, $current_from, $current_to);

   if($plain_texts{$target_id} && ($s_bounds_from_morpho || $sentence_ends{$target_id}) && (!$extract_metadata || $metadata{$target_id})) {
     $log->debug("Already got $target_id");
     return 1;
   }
   while(<PLAINTEXTPIPE>) {
     if ($extract_metadata) {
       if (/<textSigle>([^<]+)/) {
         $docid = $1;
         $docid =~ s@/@_@;
         $log->debug("textsigle=$docid");
       }
       for (my $i=0; $i < @extract_metadata_regex; $i++) {
         if ($_ =~ /$extract_metadata_regex[$i]/) {
           $metadata{$docid}[$i]=$1;
         }
       }
     }
     if(/<raw_text[^>]+docid="([^"]*)/) {
       $docid=$1;
       $log->debug("Getting plain text for $docid");
       $text_started=0;
     } elsif(/<raw_text\b/) {
       $text_started=0;
     } elsif(/(?:^|\s)docid="([^"]*)/) {
       $docid=$1;
       $log->debug("Getting plain text for $docid");
     } elsif(m@<span @) {
         ($current_id) = /id="[^0-9]*([^\"]*)"/;
         ($current_from) = /from="([^\"]*)"/;
         ($current_to) = /to="([^\"]*)"/;
     } elsif(m@<f\s[^>]*>s</f>@) {
       if ($s_bounds_from_morpho) {
         $log->debug("Ignoring sentence end for $docid \@$current_to because of --s-bounds-from-morpho");
       } else {
         $log->debug("Found sentence end for $docid \@$current_to");
         $sentence_ends{$docid}{$current_to} = 1;
       }
     } elsif($extract_attributes_regex && m@<f\sname="name"[^>]*>([^<]+)</f>@) {
       my $current_element = $1;
       $log->debug("Looking for matching attributes in $docid");
       while(<PLAINTEXTPIPE>) {
         last if(m@</fs>@);
         if(m@<f\sname="([^"]+)"[^>]*>([^<]+)</f>@) {
           my $current_node = "$current_element/$1";
           my $value = $2;
           if ($current_node =~ /$extract_attributes_regex/) {
             $log->debug("Found matching attribute: $docid - $current_node = $value");
             $extras{$docid}{$current_from} .= "# $current_node = $value\n";
           }
         }
       }
     } elsif (m@<text>(.*)</text>@) {
       $_= $1;
       s/&lt;/</go;
       s/&gt;/>/go;
       s/&amp;/&/go;
       tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
       $plain_texts{$docid} = $_;
       last if(!$extract_attributes_regex && ($text_count++ > 1 && $plain_texts{$target_id} && (!$extract_metadata || $metadata{$target_id})));
     } elsif (m@<text>(.*)@) {
       $_= $1;
       s/&lt;/</go;
       s/&gt;/>/go;
       s/&amp;/&/go;
       tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
       $plain_texts{$docid} = "$_ ";
       $text_started=1;
     } elsif ($text_started && m@(.*)</text>@) {
       $_= $1;
       s/&lt;/</go;
       s/&gt;/>/go;
       s/&amp;/&/go;
       tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
       $plain_texts{$docid} .= $_;
       $text_started=0;
       last if(!$extract_attributes_regex && ($text_count++ > 1 && $plain_texts{$target_id} && (!$extract_metadata || $metadata{$target_id})));
     } elsif ($text_started) {
       chomp;
       s/&lt;/</go;
       s/&gt;/>/go;
       s/&amp;/&/go;
       tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
       $plain_texts{$docid} .= $_;
     }
   }
   $log->debug("Got plain text for $docid");
   if(defined($ENV{PLAINTEXTFILTER})) {
     if ($plain_texts{$docid} !~ $ENV{PLAINTEXTFILTER}) {
       $plain_texts{$docid} = undef;
       $log->info("Skipping $docid");
       return(undef);
     } else {
       $log->debug("Using $docid");
     }
   }
   return(1);
 }

 =pod

 =encoding utf8

 =head1 NAME

 korapxml2conllu - Conversion of KorAP-XML zips to CoNLL-U

 =head1 SYNOPSIS

   korapxml2conllu zca15.tree_tagger.zip > zca15.conllu

 =head1 DESCRIPTION

 C<korapxml2conllu> is a script to Convert L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml> base or morpho zips to CoNLL(-U) format with all information necessary
  for reconstruction in comment lines.

 =head1 INSTALLATION

   $ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git

 =head1 OPTIONS

 =over 2

 =item B<--sigle-pattern|-p>

 Convert only texts from the KorAP XML zip files with folder names (i.e. sigles) matching the glob pattern.

 =item B<--extract-attribute-pattern|-e>

 Extract element/attribute regular expressions to comments.

 =item B<--columns>=I<int> | B<-c> I<int>

 Print n columns (default: 10). If n=1, only the token itself is printed.

 =item B<--s-bounds-from-morpho>

 Get sentence boundary information from tagger output rather than from s annotation in structure.xml files.

 =item B<--token-separator>=I<string>

 Token separator.

 =item B<--(no)comments>

 Switch comment printing on or off (default: on).

 =item B<--(no)offsets>

 Switch offsets printing on or off (default: on).

 =item B<--word2vec>

 Print output in word2vec (tokenized) one sentence per line format.

 =item B<--extract-metadata-regex|-m>

 Can be used to extract and print strings from the text headers. Currently only works together with --word2vec option.
 For example:

   korapxml2conllu -m '<textSigle>([^<.]+)' -m '<creatDate>([^<]{7})' --word2vec t/data/wdf19.zip

 Will print the document sigle, year and month of the creation date and one sentence per line, separated by tabs.

 =item B<--help|-h>

 Print help information.

 =item B<--version|-v>

 Print version information.


 =item B<--log|-l>

 Loglevel for I<Log::Any>. Defaults to C<warn>.

 =back

 =head1 EXAMPLES

 =head2 Extract to CoNNL-U with posting and div ids:

  korapxml2conllu -e '(posting/id|div/id)' t/data/wdf19.zip

 =head2 Extract to word2vec input format metadata columns:

  korapxml2conllu --word2vec t/data/wdf19.zip

 =head1 COPYRIGHT AND LICENSE

 Copyright (C) 2021-2022, L<IDS Mannheim|https://www.ids-mannheim.de/>

 Author: Marc Kupietz

 Contributors: Nils Diewald

 L<KorAP::XML::CoNNL-U> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
 Corpus Analysis Platform at the
 L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
 member of the
 L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.

 This program is free software published under the
 L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
	#!/usr/bin/env perl
	use strict;
	use warnings;
	use POSIX;
	use Log::Any '$log';
	use Log::Any::Adapter;
	use Pod::Usage;
	use Getopt::Long qw(GetOptions :config no_auto_abbrev);

	my $MAX_SENTENCE_LENGTH=10000;
	my $COMMENT_START="#";
	my $COMMENT_END="";

	my $test=0;
	my $text_no=0;
	my %opts;
	my %plain_texts;
	my %sentence_ends;
	my %metadata;

	my $offsets = 1;
	my $comments = 1;
	my $extract_metadata = 0;
	my @extract_metadata_regex;

	my $lm_training_data = 0;

	our $VERSION = '0.6.2';

	our $VERSION_MSG = "\nkorapxml2conllu - v$VERSION\n";

	use constant {
	# Set to 1 for minimal more debug output (no need to be parametrized)
	DEBUG => $ENV{KORAPXMLCONLLU_DEBUG} // 0
	};

	use open ':std', ':encoding(UTF-8)';

	GetOptions(
	'sigle-pattern\|p=s' => \(my $sigle_pattern = ''),
	'extract-attributes-regex\|e=s' => \(my $extract_attributes_regex = ''),
	's-bounds-from-morpho' => \(my $s_bounds_from_morpho = 0),
	'log\|l=s' => \(my $log_level = 'warn'),
	'columns\|c=n' => \(my $columns = 10),
	'word2vec\|lm-training-data!' => \$lm_training_data,
	'token-separator\|s=s' => \(my $token_separator = "\n"),
	'offsets!' => \$offsets,
	'comments!' => \$comments,
	'extract-metadata-regex\|m=s@' => \@extract_metadata_regex,
	'help\|h' => sub {
	pod2usage(
	-verbose => 99,
	-sections => 'NAME\|DESCRIPTION\|SYNOPSIS\|ARGUMENTS\|OPTIONS\|EXAMPLES',
	-msg => $VERSION_MSG,
	-output => '-'
	)
	},
	'version\|v' => sub {
	pod2usage(
	-verbose => 0,
	-msg => $VERSION_MSG,
	-output => '-'
	);
	}
	);

	if (@extract_metadata_regex) {
	$extract_metadata = 1;
	}

	# Establish logger
	binmode(STDERR, ':encoding(UTF-8)');
	Log::Any::Adapter->set('Stderr', log_level => $log_level);
	$log->notice('Debugging is activated') if DEBUG;

	if ($lm_training_data) {
	$columns = 1;
	$comments = 0;
	$offsets = 0;
	$token_separator = " ";
	}

	my $docid="";
	my ($current_id, $current_from, $current_to, $token);
	my $current;
	my ($unknown, $known) = (0, 0);
	my @current_lines;
	my %processedFilenames;
	my $zipsiglepattern = (defined($ENV{ZIPSIGLEPATTERN})? $ENV{ZIPSIGLEPATTERN} : "");
	my $baseOnly;
	my %extras;

	my ($ID_idx, $FORM_idx, $LEMMA_idx, $UPOS_idx, $XPOS_idx, $FEATS_idx, $HEAD_idx, $DEPREC_idx, $DEPS_idx, $MISC_idx) = (0..9);

	my $UNZIP = `sh -c 'command -v unzip'`;
	chomp $UNZIP;


	if ($UNZIP eq '') {
	warn('No unzip executable found in PATH.');
	return 0;
	};

	foreach my $morpho_zip (@ARGV) {
	die "cannot open $morpho_zip" if(! -r $morpho_zip);
	my $data_zip = $morpho_zip;
	if ($data_zip !~ /\.zip/ && $data_zip =~ /\.conllu?/i) {
	open(CONLL, "<$data_zip") or die "cannot open $data_zip";
	while(<CONLL>) {
	print;
	}
	close(CONLL);
	next;
	}
	$data_zip =~ s/\.([^\/.]+)\.zip$/.zip/;
	my $foundry = $1;
	die "cannot open data file $data_zip corresponding to $morpho_zip" if(! -r $data_zip);

	my $first=1;
	my @conll = ("_") x 10;
	my $filename;

	$baseOnly = $morpho_zip eq $data_zip;
	my ($morphoOrTokenCommand, $plaintextAndStructureCommand);
	my $zip_content_pattern = "[sd][ta]*";
	if ($extract_metadata) {
	$zip_content_pattern = "[sdh][tae]*";
	}
	if (!$baseOnly) {
	$morphoOrTokenCommand = "$UNZIP -c $morpho_zip '${sigle_pattern}/morpho.xml' $zipsiglepattern \|";
	if ($extract_attributes_regex \|\| !$s_bounds_from_morpho) {
	$plaintextAndStructureCommand = "$UNZIP -c $data_zip '${sigle_pattern}/$zip_content_pattern.xml' $zipsiglepattern \|";
	} else {
	$log->debug("Not reading structure information.");
	$plaintextAndStructureCommand = "$UNZIP -c $data_zip '${sigle_pattern}/data.xml' $zipsiglepattern \|";
	}
	} else {
	$foundry = "base";
	$morphoOrTokenCommand = "$UNZIP -l $morpho_zip '${sigle_pattern}/morpho.xml' $zipsiglepattern";
	if (`$morphoOrTokenCommand` !~ /morpho\.xml/) {
	$morphoOrTokenCommand =~ s/morpho\.xml/tokens.xml/;
	} else {
	$baseOnly = 0;
	}
	$morphoOrTokenCommand =~ s/-l/-c/;
	$morphoOrTokenCommand .= ' \|';
	$plaintextAndStructureCommand = "$UNZIP -c $data_zip " . "'${sigle_pattern}/$zip_content_pattern.xml' " . "$zipsiglepattern \|";
	}

	$log->debug("command to extract annotation and/or tokenization: $morphoOrTokenCommand");
	$log->debug("command to extract plain text and structure: $plaintextAndStructureCommand");

	open (MORPHO_OR_TOKENPIPE, $morphoOrTokenCommand) or die "cannot unzip $morpho_zip";
	open (PLAINTEXTPIPE, $plaintextAndStructureCommand) or die "cannot unzip $data_zip";
	print "$COMMENT_START foundry = $foundry$COMMENT_END\n" if ($comments);
	while (<MORPHO_OR_TOKENPIPE>) {
	if (/^ inflating: (.*)/) {
	$filename=$1;
	while($processedFilenames{$filename} && !eof(MORPHO_OR_TOKENPIPE)) {
	$log->warn("$filename already processed");
	while (<MORPHO_OR_TOKENPIPE>) {
	last if(/\s+inflating:\s+(.*)/);
	}
	$filename=$1 if(!eof(MORPHO_OR_TOKENPIPE) && /\s+inflating:\s+(.*)/);
	}
	} elsif(m@(?:^\|\s)docid="([^"]+)"@) {
	last if($test && $text_no++ > 3);
	if(!$first) {
	closeDoc(0);
	}
	$processedFilenames{$filename}=1;
	$docid=$1;
	@current_lines=();
	$known=$unknown=0;
	$current="";
	if ($first) {
	$first = 0;
	}
	if(!fetch_plaintext($docid)) { # skip this text
	while (<MORPHO_OR_TOKENPIPE>) {
	last if(m@</layer>@);
	}
	}
	print STDOUT "$COMMENT_START filename = $filename$COMMENT_END\n$COMMENT_START text_id = $docid$COMMENT_END\n" if($comments);
	$log->debug("Analyzing $docid");
	} elsif (m@^\s<f\s+.name="([^"]+)">([^<]+)</f>@) {
	if ($1 eq "lemma") {
	$conll[$LEMMA_idx] = $2;
	$conll[$LEMMA_idx] =~ s/[\t\n\r]//g; # make sure that lemmas never contain tabs or newlines
	if($conll[$LEMMA_idx] eq 'UNKNOWN') {
	$conll[$LEMMA_idx] = "--";
	$unknown++;
	} else {
	$known++;
	}
	} elsif ($1 eq 'pos' \|\| $1 eq "ctag") {
	$unknown++;
	$conll[$XPOS_idx] = $conll[$UPOS_idx] = $2;
	} elsif ($1 eq 'msd') {
	$conll[$FEATS_idx] = $2;
	} elsif ($1 eq 'certainty') {
	$conll[$MISC_idx] = $2;
	}
	} elsif (/<span /) {
	my $last_from = $current_from // -1;
	($current_id) = /id="[^0-9]([^\"])"/;
	($current_from) = /from="([^\"]*)"/;
	($current_to) = /to="([^\"]*)"/;
	if($extract_attributes_regex) {
	for (my $i = $last_from + 1; $i <= $current_from; $i++) {
	if ($extras{$docid}{$i}) {
	$current .= $extras{$docid}{$i};
	undef $extras{$docid}{$i};
	}
	}
	}
	# $log->debug("found span: $current_id $current_from $current_to");
	$token = substr($plain_texts{$docid}, $current_from, $current_to - $current_from);
	if (!defined $token \|\| length($token) == 0) {
	$log->warn("$docid: could not retrieve token at $current_from-$current_to/", length($plain_texts{$docid}), " - ending with: ", substr($plain_texts{$docid},length($plain_texts{$docid})-10));
	$token = "_";
	}
	$token=~s/[\t\n\r]//g; # make sure that tokens never contain tabs or newlines
	@conll = ("_") x 10;
	$conll[$FORM_idx] = $token;
	if($baseOnly) {
	my @vals = ($current_from, $current_to);
	# $log->debug("joining : ", join(" ", @vals));
	push @current_lines, \@vals;
	$known++;
	$conll[$ID_idx] = $#current_lines+1;
	if ($columns == 1) {
	$current .= "$conll[1]$token_separator" ;
	} else {
	$current .= join("\t", @conll[0..$columns-1]) . "\n"; # conll columns
	}
	if ($sentence_ends{$docid}{$current_to}) {
	$log->debug("Using sentence end for $docid \@$current_to");
	$current .= "\n";
	printTokenRanges();
	if ($extract_metadata) {
	for (my $i = 0; $i < @extract_metadata_regex; $i++) {
	if(!defined($metadata{$docid}[$i])) {
	$log->warn("$docid: metadata matching /$extract_metadata_regex[$i]/ was not found, using empty string instead");
	$metadata{$docid}[$i]="";
	}
	print "$metadata{$docid}[$i]\t";
	}
	}
	$current =~ s/ $//;
	print STDOUT $current;
	$current = "";
	$known = 0;
	$unknown = 0;
	@current_lines = ();
	}
	}
	} elsif (m@^\s*</fs>@) {
	my @vals = ($current_from, $current_to);
	# $log->debug("joining : ", join(" ", @vals));
	push @current_lines, \@vals;
	# convert gathered information to CONLL
	$conll[$ID_idx] = $#current_lines+1;
	if ($columns == 1) {
	$current .= "$conll[1]\n";
	} else {
	$current .= join("\t", @conll[0..$columns-1]) . "\n"; # conll columns
	}
	if($sentence_ends{$docid}{$current_to} \|\| ($s_bounds_from_morpho && $conll[$XPOS_idx] eq '$.' \|\| ($conll[$XPOS_idx] eq 'SENT' && $token eq '.')) \|\| $known + $unknown >= $MAX_SENTENCE_LENGTH) {
	$log->debug("Using sentence end for $docid \@$current_to");
	$current .= "\n";
	if($known + $unknown > 0) { # only print sentence if it contains some words
	printTokenRanges();
	print STDOUT $current;
	}
	$current=""; $known=0; $unknown=0;
	@current_lines = ();
	}
	while (<MORPHO_OR_TOKENPIPE>) {
	last if (m@</span>@); # only consider first interpretation
	}
	}
	}
	$current .= "\n";
	closeDoc(1);
	close(MORPHO_OR_TOKENPIPE);
	close(PLAINTEXTPIPE);
	}
	exit;

	sub printTokenRanges {
	return if(!$offsets);
	print "$COMMENT_START start_offsets = ", $current_lines[0]->[0];
	foreach my $t (@current_lines) {
	print " $t->[0]";
	}
	print "$COMMENT_END\n$COMMENT_START end_offsets = ", $current_lines[$#current_lines]->[1] if($comments);
	foreach my $t (@current_lines) {
	print " $t->[1]";
	}
	print "$COMMENT_END\n";
	}

	sub closeDoc {
	$log->debug("closing doc");
	if($known + $unknown > 0) { # only parse a sentence if it has some words
	chomp $current;
	chomp $current;
	chomp $current;
	$current .= "\n\n";
	printTokenRanges();
	print $current;
	}
	}

	# read data.xml to figure out the tokens
	# (ideally tokens should also be in in morpho.xml, but they are not)
	sub fetch_plaintext {
	my ($target_id) = @_;
	my $docid;
	my $text_started=0;
	my $text_count = 0;
	my ($current_id, $current_from, $current_to);

	if($plain_texts{$target_id} && ($s_bounds_from_morpho \|\| $sentence_ends{$target_id}) && (!$extract_metadata \|\| $metadata{$target_id})) {
	$log->debug("Already got $target_id");
	return 1;
	}
	while(<PLAINTEXTPIPE>) {
	if ($extract_metadata) {
	if (/<textSigle>([^<]+)/) {
	$docid = $1;
	$docid =~ s@/@_@;
	$log->debug("textsigle=$docid");
	}
	for (my $i=0; $i < @extract_metadata_regex; $i++) {
	if ($_ =~ /$extract_metadata_regex[$i]/) {
	$metadata{$docid}[$i]=$1;
	}
	}
	}
	if(/<raw_text[^>]+docid="([^"]*)/) {
	$docid=$1;
	$log->debug("Getting plain text for $docid");
	$text_started=0;
	} elsif(/<raw_text\b/) {
	$text_started=0;
	} elsif(/(?:^\|\s)docid="([^"]*)/) {
	$docid=$1;
	$log->debug("Getting plain text for $docid");
	} elsif(m@<span @) {
	($current_id) = /id="[^0-9]([^\"])"/;
	($current_from) = /from="([^\"]*)"/;
	($current_to) = /to="([^\"]*)"/;
	} elsif(m@<f\s[^>]*>s</f>@) {
	if ($s_bounds_from_morpho) {
	$log->debug("Ignoring sentence end for $docid \@$current_to because of --s-bounds-from-morpho");
	} else {
	$log->debug("Found sentence end for $docid \@$current_to");
	$sentence_ends{$docid}{$current_to} = 1;
	}
	} elsif($extract_attributes_regex && m@<f\sname="name"[^>]*>([^<]+)</f>@) {
	my $current_element = $1;
	$log->debug("Looking for matching attributes in $docid");
	while(<PLAINTEXTPIPE>) {
	last if(m@</fs>@);
	if(m@<f\sname="([^"]+)"[^>]*>([^<]+)</f>@) {
	my $current_node = "$current_element/$1";
	my $value = $2;
	if ($current_node =~ /$extract_attributes_regex/) {
	$log->debug("Found matching attribute: $docid - $current_node = $value");
	$extras{$docid}{$current_from} .= "# $current_node = $value\n";
	}
	}
	}
	} elsif (m@<text>(.*)</text>@) {
	$_= $1;
	s/</</go;
	s/>/>/go;
	s/&/&/go;
	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
	$plain_texts{$docid} = $_;
	last if(!$extract_attributes_regex && ($text_count++ > 1 && $plain_texts{$target_id} && (!$extract_metadata \|\| $metadata{$target_id})));
	} elsif (m@<text>(.*)@) {
	$_= $1;
	s/</</go;
	s/>/>/go;
	s/&/&/go;
	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
	$plain_texts{$docid} = "$_ ";
	$text_started=1;
	} elsif ($text_started && m@(.*)</text>@) {
	$_= $1;
	s/</</go;
	s/>/>/go;
	s/&/&/go;
	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
	$plain_texts{$docid} .= $_;
	$text_started=0;
	last if(!$extract_attributes_regex && ($text_count++ > 1 && $plain_texts{$target_id} && (!$extract_metadata \|\| $metadata{$target_id})));
	} elsif ($text_started) {
	chomp;
	s/</</go;
	s/>/>/go;
	s/&/&/go;
	tr/…•⋅»«ˮ“”„›‹ʼ‘’‚′‐‑‒–—―⁓⁻₋−﹣－/...""""""'''''''-/;
	$plain_texts{$docid} .= $_;
	}
	}
	$log->debug("Got plain text for $docid");
	if(defined($ENV{PLAINTEXTFILTER})) {
	if ($plain_texts{$docid} !~ $ENV{PLAINTEXTFILTER}) {
	$plain_texts{$docid} = undef;
	$log->info("Skipping $docid");
	return(undef);
	} else {
	$log->debug("Using $docid");
	}
	}
	return(1);
	}

	=pod

	=encoding utf8

	=head1 NAME

	korapxml2conllu - Conversion of KorAP-XML zips to CoNLL-U

	=head1 SYNOPSIS

	korapxml2conllu zca15.tree_tagger.zip > zca15.conllu

	=head1 DESCRIPTION

	C<korapxml2conllu> is a script to Convert L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml> base or morpho zips to CoNLL(-U) format with all information necessary
	for reconstruction in comment lines.

	=head1 INSTALLATION

	$ cpanm https://github.com/KorAP/KorAP-XML-CoNLL-U.git

	=head1 OPTIONS

	=over 2

	=item B<--sigle-pattern\|-p>

	Convert only texts from the KorAP XML zip files with folder names (i.e. sigles) matching the glob pattern.

	=item B<--extract-attribute-pattern\|-e>

	Extract element/attribute regular expressions to comments.

	=item B<--columns>=I<int> \| B<-c> I<int>

	Print n columns (default: 10). If n=1, only the token itself is printed.

	=item B<--s-bounds-from-morpho>

	Get sentence boundary information from tagger output rather than from s annotation in structure.xml files.

	=item B<--token-separator>=I<string>

	Token separator.

	=item B<--(no)comments>

	Switch comment printing on or off (default: on).

	=item B<--(no)offsets>

	Switch offsets printing on or off (default: on).

	=item B<--word2vec>

	Print output in word2vec (tokenized) one sentence per line format.

	=item B<--extract-metadata-regex\|-m>

	Can be used to extract and print strings from the text headers. Currently only works together with --word2vec option.
	For example:

	korapxml2conllu -m '<textSigle>([^<.]+)' -m '<creatDate>([^<]{7})' --word2vec t/data/wdf19.zip

	Will print the document sigle, year and month of the creation date and one sentence per line, separated by tabs.

	=item B<--help\|-h>

	Print help information.

	=item B<--version\|-v>

	Print version information.


	=item B<--log\|-l>

	Loglevel for I<Log::Any>. Defaults to C<warn>.

	=back

	=head1 EXAMPLES

	=head2 Extract to CoNNL-U with posting and div ids:

	korapxml2conllu -e '(posting/id\|div/id)' t/data/wdf19.zip

	=head2 Extract to word2vec input format metadata columns:

	korapxml2conllu --word2vec t/data/wdf19.zip

	=head1 COPYRIGHT AND LICENSE

	Copyright (C) 2021-2022, L<IDS Mannheim\|https://www.ids-mannheim.de/>

	Author: Marc Kupietz

	Contributors: Nils Diewald

	L<KorAP::XML::CoNNL-U> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
	Corpus Analysis Platform at the
	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
	member of the
	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.

	This program is free software published under the
	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.