#!/usr/bin/env perl
use strict;
use warnings;
use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);
use Encode qw(decode);
use FindBin;
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
our $VERSION = '2.6.0';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
use constant {
# Set to 1 for minimal additional debug output (no need to be parameterized)
DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
};
if ($ENV{KORAPXMLTEI_INLINE}) {
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
};
# Inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;
# Inline dependencies won't be stored in the tokens file
my $inline_deps_exclusive = 0;
# Parse options from the command line
GetOptions(
'root|r=s' => \(my $root_dir = '.'),
'input|i=s' => \(my $input_fname = ''),
'output|o=s' => \(my $output_fname = ''),
'tokenizer-call|tc=s' => \(my $tokenizer_call),
'tokenizer-korap|tk' => \(my $tokenizer_korap),
'tokenizer-internal|ti' => \(my $tokenizer_intern),
'no-tokenizer' => \(my $no_tokenizer),
'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
'inline-dependencies=s' => \(my $inline_dependencies),
'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
'skip-inline-token-annotations!' => \(
my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
'base-foundry=s' => \(my $base_dir = 'base'),
'data-file=s' => \(my $data_file = 'data'),
'header-file=s' => \(my $header_file = 'header'),
'tokens-file=s' => \(my $tokens_file = 'tokens'),
'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
'log|l=s' => \(my $log_level = 'notice'),
'required-version|rv=s' => \(my $required_version),
'' => \(my $stdio),
'help|h' => sub {
pod2usage(
-verbose => 99,
-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
-msg => $VERSION_MSG,
-output => '-'
)
},
'version|v' => sub {
pod2usage(
-verbose => 0,
-msg => $VERSION_MSG,
-output => '-'
);
}
);
# Establish logger
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;
if ($required_version) {
$required_version =~ /^\s*(\d+\.\d+\.\d+(-TRIAL)?)\s*$/;
if (!$1 || $1 ne $VERSION) {
$log->error("Required version $required_version mismatches version $VERSION");
exit(1);
};
};
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
($what, $with) = split('@', $xmlid_to_textsigle);
$what = qr!$what!;
};
# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';
# optional
# Remember to skip certain inline tags
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
foreach (split /\s*,\s*/, $skip_inline_tags_str) {
$skip_inline_tags{$_} = 1;
};
};
# External tokenization
my $ext_tok;
if ($tokenizer_call) {
$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}
# KorAP tokenization
elsif ($tokenizer_korap) {
unless (eval {
require KorAP::XML::TEI::Tokenizer::KorAP;
1;
}) {
$log->error("KorAP::XML::TEI::Tokenizer::KorAP cannot be loaded: $@");
exit(1);
};
my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;
if ($korap_tok_ver ne $VERSION) {
$log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
exit(1);
};
$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}
# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
$log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
exit(1);
};
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
$skip_inline_tags{s} = 1;
};
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
# Name of the directory and the file containing all inline structure information
# (except for inline token information)
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
# Name of the directory and the file containing all inline token information,
# i.e. tokens annotated in the input document (unless skipped)
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
if (index($_tokens_dir, '!') == 0) {
$_tokens_dir = substr($_tokens_dir, 1);
$inline_tokens_exclusive = 1;
};
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';
$inline_dependencies = 1;
if ($_dep_dir && index($_dep_dir, '!') == 0) {
$_dep_dir = substr($_dep_dir, 1);
$inline_deps_exclusive = 1;
};
};
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
# text directory (below $root_dir)
my $dir = '';
# Escaped version of text id
my $text_id_esc;
# Default encoding of the text
my $input_enc = 'UTF-8';
# text line (needed for whitespace handling)
my $text_line = 0;
# Input file handle (default: stdin)
my $input_fh;
# Single dash was set
if ($stdio) {
$input_fh = *STDIN;
}
# Input flag was passed
elsif ($input_fname ne '') {
unless (open($input_fh, '<', $input_fname)) {
die $log->fatal("File '$input_fname' could not be opened.");
};
}
# No input to process
else {
pod2usage(
-verbose => 99,
-sections => 'NAME|SYNOPSIS',
-msg => $VERSION_MSG,
-output => '-'
);
exit;
};
# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
$skip_inline_tokens,
\%skip_inline_tags,
$inline_tokens_exclusive,
$inline_dependencies
);
# Reading input document
MAIN: while (<$input_fh>) {
# remove HTML (multi-line) comments (<!--...-->)
$_ = remove_xml_comments($input_fh, $_);
# Set input encoding
if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
$input_enc = $2;
next;
};
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
# Start of text body
if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
my $suffix = $2;
if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with opening text-body tag '${_TEXT_BODY}' " .
"contains additional information ... => Aborting (line=$_)");
};
# Text body data extracted from input document ($input_fh),
# further processed by XML::LibXML::Reader
my $text_buffer = '';
# Iterate over all lines in the text body
while (<$input_fh>) {
$_ = remove_xml_comments($input_fh, $_);
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
# End of text body
if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {
# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with closing text-body tag '${_TEXT_BODY}'".
" contains additional information ... => Aborting (line=$_)");
};
if ($dir eq '') {
$log->warn(
"Maybe empty textSigle => skipping this text ...\n" .
'data=' . substr($inline->data->data, 0, 200)
);
next MAIN;
};
# Parse inline structure
$inline->parse($text_id_esc, \$text_buffer);
if (DEBUG) {
$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
};
my $data = $inline->data;
# Write data.xml
$data->to_zip(
$zipper->new_stream("$dir/${data_file}.xml"),
$text_id_esc
);
# Tokenize with external tokenizer
if ($ext_tok) {
# Tokenize and output
$ext_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
$text_id_esc
);
if ($use_tokenizer_sentence_splits) {
$ext_tok->sentencize_from_previous_input($inline->structures);
};
};
# Tokenize with internal tokenizer
if ($tokenizer_intern) {
# Tokenize and output
$cons_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
$text_id_esc
)->reset;
$aggr_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
$text_id_esc
)->reset;
};
# ~ write structures ~
unless ($inline->structures->empty) {
$inline->structures->to_zip(
$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
$text_id_esc,
2 # = structure serialization
);
};
# ~ write tokens ~
unless ($skip_inline_tokens || $inline->tokens->empty) {
$inline->tokens->to_zip(
$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
$text_id_esc,
# Serialization flag:
# 0 = tokens without annotations,
# 1 = tokens with annotations,
# 4 = tokens with annotations, but without inline dependencies
($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
);
};
# ~ write dependencies ~
unless ($inline->dependencies->empty) {
$inline->dependencies->to_zip(
$zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
$text_id_esc,
3 # = dependency serialization
);
};
# reinit.
$dir = '';
next MAIN;
};
# ~ whitespace handling ~
# Fix whitespaces (see notes on whitespace fixing)
# TODO:
# Maybe it's best, to keep the stripping of whitespace and
# to just remove the if-clause and to insert a blank by default
# (with possibly an option on how newlines in primary text should
# be handled (stripped or replaced by a whitespace)).
# Remove consecutive whitespace at beginning and end (mostly one newline)
s/^\s+//; s/\s+$//;
# NOTE:
# this is only relevant, if a text consists of more than one line
# TODO:
# find a better solution, or create a warning, if a text has more
# than one line ($text_line > 1)
# TODO:
# do testing with 2 different corpora
# (one with only one-line texts, the other with several lines per text)
# line contains at least one non-tag character
if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {
# Increment counter for text lines
$text_line++;
# insert blank before 1st character
# (for 2nd line and consecutive lines)
$_ = ' ' . $_ if $text_line > 1;
}
# add line to buffer
$text_buffer .= $_;
};
}
elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
my $leadin = $1;
my $id = $3;
my $sigle = $3;
if ($what) {
$_ = $id;
eval "s|$what|$with|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
$sigle = $_;
$log->debug("Converted text id `$id' to sigle `$sigle'");
};
$sigle =~ s/\./-/g;
my @parts = split(/[\/_]/, $sigle);
if (@parts != 3) {
die $log->fatal(
"input line number $.: " .
"ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
"=> Aborting (line=$_)");
};
$dir = join("/", @parts);
$text_id_esc = "$parts[0]/$parts[1].$parts[2]";
$log->notice("$0: text_id=$text_id_esc");
if ($leadin !~ /^\s*$/) {
die $log->fatal(
"input line number $.: " .
'line with opening TEI tag is not in expected format ... ' .
"=> Aborting (line=$_)");
};
}
# Start of header section
elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
my $content = "$2\n";
if ($1 !~ /^\s*$/) {
die $log->fatal(
"input line number $.: " .
'line with opening header tag is not in expected format ... ' .
"=> Aborting (line=$_)");
};
# Parse header
my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);
# Header was parseable
if ($header) {
# Write header to zip
my $file = $header->dir . '/' . $header_file . '.xml';
$log->debug("Writing file $file") if DEBUG;
$header->to_zip($zipper->new_stream($file));
# Header is for text level
if ($header->type eq 'text') {
# Remember dir and sigles
$dir = $header->dir;
$text_id_esc = $header->id_esc;
# log output for seeing progression
$log->notice("$0: text_id=$text_id_esc");
# Reset counter for text lines
# (needed for whitespace handling)
$text_line = 0;
};
};
};
};
$zipper->close;
$ext_tok->close if $ext_tok;
close $input_fh;
__END__
=pod
=encoding utf8
=head1 NAME
tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
=head1 SYNOPSIS
cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
=head1 DESCRIPTION
C<tei2korapxml> is a script to convert TEI P5 and
L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
based documents to the
L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
This program is usually called from inside another script.
=head1 FORMATS
=head2 Input restrictions
=over 2
=item
TEI P5 formatted input with certain restrictions:
=over 4
=item
B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
=item
B<optional>: corp-header with integrated corpsigle,
doc-header with integrated docsigle
=back
=item
Tokens inside the primary text must not be separated by
newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>), and converting newlines
into blanks between two tokens could introduce additional blanks
where there should be none (e.g. punctuation characters like C<,> or
C<.> should not be separated from their preceding token)
(see also the code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
=item
Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>,
need to be defined on the same line as the header tag
(see the input sketch below).
=back
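For orientation, the following is a minimal sketch of the expected nesting in an
I5-style document; apart from the placement of the text sigle inside the text
header and the C<type> attribute on the opening header line, the shown header
content is only illustrative and depends on the corpus at hand:
  <idsText>
    <idsHeader type="text">
      <fileDesc>
        <titleStmt>
          <textSigle>COR/DOC.00001</textSigle>
        </titleStmt>
      </fileDesc>
    </idsHeader>
    <text>
      ... primary text ...
    </text>
  </idsText>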
=head2 Notes on the output
=over 2
=item
zip file output (default on C<stdout>) with utf8 encoded entries
(which together form the KorAP-XML format)
=back
=head1 INSTALLATION
C<tei2korapxml> requires the C<libxml2> development files (e.g. the C<libxml2-dev> package) and L<File::ShareDir::Install> to be installed.
When these requirements are met, the preferred way to install the script is
to use L<cpanm|App::cpanminus>.
$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
In case everything went well, the C<tei2korapxml> tool will
be available on your command line immediately.
Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
=head1 OPTIONS
=over 2
=item B<--input|-i>
The input file to process. If no specific input is defined and a single
dash C<-> is passed as an argument, data is read from C<STDIN>.
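A possible invocation reading from a file and writing the zip to a file
(file names are placeholders):
  tei2korapxml -tk -i corpus.i5.xml -o corpus.korapxml.zip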
=item B<--output|-o>
The output zip file to be created. If no specific output is defined,
data is written to C<STDOUT>.
=item B<--root|-r>
The root directory for output. Defaults to C<.>.
=item B<--help|-h>
Print help information.
=item B<--version|-v>
Print version information.
=item B<--tokenizer-korap|-tk>
Use the standard KorAP/DeReKo tokenizer.
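For example:
  cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip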
=item B<--tokenizer-internal|-ti>
Tokenize the data using two embedded tokenizers,
which take an I<aggressive> and a I<conservative>
approach, respectively.
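For example:
  cat corpus.i5.xml | tei2korapxml -ti - > corpus.korapxml.zip
Both resulting token layers are stored under the base foundry
(see L</--base-foundry>).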
=item B<--tokenizer-call|-tc>
Call an external tokenizer process that tokenizes
from STDIN and outputs the offsets of all tokens.
Texts are separated by C<\x04\n>. The external process
should output a newline after each text.
If the L</--use-tokenizer-sentence-splits> option is activated,
sentence boundaries are output as offsets on additional lines as well.
To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
  $ cat corpus.i5.xml | tei2korapxml -s \
      -tc 'datok tokenize \
        -t ./tokenizer.matok \
        -p --newline-after-eot --no-sentences \
        --no-tokens --sentence-positions -' - \
      > corpus.korapxml.zip
=item B<--no-tokenizer>
Boolean flag indicating that no tokenizer should be used.
This is meant to ensure that by default a final token layer always
exists.
If a separate tokenizer is chosen, this flag is ignored.
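For example, when the token layer should come exclusively from inline annotations:
  cat corpus.i5.xml | tei2korapxml --no-tokenizer - > corpus.korapxml.zip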
=item B<--skip-inline-tokens>
Boolean flag indicating that inline tokens should not
be processed. Defaults to false (meaning inline tokens will be processed).
=item B<--skip-inline-token-annotations>
Boolean flag indicating that inline token annotations should not
be processed. Defaults to true (meaning inline token annotations
won't be processed). Can be negated with
C<--no-skip-inline-token-annotations>.
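For example, to keep inline token annotations in the output:
  cat corpus.i5.xml | tei2korapxml -tk \
    --no-skip-inline-token-annotations - > corpus.korapxml.zip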
=item B<--skip-inline-tags> <tags>
Expects a comma-separated list of tags to be ignored when the structure
is parsed. The content of these tags, however, will still be processed.
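Example (the tag names are merely illustrative):
  cat corpus.i5.xml | tei2korapxml -tk \
    --skip-inline-tags 'seg,milestone' - > corpus.korapxml.zip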
=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
Expects a regular expression replacement rule (search pattern and
replacement separated by B<@>) to convert text id attributes to text sigles
with three parts (separated by B</>).
Example:
tei2korapxml \
--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
-tk - < t/data/icc_german_sample.p5.xml
Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
=item B<--inline-tokens> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline token information in.
Unless C<--skip-inline-token-annotations> is set,
this will contain annotations as well.
Defaults to C<tokens> and C<morpho>.
The inline token data will also be stored in the
inline structures file (see I<--inline-structures>),
unless the inline token foundry is prefixed with an
exclamation mark B<!>, indicating that inline
tokens are stored exclusively in the inline tokens
file.
Example:
tei2korapxml --no-tokenizer --inline-tokens \
'!gingko#morpho' < data.i5.xml > korapxml.zip
=item B<--inline-dependencies> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline dependency information in.
The file defaults to C<dependency>. If this option
is not set, it is ignored and dependency
attributes are stored in the inline tokens file
(unless skipped).
The dependency data will also be stored in the
inline tokens file (see I<--inline-tokens>),
unless the inline dependencies foundry is prefixed with an
exclamation mark B<!>, indicating that inline
dependency data is stored exclusively in the inline
dependencies file.
Example:
tei2korapxml --no-tokenizer --inline-dependencies \
'gingko#dependency' < data.i5.xml > korapxml.zip
=item B<--inline-structures> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline structure information in.
Defaults to C<struct> and C<structure>.
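Example (with an illustrative foundry name):
  tei2korapxml --no-tokenizer --inline-structures \
    'myfoundry#structure' < data.i5.xml > korapxml.zip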
=item B<--base-foundry> <foundry>
Define the base foundry to store newly generated
token information in.
Defaults to C<base>.
=item B<--data-file> <file>
Define the file (without extension)
to store primary data information in.
Defaults to C<data>.
=item B<--header-file> <file>
Define the file name (without extension)
to store header information on
the corpus, document, and text level in.
Defaults to C<header>.
=item B<--use-tokenizer-sentence-splits|-s>
Replace existing sentence boundary information with
boundaries provided by the tokenizer, or add them if none exist.
Currently the KorAP tokenizer and certain external tokenizers support
these boundaries.
=item B<--tokens-file> <file>
Define the file (without extension)
to store generated token information in
(either from the KorAP tokenizer or an externally called tokenizer).
Defaults to C<tokens>.
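For example, to store externally generated tokens under a custom foundry and
file name (both names are placeholders):
  cat corpus.i5.xml | tei2korapxml -tk --base-foundry mybase \
    --tokens-file mytokens - > corpus.korapxml.zip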
=item B<--log|-l>
Loglevel for I<Log::Any>. Defaults to C<notice>.
=back
=head1 ENVIRONMENT VARIABLES
=over 2
=item B<KORAPXMLTEI_DEBUG>
Activate minimal debugging.
Defaults to C<false>.
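For example (combined with C<-l debug>, so that the additional messages are
actually emitted):
  KORAPXMLTEI_DEBUG=1 tei2korapxml -tk -l debug - \
    < corpus.i5.xml > corpus.korapxml.zip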
=back
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2021-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: Peter Harders
Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
=cut
# NOTES
## Notes on segfault prevention
Using binmode on the input handle prevents 'XML::LibXML::Reader' from segfaulting inside the main loop
(see the notes on 'PerlIO layers' in 'man XML::LibXML').
Removing 'use open qw(:std :utf8)' would also fix this problem, but using binmode on the input handle is more granular.
See perluniintro: "You can switch encodings on an already opened stream by using binmode()."
See perlfunc: "If LAYER is omitted or specified as ':raw' the filehandle is made suitable for passing binary data."
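A minimal sketch of the approach, with a placeholder file name:
  open(my $fh, '<', 'corpus.i5.xml') or die $!;
  binmode($fh);  # drop all PerlIO layers, i.e. treat the stream as raw bytes
  # lines read from $fh are decoded explicitly later (cf. Encode::decode in the main loop)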