script/tei2korapxml - KorAP/KorAP-XML-TEI - Gitiles

 #!/usr/bin/env perl
 use strict;
 use warnings;

 use Log::Any '$log';
 use Log::Any::Adapter;
 use Pod::Usage;
 use Getopt::Long qw(GetOptions :config no_auto_abbrev);

 use File::Basename qw(dirname);

 use Encode qw(decode);

 use FindBin;
 BEGIN {
   unshift @INC, "$FindBin::Bin/../lib";
 };

 use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;
 use KorAP::XML::TEI::Inline;

 our $VERSION = '2.4.3';

 our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

 # Compile-time debug switch, taken from the KORAPXMLTEI_DEBUG environment
 # variable (0 if the variable is not defined).
 use constant {
   # Set to 1 for minimal more debug output (no need to be parametrized)
   DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
 };

 # KORAPXMLTEI_INLINE is still evaluated (it changes the default value of
 # --skip-inline-token-annotations in the option parsing of this script),
 # but setting it triggers a deprecation warning.
 if ($ENV{KORAPXMLTEI_INLINE}) {
   warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
 };

 # Inline tokens won't be stored in the structure file
 # (switched on further down when the --inline-tokens foundry starts with '!')
 my $inline_tokens_exclusive = 0;

 # Parse options from the command line.
 # Every option variable is declared right here, together with its default.
 # NOTE(review): the return value of GetOptions is not checked, so the
 #   script keeps running after Getopt::Long reported an invalid option.
 GetOptions(
   'root|r=s'              => \(my $root_dir    = '.'),
   'input|i=s'             => \(my $input_fname = ''),
   'tokenizer-call|tc=s'   => \(my $tokenizer_call),
   'tokenizer-korap|tk'    => \(my $tokenizer_korap),
   'tokenizer-internal|ti' => \(my $tokenizer_intern),
   'no-tokenizer'          => \(my $no_tokenizer),
   'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
   'inline-tokens=s'       => \(my $inline_tokens = 'tokens#morpho'),
   'inline-structures=s'   => \(my $inline_structures = 'struct#structure'),
   'skip-inline-tokens'    => \(my $skip_inline_tokens = 0),

   # Negatable: the default is 1 (skip), so without the '!' there is no way
   # to request inline token annotations from the command line, although the
   # deprecation warning for KORAPXMLTEI_INLINE points to this option.
   # --no-skip-inline-token-annotations now switches the annotations on.
   'skip-inline-token-annotations!' => \(
     my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
   'skip-inline-tags=s'    => \(my $skip_inline_tags_str = ''),
   'base-foundry=s'        => \(my $base_dir    = 'base'),
   'data-file=s'           => \(my $data_file   = 'data'),
   'header-file=s'         => \(my $header_file = 'header'),
   'tokens-file=s'         => \(my $tokens_file = 'tokens'),
   'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
   'log|l=s'               => \(my $log_level   = 'notice'),
   'required-version|rv=s' => \(my $required_version),

   # A lone dash ("-") on the command line: read the input from STDIN
   ''                      => \(my $stdio),

   # Print the main POD sections together with the version
   'help|h' => sub {
     pod2usage(
       -verbose => 99,
       -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
       -msg => $VERSION_MSG,
       -output => '-'
     )
   },

   # Print the version message and the synopsis
   'version|v' => sub {
     pod2usage(
       -verbose => 0,
       -msg => $VERSION_MSG,
       -output => '-'
     );
   }
 );


 # Establish logger (log messages are written UTF-8 encoded to STDERR)
 binmode(STDERR, ':encoding(UTF-8)');
 Log::Any::Adapter->set('Stderr', log_level => $log_level);
 $log->notice('Debugging is activated') if DEBUG;


 # Abort if the caller requires a version other than the running one.
 # The value has to be of the form "x.y.z" (surrounding whitespace is ignored).
 if ($required_version) {

   # Take the capture from the match result itself: $1 is only meaningful
   # after a successful match and would otherwise keep a stale value.
   my ($wanted_version) = $required_version =~ /^\s*(\d+\.\d+\.\d+)\s*$/;

   if (!defined $wanted_version || $wanted_version ne $VERSION) {
     $log->error("Required version $required_version mismatches version $VERSION");
     exit(1);
   };
 };


 # Search pattern ($what) and replacement ($with) of --xmlid-to-textsigle,
 # given as "<from-regex>@<to>". Both stay undefined if the option is unset.
 my ($what, $with);
 if ($xmlid_to_textsigle ne '') {
   ($what, $with) = split('@', $xmlid_to_textsigle);

   # A missing '@' (or an empty side) leaves the respective part undefined;
   # fall back to empty strings to avoid "uninitialized value" warnings
   # when the parts are interpolated.
   $what //= '';
   $with //= '';

   # Precompile the search pattern
   $what = qr!$what!;
 };

 # tag (without attributes), which contains the primary text
 my $_TEXT_BODY = 'text';
 # optional

 # Set of inline tag names that should be skipped, built from the
 # comma-separated --skip-inline-tags value (whitespace around the
 # commas is ignored). The set stays empty if the value is false.
 my %skip_inline_tags = map { $_ => 1 }
   split(/\s*,\s*/, $skip_inline_tags_str || '');

 # Tokenizer producing the generated token layer
 # (stays undefined if neither --tokenizer-call nor --tokenizer-korap is set)
 my $ext_tok;

 # External tokenization
 if ($tokenizer_call) {
   $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
   $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
 }

 # KorAP tokenization
 elsif ($tokenizer_korap) {

   # The module is only loaded on demand. A failing require is reported as
   # such, instead of surfacing as a version mismatch with an undefined
   # version (and an "uninitialized value" warning).
   unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
     my $load_error = $@ || 'unknown error';
     $log->error("Unable to load KorAP::XML::TEI::Tokenizer::KorAP: $load_error");
     exit(1);
   };

   # The tokenizer module has to carry the same version as this script
   my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // '';
   if ($korap_tok_ver ne $VERSION) {
     $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
     exit(1);
   };

   $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
 }

 # Neither the internal tokenizers nor --no-tokenizer were requested
 elsif (!$tokenizer_intern && !$no_tokenizer) {
   $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
   exit(1);
 };

 # With tokenizer sentence splits, the inline 's' tag is added
 # to the tags to skip
 if ($use_tokenizer_sentence_splits) {
   $skip_inline_tags{s} = 1;
 };

 # Internal tokenization
 my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
 my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;


 # Name of the directory and the file containing all inline structure informations
 # except for $_TOKENS_TAG information.
 # The option value has the form "<foundry>#[<file>]"; a missing or empty
 # file part falls back to the default name (previously "foundry#"
 # resulted in an empty file name).
 my ($_structure_dir, $_structure_file) = split '#', $inline_structures;
 $_structure_dir  //= '';
 $_structure_file = 'structure' unless length($_structure_file // '');

 # Name of the directory and the file containing all inline token informations
 # i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
 my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens;
 $_tokens_dir  //= '';
 $_tokens_file = 'morpho' unless length($_tokens_file // '');

 # A leading '!' marks the token foundry as exclusive:
 # strip the marker and remember the setting
 if (index($_tokens_dir, '!') == 0) {
   $_tokens_dir = substr($_tokens_dir, 1);
   $inline_tokens_exclusive = 1;
 };

 # Initialize zipper
 my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);

 # text directory (below $root_dir)
 my $dir = '';

 # Escaped version of text id
 my $text_id_esc;

 # Default encoding of the text
 my $input_enc = 'UTF-8';

 # text line (needed for whitespace handling)
 my $text_line = 0;


 # Handle the input document is read from
 my $input_fh;

 # Neither a single dash nor an input file was passed:
 # there is nothing to process, so print the usage and stop
 if (!$stdio && $input_fname eq '') {
   pod2usage(
     -verbose => 99,
     -sections => 'NAME|SYNOPSIS',
     -msg => $VERSION_MSG,
     -output => '-'
   );
   exit;
 };

 # A single dash takes precedence over the input flag
 if ($stdio) {
   $input_fh = *STDIN;
 }

 # Otherwise open the file passed by the input flag
 else {
   open($input_fh, '<', $input_fname)
     or die $log->fatal("File '$input_fname' could not be opened.");
 };

 # Prevents segfaulting (see notes on segfault prevention)
 binmode $input_fh;


 # Create inline parser object
 my $inline = KorAP::XML::TEI::Inline->new(
   $skip_inline_tokens,
   \%skip_inline_tags,
   $inline_tokens_exclusive
 );


 # Reading input document
 MAIN: while (<$input_fh>) {

   # remove HTML (multi-line) comments (<!--...-->)
   $_ = remove_xml_comments($input_fh, $_);

   # Set input encoding from the XML declaration and skip that line.
   # The value is everything up to the matching closing quote.
   # (Inside a bracketed character class "\1" is not a backreference,
   # so the former "[^\1]+?" did not express "anything but the quote".)
   if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
     $input_enc = $2;
     next;
   };

   # Decode the raw line with the current input encoding
   # and let replace_entities process it
   $_ = decode($input_enc, $_);
   $_ = replace_entities($_);
   # Start of text body
   if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
     my $suffix = $2;

     if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
       die $log->fatal("input line number $.: " .
                         "line with opening text-body tag '${_TEXT_BODY}' " .
                         "contains additional information ... => Aborting (line=$_)");
     };

     # Text body data extracted from input document ($input_fh),
     # further processed by XML::LibXML::Reader
     my $text_buffer = '';

     # Iterate over all lines in the text body
     while (<$input_fh>) {

       $_ = remove_xml_comments($input_fh, $_);
       $_ = decode($input_enc, $_);
       $_ = replace_entities($_);

       # End of text body:
       # the collected buffer is parsed and all files of this text
       # are written to the zip stream, then the outer loop continues
       if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

         # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

         # The closing tag has to be the only non-whitespace content of the line
         if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
           die $log->fatal("input line number $.: " .
                             "line with closing text-body tag '${_TEXT_BODY}'".
                             " contains additional information ... => Aborting (line=$_)");
         };

         # No text directory is known (it is set by the TEI id or by a
         # text level header and reset after each text): skip this text.
         # NOTE(review): $inline->parse was not yet called for the current
         #   buffer at this point, so the logged data presumably does not
         #   belong to the skipped text - confirm against KorAP::XML::TEI::Inline.
         if ($dir eq '') {
           $log->warn(
             "Maybe empty textSigle => skipping this text ...\n" .
               'data=' . substr($inline->data->data, 0, 200)
             );
           next MAIN;
         };

         # Parse inline structure
         $inline->parse($text_id_esc, \$text_buffer);

         if (DEBUG) {
           $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
         };

         # Primary data object of the inline parser
         my $data = $inline->data;

         # Write data.xml
         $data->to_zip(
           $zipper->new_stream("$dir/${data_file}.xml"),
           $text_id_esc
         );

         # Tokenize with external tokenizer
         # (either the external call or the KorAP tokenizer)
         if ($ext_tok) {

           # Tokenize and output
           $ext_tok->tokenize($data->data)->to_zip(
             $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
             $text_id_esc
           );

           # Hand the inline structures to the tokenizer for sentence
           # boundaries; this happens before the structures are written below
           if ($use_tokenizer_sentence_splits) {
             $ext_tok->sentencize_from_previous_input($inline->structures);
           };
         };

         # Tokenize with internal tokenizer
         # (both the conservative and the aggressive one, each written
         # to a file named after the tokenizer)
         if ($tokenizer_intern) {

           # Tokenize and output
           $cons_tok->tokenize($data->data)->to_zip(
             $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
             $text_id_esc
           )->reset;

           $aggr_tok->tokenize($data->data)->to_zip(
             $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
             $text_id_esc
           )->reset;
         };

         # ~ write structures ~
         if (!$inline->structures->empty) {
           $inline->structures->to_zip(
             $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
             $text_id_esc,
             2 # = structure serialization
           );
         };

         # ~ write tokens ~
         # (only if inline tokens are not skipped and at least one exists)
         unless ($skip_inline_tokens || $inline->tokens->empty) {
           $inline->tokens->to_zip(
             $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
             $text_id_esc,
             # Either 0 = tokens without inline or 1 = tokens with inline
             !$skip_inline_token_annotations
           );
         };

         # reinit.
         # (the next text has to provide its own directory)
         $dir = '';

         next MAIN;
       };


       # ~ whitespace handling ~

       # Fix whitespaces (see notes on whitespace fixing)

       # TODO:
       #   Maybe it's best, to keep the stripping of whitespace and
       #   to just remove the if-clause and to insert a blank by default
       #   (with possibly an option on how newlines in primary text should
       #   be handled (stripped or replaced by a whitespace)).

       # Remove consecutive whitespace at beginning and end (mostly one newline)
       s/^\s+//; s/\s+$//;

       # NOTE:
       #   this is only relevant, if a text consists of more than one line

       # TODO:
       #   find a better solution, or create a warning, if a text has more
       #   than one line ($text_line > 1)

       # TODO:
       #   do testing with 2 different corpora
       #   (one with only one-line texts, the other with several lines per text)

       # line contains at least one tag with at least one character contents
       if (m/<[^>]+>[^<]/) {

         # Increment counter for text lines
         $text_line++;

         # insert blank before 1st character
         # (for 2nd line and consecutive lines)
         $_ = ' ' . $_ if $text_line > 1;
       }

       # add line to buffer
       $text_buffer .= $_;
     };
   }

   # Opening TEI tag with the text id in its xml:id attribute:
   # derive the text directory and the escaped text id from it
   elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
     my $leadin = $1;
     my $id = $3;
     my $sigle = $id;

     # Convert the id with the --xmlid-to-textsigle expression, if given
     if ($what) {

       # The replacement may refer to capture groups, so the substitution
       # is compiled at runtime,
       # e.g. s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
       # NOTE: search pattern and replacement come verbatim from the
       #   command line and are evaluated as Perl code here.
       # The substitution is applied to $sigle instead of $_, so $_ still
       # holds the input line for the error messages below.
       my $replacement = $with // '';
       unless (eval "\$sigle =~ s|$what|$replacement|; 1") {
         my $eval_error = $@ || 'unknown error';
         $log->warn("Unable to apply --xmlid-to-textsigle to `$id': $eval_error");
       };
       $log->debug("Converted text id `$id' to sigle `$sigle'");
     };

     # Dots are not kept in the parts of the sigle
     $sigle =~ s/\./-/g;

     # The sigle needs a corpus, a document and a text part
     my @parts = split(/[\/_]/, $sigle);
     if (@parts != 3) {
       die $log->fatal(
           "input line number $.: " .
               "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
               "=> Aborting (line=$_)");
     };

     $dir = join("/", @parts);
     $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
     $log->notice("$0: text_id=$text_id_esc");

     # Nothing but whitespace is allowed in front of the tag
     if ($leadin !~ /^\s*$/) {
       die $log->fatal(
           "input line number $.: " .
               'line with opening header tag is not in expected format ... ' .
               "=> Aborting (line=$_)");
     };
   }

   # An idsHeader or teiHeader starts on this line
   elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
     my ($before_tag, $header_start) = ($1, "$2\n");

     # Only whitespace may precede the header tag
     die $log->fatal(
       "input line number $.: " .
         'line with opening header tag is not in expected format ... ' .
         "=> Aborting (line=$_)") if $before_tag !~ /^\s*$/;

     # The header object is created from the first header line;
     # parse() receives the input handle for the rest
     my $header = KorAP::XML::TEI::Header->new(
       $header_start, $input_enc, $text_id_esc
     )->parse($input_fh);

     # Nothing is written for a header that could not be parsed
     if ($header) {

       # Store the header below the directory it belongs to
       my $file = $header->dir . "/$header_file.xml";
       $log->debug("Writing file $file") if DEBUG;
       $header->to_zip($zipper->new_stream($file));

       # A text level header starts a new text
       if ($header->type eq 'text') {

         # Keep directory and escaped sigle for the following text body
         $dir = $header->dir;
         $text_id_esc = $header->id_esc;

         # Progress information
         $log->notice("$0: text_id=$text_id_esc");

         # The whitespace handling counts the lines per text
         $text_line = 0;
       };
     };
   };
 };

 $zipper->close;

 $ext_tok->close if $ext_tok;

 close $input_fh;


 __END__

 =pod

 =encoding utf8

 =head1 NAME

 tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML

 =head1 SYNOPSIS

   cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip

 =head1 DESCRIPTION

 C<tei2korapxml> is a script to convert TEI P5 and
 L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
 based documents to the
 L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.

 This program is usually called from inside another script.

 =head1 FORMATS

 =head2 Input restrictions

 =over 2

 =item

 TEI P5 formatted input with certain restrictions:

 =over 4

 =item

 B<mandatory>: text-header with integrated textsigle
 (or convertible identifier), text-body

 =item

 B<optional>: corp-header with integrated corpsigle,
 doc-header with integrated docsigle

 =back

 =item

 Tokens inside the primary text must not be
 separated by newlines, because newlines are removed
 (see L<KorAP::XML::TEI::Data>) and a conversion of newlines
 into blanks between 2 tokens could lead to additional blanks,
 where there should be none (e.g.: punctuation characters like C<,> or
 C<.> should not be separated from their predecessor token).
 (see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).

 =item

 Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
 need to be defined in the same line as the header tag.

 =back

 =head2 Notes on the output

 =over 2

 =item

 zip file output (default on C<stdout>) with utf8 encoded entries
 (which together form the KorAP-XML format)

 =back

 =head1 INSTALLATION

 C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
 When these requirements are met, the preferred way to install the script is
 to use L<cpanm|App::cpanminus>.

   $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git

 In case everything went well, the C<tei2korapxml> tool will
 be available on your command line immediately.

 Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.

 =head1 OPTIONS

 =over 2

 =item B<--input|-i>

 The input file to process. If a single dash C<-> is passed as an argument,
 data is read from C<STDIN> instead (the dash takes precedence over C<--input>).
 If neither is given, the usage information is printed.


 =item B<--root|-r>

 The root directory for output. Defaults to C<.>.

 =item B<--help|-h>

 Print help information.

 =item B<--version|-v>

 Print version information.

 =item B<--tokenizer-korap|-tk>

 Use the standard KorAP/DeReKo tokenizer.

 =item B<--tokenizer-internal|-ti>

 Tokenize the data using two embedded tokenizers,
 that will take an I<aggressive> and a I<conservative>
 approach.

 =item B<--tokenizer-call|-tc>

 Call an external tokenizer process, that will tokenize
 from STDIN and outputs the offsets of all tokens.

 Texts are separated using C<\x04\n>. The external process
 should add a new line per text.

 If the L</--use-tokenizer-sentence-splits> option is activated,
 sentences are marked by offset as well in new lines.

 To use L<Datok|https://github.com/KorAP/Datok> including sentence
 splitting, call C<tei2korapxml> as follows:

   $ cat corpus.i5.xml | tei2korapxml -s \
   $   -tc 'datok tokenize \
   $        -t ./tokenizer.matok \
   $        -p --newline-after-eot --no-sentences \
   $        --no-tokens --sentence-positions -' - \
   $        > corpus.korapxml.zip

 =item B<--no-tokenizer>

 Boolean flag indicating that no tokenizer should be used.
 This is meant to ensure that by default a final token layer always
 exists.
 If a separate tokenizer is chosen, this flag is ignored.

 =item B<--skip-inline-tokens>

 Boolean flag indicating that inline tokens should not
 be processed. Defaults to false (meaning inline tokens will be processed).

 =item B<--skip-inline-token-annotations>

 Boolean flag indicating that inline token annotations should not
 be processed. Defaults to true (meaning inline token annotations
 won't be processed).

 =item B<--skip-inline-tags> <tags>

 Expects a comma-separated list of tags to be ignored when the structure
 is parsed. Content of these tags however will be processed.

 =item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>

 Expects a regular replacement expression (separated by B<@> between the
 search and the replacement) to convert text id attributes to text sigles
 with three parts (separated by B</>).

 Example:

   tei2korapxml  \
     --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
     -tk - < t/data/icc_german_sample.p5.xml

 Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
 sigle C<ICCGER/DeReKo.WPD17/G11.00238>.

 =item B<--inline-tokens> <foundry>#[<file>]

 Define the foundry and file (without extension)
 to store inline token information in.
 Unless C<--skip-inline-token-annotations> is set,
 this will contain annotations as well.
 Defaults to C<tokens> and C<morpho>.

 The inline token data will also be stored in the
 inline structures file (see I<--inline-structures>),
 unless the inline token foundry is prepended
 by an B<!> exclamation mark, indicating that inline
 tokens are stored exclusively in the inline tokens
 file.

 Example:

   tei2korapxml -tk --inline-tokens '!gingko#morpho' - < data.i5.xml > korapxml.zip

 =item B<--inline-structures> <foundry>#[<file>]

 Define the foundry and file (without extension)
 to store inline structure information in.
 Defaults to C<struct> and C<structure>.

 =item B<--base-foundry> <foundry>

 Define the base foundry to store newly generated
 token information in.
 Defaults to C<base>.

 =item B<--data-file> <file>

 Define the file (without extension)
 to store primary data information in.
 Defaults to C<data>.

 =item B<--header-file> <file>

 Define the file name (without extension)
 to store header information on
 the corpus, document, and text level in.
 Defaults to C<header>.

 =item B<--use-tokenizer-sentence-splits|-s>

 Replace existing with, or add new, sentence boundary information
 provided by the tokenizer.
 Currently KorAP-tokenizer and certain external tokenizers support
 these boundaries.

 =item B<--tokens-file> <file>

 Define the file (without extension)
 to store generated token information in
 (either from the KorAP tokenizer or an externally called tokenizer).
 Defaults to C<tokens>.

 =item B<--log|-l>

 Loglevel for I<Log::Any>. Defaults to C<notice>.

 =back

 =head1 ENVIRONMENT VARIABLES

 =over 2

 =item B<KORAPXMLTEI_DEBUG>

 Activate minimal debugging.
 Defaults to C<false>.

 =back

 =head1 COPYRIGHT AND LICENSE

 Copyright (C) 2021-2023, L<IDS Mannheim|https://www.ids-mannheim.de/>

 Author: Peter Harders

 Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober

 L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
 Corpus Analysis Platform at the
 L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
 member of the
 L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.

 This program is free software published under the
 L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.

 =cut

 # NOTES

 ## Notes on segfault prevention

 binmode on the input handle prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
 (see notes on 'PerlIO layers' in 'man XML::LibXML').
 Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
 See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
 See in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.
	#!/usr/bin/env perl
	use strict;
	use warnings;

	use Log::Any '$log';
	use Log::Any::Adapter;
	use Pod::Usage;
	use Getopt::Long qw(GetOptions :config no_auto_abbrev);

	use File::Basename qw(dirname);

	use Encode qw(decode);

	use FindBin;
	BEGIN {
	unshift @INC, "$FindBin::Bin/../lib";
	};

	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
	use KorAP::XML::TEI::Tokenizer::External;
	use KorAP::XML::TEI::Tokenizer::Conservative;
	use KorAP::XML::TEI::Tokenizer::Aggressive;
	use KorAP::XML::TEI::Zipper;
	use KorAP::XML::TEI::Header;
	use KorAP::XML::TEI::Inline;

	our $VERSION = '2.4.3';

	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

	use constant {
	# Set to 1 for minimal more debug output (no need to be parametrized)
	DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
	};

	if ($ENV{KORAPXMLTEI_INLINE}) {
	warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
	};

	# Inline tokens won't be stored in the structure file
	my $inline_tokens_exclusive = 0;

	# Parse options from the command line
	GetOptions(
	'root|r=s' => \(my $root_dir = '.'),
	'input|i=s' => \(my $input_fname = ''),
	'tokenizer-call|tc=s' => \(my $tokenizer_call),
	'tokenizer-korap|tk' => \(my $tokenizer_korap),
	'tokenizer-internal|ti' => \(my $tokenizer_intern),
	'no-tokenizer' => \(my $no_tokenizer),
	'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
	'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
	'skip-inline-token-annotations' => \(
	my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
	'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
	'base-foundry=s' => \(my $base_dir = 'base'),
	'data-file=s' => \(my $data_file = 'data'),
	'header-file=s' => \(my $header_file = 'header'),
	'tokens-file=s' => \(my $tokens_file = 'tokens'),
	'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
	'log|l=s' => \(my $log_level = 'notice'),
	'required-version|rv=s' => \(my $required_version),
	'' => \(my $stdio),
	'help|h' => sub {
	pod2usage(
	-verbose => 99,
	-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
	-msg => $VERSION_MSG,
	-output => '-'
	)
	},
	'version|v' => sub {
	pod2usage(
	-verbose => 0,
	-msg => $VERSION_MSG,
	-output => '-'
	);
	}
	);


	# Establish logger
	binmode(STDERR, ':encoding(UTF-8)');
	Log::Any::Adapter->set('Stderr', log_level => $log_level);
	$log->notice('Debugging is activated') if DEBUG;


	if ($required_version) {
	$required_version =~ /^\s*(\d+\.\d+\.\d+)\s*$/;
	if (!$1 || $1 ne $VERSION) {
	$log->error("Required version $required_version mismatches version $VERSION");
	exit(1);
	};
	};


	my ($what, $with);
	if ($xmlid_to_textsigle ne '') {
	($what, $with) = split('@', $xmlid_to_textsigle);
	$what = qr!$what!;
	};

	# tag (without attributes), which contains the primary text
	my $_TEXT_BODY = 'text';
	# optional

	# Remember to skip certain inline tags
	my %skip_inline_tags = ();
	if ($skip_inline_tags_str) {
	foreach (split /\s*,\s*/, $skip_inline_tags_str) {
	$skip_inline_tags{$_} = 1;
	};
	};

	# External tokenization
	my $ext_tok;
	if ($tokenizer_call) {
	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
	$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
	}

	# KorAP tokenization
	elsif ($tokenizer_korap) {
	eval {
	require KorAP::XML::TEI::Tokenizer::KorAP;
	1;
	};

	my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;
	if ($korap_tok_ver ne $VERSION) {
	$log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
	exit(1);
	};

	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
	}

	# No internal tokenizer chosen
	elsif (!$tokenizer_intern && !$no_tokenizer) {
	$log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
	exit(1);
	};

	if ($use_tokenizer_sentence_splits) {
	$skip_inline_tags{s} = 1;
	};

	# Internal tokenization
	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;


	# Name of the directory and the file containing all inline structure informations
	# except for $_TOKENS_TAG information
	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';

	# Name of the directory and the file containing all inline token informations
	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';

	if (index($_tokens_dir, '!') == 0) {
	$_tokens_dir = substr($_tokens_dir, 1);
	$inline_tokens_exclusive = 1;
	};

	# Initialize zipper
	my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);

	# text directory (below $root_dir)
	my $dir = '';

	# Escaped version of text id
	my $text_id_esc;

	# Default encoding of the text
	my $input_enc = 'UTF-8';

	# text line (needed for whitespace handling)
	my $text_line = 0;


	# Input file handle (default: stdin)
	my $input_fh;

	# Single dash was set
	if ($stdio) {
	$input_fh = *STDIN;
	}

	# Input flag was passed
	elsif ($input_fname ne '') {
	unless (open($input_fh, '<', $input_fname)) {
	die $log->fatal("File '$input_fname' could not be opened.");
	};
	}

	# No input to process
	else {
	pod2usage(
	-verbose => 99,
	-sections => 'NAME|SYNOPSIS',
	-msg => $VERSION_MSG,
	-output => '-'
	);
	exit;
	};

	# Prevents segfaulting (see notes on segfault prevention)
	binmode $input_fh;


	# Create inline parser object
	my $inline = KorAP::XML::TEI::Inline->new(
	$skip_inline_tokens,
	\%skip_inline_tags,
	$inline_tokens_exclusive
	);


	# Reading input document
	MAIN: while (<$input_fh>) {

	# remove HTML (multi-line) comments (<!--...-->)
	$_ = remove_xml_comments($input_fh, $_);

	# Set input encoding
	if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
	$input_enc = $2;
	next;
	};

	$_ = decode($input_enc, $_);
	$_ = replace_entities($_);

	# Start of text body
	if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
	my $suffix = $2;

	if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
	die $log->fatal("input line number $.: " .
	"line with opening text-body tag '${_TEXT_BODY}' " .
	"contains additional information ... => Aborting (line=$_)");
	};

	# Text body data extracted from input document ($input_fh),
	# further processed by XML::LibXML::Reader
	my $text_buffer = '';

	# Iterate over all lines in the text body
	while (<$input_fh>) {

	$_ = remove_xml_comments($input_fh, $_);
	$_ = decode($input_enc, $_);
	$_ = replace_entities($_);

	# End of text body
	if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

	# write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

	if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
	die $log->fatal("input line number $.: " .
	"line with closing text-body tag '${_TEXT_BODY}'".
	" contains additional information ... => Aborting (line=$_)");
	};

	if ($dir eq '') {
	$log->warn(
	"Maybe empty textSigle => skipping this text ...\n" .
	'data=' . substr($inline->data->data, 0, 200)
	);
	next MAIN;
	};

	# Parse inline structure
	$inline->parse($text_id_esc, \$text_buffer);

	if (DEBUG) {
	$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
	};

	my $data = $inline->data;

	# Write data.xml
	$data->to_zip(
	$zipper->new_stream("$dir/${data_file}.xml"),
	$text_id_esc
	);

	# Tokenize with external tokenizer
	if ($ext_tok) {

	# Tokenize and output
	$ext_tok->tokenize($data->data)->to_zip(
	$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
	$text_id_esc
	);

	if ($use_tokenizer_sentence_splits) {
	$ext_tok->sentencize_from_previous_input($inline->structures);
	};
	};

	# Tokenize with internal tokenizer
	if ($tokenizer_intern) {

	# Tokenize and output
	$cons_tok->tokenize($data->data)->to_zip(
	$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
	$text_id_esc
	)->reset;

	$aggr_tok->tokenize($data->data)->to_zip(
	$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
	$text_id_esc
	)->reset;
	};

	# ~ write structures ~
	if (!$inline->structures->empty) {
	$inline->structures->to_zip(
	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
	$text_id_esc,
	2 # = structure serialization
	);
	};

	# ~ write tokens ~
	unless ($skip_inline_tokens || $inline->tokens->empty) {
	$inline->tokens->to_zip(
	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
	$text_id_esc,
	# Either 0 = tokens without inline or 1 = tokens with inline
	!$skip_inline_token_annotations
	);
	};

	# reinit.
	$dir = '';

	next MAIN;
	};


	# ~ whitespace handling ~

	# Fix whitespaces (see notes on whitespace fixing)

	# TODO:
	# Maybe it's best, to keep the stripping of whitespace and
	# to just remove the if-clause and to insert a blank by default
	# (with possibly an option on how newlines in primary text should
	# be handled (stripped or replaced by a whitespace)).

	# Remove consecutive whitespace at beginning and end (mostly one newline)
	s/^\s+//; s/\s+$//;

	# NOTE:
	# this is only relevant, if a text consists of more than one line

	# TODO:
	# find a better solution, or create a warning, if a text has more
	# than one line ($text_line > 1)

	# TODO:
	# do testing with 2 different corpora
	# (one with only one-line texts, the other with several lines per text)

	# line contains at least one tag with at least one character contents
	if (m/<[^>]+>[^<]/) {

	# Increment counter for text lines
	$text_line++;

	# insert blank before 1st character
	# (for 2nd line and consecutive lines)
	$_ = ' ' . $_ if $text_line > 1;
	}

	# add line to buffer
	$text_buffer .= $_;
	};
	}

	elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
	my $leadin = $1;
	my $id = $3;
	my $sigle = $3;

	if ($what) {
	$_ = $id;
	eval "s|$what|$with|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
	$sigle = $_;
	$log->debug("Converted text id `$id' to sigle `$sigle'");
	};
	$sigle =~ s/\./-/g;

	my @parts = split(/[\/_]/, $sigle);
	if (@parts != 3) {
	die $log->fatal(
	"input line number $.: " .
	"ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
	"=> Aborting (line=$_)");
	};

	$dir = join("/", @parts);
	$text_id_esc = "$parts[0]/$parts[1].$parts[2]";
	$log->notice("$0: text_id=$text_id_esc");

	if ($leadin !~ /^\s*$/) {
	die $log->fatal(
	"input line number $.: " .
	'line with opening header tag is not in expected format ... ' .
	"=> Aborting (line=$_)");
	};
	}

	# Start of header section
	elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
	my $content = "$2\n";

	if ($1 !~ /^\s*$/) {
	die $log->fatal(
	"input line number $.: " .
	'line with opening header tag is not in expected format ... ' .
	"=> Aborting (line=$_)");
	};

	# Parse header
	my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);

	# Header was parseable
	if ($header) {

	# Write header to zip
	my $file = $header->dir . '/' . $header_file . '.xml';

	$log->debug("Writing file $file") if DEBUG;

	$header->to_zip($zipper->new_stream($file));

	# Header is for text level
	if ($header->type eq 'text') {

	# Remember dir and sigles
	$dir = $header->dir;
	$text_id_esc = $header->id_esc;

	# log output for seeing progression
	$log->notice("$0: text_id=$text_id_esc");

	# Reset counter for text lines
	# (needed for whitespace handling)
	$text_line = 0;
	};
	};
	};
	};

	# All input is processed: close the zipper first
	$zipper->close;

	# Close the external tokenizer, in case one was set up
	if ($ext_tok) {
	  $ext_tok->close;
	};

	# Finally close the input handle
	close $input_fh;


	__END__

	=pod

	=encoding utf8

	=head1 NAME

	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML

	=head1 SYNOPSIS

	cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip

	=head1 DESCRIPTION

	C<tei2korapxml> is a script to convert TEI P5 and
	L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
	based documents to the
	L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.

	This program is usually called from inside another script.

	=head1 FORMATS

	=head2 Input restrictions

	=over 2

	=item

	TEI P5 formatted input with certain restrictions:

	=over 4

	=item

	B<mandatory>: text-header with integrated textsigle
	(or convertible identifier), text-body

	=item

	B<optional>: corp-header with integrated corpsigle,
	doc-header with integrated docsigle

	=back

	=item

	Tokens inside the primary text must not be
	separated by newlines, because newlines are removed
	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
	into blanks between two tokens could lead to additional blanks
	where there should be none (e.g. punctuation characters like C<,> or
	C<.> should not be separated from their predecessor token).
	See also code section C<~ whitespace handling ~> in C<script/tei2korapxml>.

	=item

	Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
	need to be defined in the same line as the header tag.

	=back

	=head2 Notes on the output

	=over 2

	=item

	zip file output (default on C<stdout>) with utf8 encoded entries
	(which together form the KorAP-XML format)

	=back

	=head1 INSTALLATION

	C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
	When these requirements are met, the preferred way to install the script is
	to use L<cpanm|App::cpanminus>.

	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git

	In case everything went well, the C<tei2korapxml> tool will
	be available on your command line immediately.

	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.

	=head1 OPTIONS

	=over 2

	=item B<--input|-i>

	The input file to process. If no specific input is defined and a single
	dash C<-> is passed as an argument, data is read from C<STDIN>.


	=item B<--root|-r>

	The root directory for output. Defaults to C<.>.

	=item B<--help|-h>

	Print help information.

	=item B<--version|-v>

	Print version information.

	=item B<--tokenizer-korap|-tk>

	Use the standard KorAP/DeReKo tokenizer.

	=item B<--tokenizer-internal|-ti>

	Tokenize the data using two embedded tokenizers,
	that will take an I<aggressive> and a I<conservative>
	approach.

	=item B<--tokenizer-call|-tc>

	Call an external tokenizer process, that will tokenize
	from STDIN and outputs the offsets of all tokens.

	Texts are separated using C<\x04\n>. The external process
	should add a new line per text.

	If the L</--use-tokenizer-sentence-splits> option is activated,
	sentences are marked by offset as well in new lines.

	To use L<Datok|https://github.com/KorAP/Datok> including sentence
	splitting, call C<tei2korapxml> as follows:

	$ cat corpus.i5.xml | tei2korapxml -s \
	$ -tc 'datok tokenize \
	$ -t ./tokenizer.matok \
	$ -p --newline-after-eot --no-sentences \
	$ --no-tokens --sentence-positions -' - \
	$ > corpus.korapxml.zip

	=item B<--no-tokenizer>

	Boolean flag indicating that no tokenizer should be used.
	This is meant to ensure that by default a final token layer always
	exists.
	If a separate tokenizer is chosen, this flag is ignored.

	=item B<--skip-inline-tokens>

	Boolean flag indicating that inline tokens should not
	be processed. Defaults to false (meaning inline tokens will be processed).

	=item B<--skip-inline-token-annotations>

	Boolean flag indicating that inline token annotations should not
	be processed. Defaults to true (meaning inline token annotations
	won't be processed).

	=item B<--skip-inline-tags> <tags>

	Expects a comma-separated list of tags to be ignored when the structure
	is parsed. Content of these tags however will be processed.

	=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>

	Expects a regular replacement expression (separated by B<@> between the
	search and the replacement) to convert text id attributes to text sigles
	with three parts (separated by B</>).

	Example:

	tei2korapxml \
	--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
	-tk - < t/data/icc_german_sample.p5.xml

	Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
	sigle C<ICCGER/DeReKo.WPD17/G11.00238>.

	=item B<--inline-tokens> <foundry>#[<file>]

	Define the foundry and file (without extension)
	to store inline token information in.
	Unless C<--skip-inline-token-annotations> is set,
	this will contain annotations as well.
	Defaults to C<tokens> and C<morpho>.

	The inline token data will also be stored in the
	inline structures file (see I<--inline-structures>),
	unless the inline token foundry is prepended
	by an B<!> exclamation mark, indicating that inline
	tokens are stored exclusively in the inline tokens
	file.

	Example:

	tei2korapxml --inline-tokens '!gingko#morpho' < data.i5.xml > korapxml.zip

	=item B<--inline-structures> <foundry>#[<file>]

	Define the foundry and file (without extension)
	to store inline structure information in.
	Defaults to C<struct> and C<structure>.

	=item B<--base-foundry> <foundry>

	Define the base foundry to store newly generated
	token information in.
	Defaults to C<base>.

	=item B<--data-file> <file>

	Define the file (without extension)
	to store primary data information in.
	Defaults to C<data>.

	=item B<--header-file> <file>

	Define the file name (without extension)
	to store header information on
	the corpus, document, and text level in.
	Defaults to C<header>.

	=item B<--use-tokenizer-sentence-splits|-s>

	Replace existing with, or add new, sentence boundary information
	provided by the tokenizer.
	Currently KorAP-tokenizer and certain external tokenizers support
	these boundaries.

	=item B<--tokens-file> <file>

	Define the file (without extension)
	to store generated token information in
	(either from the KorAP tokenizer or an externally called tokenizer).
	Defaults to C<tokens>.

	=item B<--log|-l>

	Loglevel for I<Log::Any>. Defaults to C<notice>.

	=back

	=head1 ENVIRONMENT VARIABLES

	=over 2

	=item B<KORAPXMLTEI_DEBUG>

	Activate minimal debugging.
	Defaults to C<false>.

	=back

	=head1 COPYRIGHT AND LICENSE

	Copyright (C) 2021-2023, L<IDS Mannheim|https://www.ids-mannheim.de/>

	Author: Peter Harders

	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober

	L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
	Corpus Analysis Platform at the
	L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
	member of the
	L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.

	This program is free software published under the
	L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.

	=cut

	# NOTES

	## Notes on segfault prevention

	Calling binmode on the input handle prevents 'XML::LibXML::Reader' from segfaulting inside the main loop
	(see the notes on 'PerlIO layers' in 'man XML::LibXML').
	Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on the input is more granular.
	See perluniintro: "You can switch encodings on an already opened stream by using binmode()".
	See perlfunc: "If LAYER is omitted or specified as ':raw' the filehandle is made suitable for passing binary data."