script/tei2korapxml - KorAP/KorAP-XML-TEI - Gitiles

 #!/usr/bin/env perl
 use strict;
 use warnings;

 use Log::Any '$log';
 use Log::Any::Adapter;
 use Pod::Usage;
 use Getopt::Long qw(GetOptions :config no_auto_abbrev);

 use File::Basename qw(dirname);

 use Encode qw(decode);

 use XML::CompactTree::XS;
 use XML::LibXML::Reader;

 use FindBin;
 BEGIN {
   unshift @INC, "$FindBin::Bin/../lib";
 };

 use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Annotations::Collector;
 use KorAP::XML::TEI::Data;
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;

 # Try to load the KorAP tokenizer. The require is wrapped in eval, so a
 # failure to load does not abort the script at this point; $@ is not
 # inspected. The class is only instantiated, if --tokenizer-korap is given.
 eval {
   require KorAP::XML::TEI::Tokenizer::KorAP;
   1;
 };

 our $VERSION = '1.00';

 # Message printed by the --help and --version handlers
 our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

 # Debug switch, taken from the environment variable KORAPXMLTEI_DEBUG
 # (0, if the variable is not defined; no need to be parametrized).
 # Set the variable to 1 for minimal additional debug output.
 use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

 # Command line options and their defaults
 my $_root_dir   = '.';                # name of root directory inside zip file
 my $input_fname = '';                 # input file (yet only TEI I5 Format accepted)
 my $tokenizer_call;                   # temporary argument for testing purposes
 my $tokenizer_korap;                  # use KorAP-tokenizer
 my $tokenizer_intern;                 # use intern tokenization (default = no)
 my $use_tokenizer_sentence_splits;    # use KorAP tokenizer to split s (default = no)
 my $inline_tokens = 'tokens#morpho';  # foundry and file for inline tokens
 my $log_level     = 'notice';         # log level passed to the Log::Any adapter

 # Handler for --help: print the version message and the usage sections
 my $print_help = sub {
   pod2usage(
     -verbose  => 99,
     -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
     -msg      => $VERSION_MSG,
     -output   => '-'
   );
 };

 # Handler for --version: print the version message only
 my $print_version = sub {
   pod2usage(
     -verbose => 0,
     -msg     => $VERSION_MSG,
     -output  => '-'
   );
 };

 # Parse options from the command line
 GetOptions(
   'root|r=s'                        => \$_root_dir,
   'input|i=s'                       => \$input_fname,
   'tokenizer-call|tc=s'             => \$tokenizer_call,
   'tokenizer-korap|tk'              => \$tokenizer_korap,
   'tokenizer-internal|ti'           => \$tokenizer_intern,
   'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
   'inline-tokens=s'                 => \$inline_tokens,
   'log|l=s'                         => \$log_level,
   'help|h'                          => $print_help,
   'version|v'                       => $print_version
 );

 # Log messages go to STDERR (UTF-8 encoded) with the requested log level
 binmode STDERR, ':encoding(UTF-8)';
 Log::Any::Adapter->set('Stderr', log_level => $log_level);

 if (DEBUG) {
   $log->notice('Debugging is activated');
 }

 #
 # ~~~ parameter (mandatory) ~~~
 #

 # Name of the tag (without attributes), which contains the primary text
 my $_TEXT_BODY = 'text';

 # Beginnings of the opening header tags for the corpus, document and
 # text level. The corpus and the document header are optional, the text
 # header is mandatory. When adding attributes, just keep the correct order
 # of the attributes and evtl. add an '.*' between them.
 my ($_CORP_HEADER_BEG, $_DOC_HEADER_BEG, $_TEXT_HEADER_BEG) =
   map { 'idsHeader type="' . $_ . '"' } qw(corpus document text);


 ## extern tokenization

 # External tokenization is active, if either an external tokenizer call
 # or the KorAP tokenizer was requested on the command line
 my $_GEN_TOK_EXT = ($tokenizer_call || $tokenizer_korap) ? 1 : 0;

 # Sentence splitting depends on the KorAP tokenizer
 if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
   die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
 }

 # Tokenizer object used for external tokenization
 # (stays undefined, if no external tokenization was requested)
 my $ext_tok;
 if ($tokenizer_call) {
   $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
 }
 elsif ($tokenizer_korap) {

   # The KorAP tokenizer class is loaded inside an eval at the top of this
   # script, so it may be missing here. Abort with a clear message instead
   # of failing on the constructor call.
   unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
     die $log->fatal("KorAP tokenizer is not available " .
                       "(KorAP::XML::TEI::Tokenizer::KorAP could not be loaded)");
   }

   $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
 }

 # Name of the file containing the external tokenization
 my $_tok_file_ext  = "tokens.xml";
 ##


 #
 # ~~~ constants ~~~
 #


 ## intern tokenization

 # Simple tokenization (recommended for testing), switched on by
 # --tokenizer-internal
 my $_GEN_TOK_INT = $tokenizer_intern;

 # Names of the files written by the two internal tokenizers
 my $_tok_file_con = 'tokens_conservative.xml';
 my $_tok_file_agg = 'tokens_aggressive.xml';

 my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new();
 my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new();
 ##


 # Name of directory for storing tokenization files
 my $_tok_dir = 'base';

 # Name of files containing the text, document and corpus header
 my $_header_file = 'header.xml';

 # Name of file containing the primary text data (tokens)
 my $_data_file = 'data.xml';

 # Name of directory containing the $_structure_file
 my $_structure_dir = 'struct';

 # Name of file containing all tags (except $_TOKENS_TAG's) related
 # information (= their names and byte offsets in $_data)
 my $_structure_file = 'structure.xml';

 ## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)

 # on/off: processing of $_TOKENS_TAG's (default: 1)
 my $_TOKENS_PROC = 1;


 # Name of the directory and the file containing all inline token information,
 # i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set.
 # The value of --inline-tokens has the form '<foundry>#[<file>]'.
 my ($_tokens_dir, $_tokens_file) = _split_inline_tokens($inline_tokens);
 $_tokens_file .= '.xml';

 my $_TOKENS_TAG      = "w";                          # name of tag        containing all  information stored in $_tokens_file

 # Handling inline annotations (inside $_TOKENS_TAG)
 my $_INLINE_ANNOT    = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;


 # Split an --inline-tokens value into foundry (directory) and file name
 # (without extension).
 #
 # Parameter: the option value in the form '<foundry>#[<file>]'.
 # Returns:   the list (foundry, file). A missing or empty foundry defaults
 #            to 'tokens', a missing or empty file to 'morpho', so that
 #            'foo' and 'foo#' both result in ('foo', 'morpho').
 #            Everything after a second '#' is ignored.
 sub _split_inline_tokens {
   my ($spec) = @_;

   # The explicit limit keeps an empty field after a trailing '#',
   # which is then replaced by the default
   my ($foundry, $file) = split /#/, ($spec // ''), 3;

   $foundry = 'tokens' unless defined $foundry && length $foundry;
   $file    = 'morpho' unless defined $file    && length $file;

   return ($foundry, $file);
 }


 #
 # ~~~ variables ~~~
 #

 # Collectors for the inline tokens and for the structures
 my $tokens     = KorAP::XML::TEI::Annotations::Collector->new();
 my $structures = KorAP::XML::TEI::Annotations::Collector->new();

 # Collector for the primary text data
 my $data = KorAP::XML::TEI::Data->new();

 # Zipper, initialized with the name of the root directory
 my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);

 # Input file handle (default: stdin)
 my $input_fh;

 # Text directory (below $_root_dir)
 my $dir;

 # Id of the current text and its escaped version
 my $text_id;
 my $text_id_esc;

 # Instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
 my $reader;

 # Result of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
 my $tree_data;

 # File-scoped variables declared for the recursive function 'retr_info'

 # Index for extracting the array of child elements from an element in
 # $tree_data (the value is set dependent on DEBUG)
 my $_IDX;

 # Element from $tree_data
 my $e;

 # Variables for handling the ~ whitespace related issue ~
 # (it is sometimes necessary to correct the from-values for some tags)
 my $add_one;
 my $fval;

 # Hash for indices of whitespace-nodes (needed to correct from-values).
 # Idea: when closing an element, check if its from-index minus 1 refers to
 # a whitespace-node (means: 'from-index - 1' is a key in %ws).
 # If this is _not_ the case, then the from-value is one too high
 # => correct it by subtracting 1
 my %ws;

 # Index variable used in loops
 my $c;


 #
 # ~~~ main ~~~
 #

 # ~ initializations ~

 # With DEBUG, line numbers are included in the elements of $tree_data,
 # which moves the array of child elements from index 4 to index 5
 $_IDX = DEBUG ? 5 : 4;

 $fval = 0;

 # Normalize regex for header parsing: '[^>]*' is inserted after the
 # leading run of non-whitespace characters of every header beginning
 foreach my $header_beg ($_CORP_HEADER_BEG, $_DOC_HEADER_BEG, $_TEXT_HEADER_BEG) {
   $header_beg =~ s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
 }


 # ~ read input and write output (text by text) ~

 # Content before and after a tag in the same line
 my $pfx;
 my $sfx;

 # Counter for text lines (needed for whitespace handling)
 my $tl = 0;

 # Read from stdin, as long as no input file is given
 $input_fh = *STDIN;

 # Maybe not necessary
 $data->reset;

 $dir = '';

 if ($input_fname ne '' && !open($input_fh, '<', $input_fname)) {
   die $log->fatal("File '$input_fname' could not be opened.");
 }

 # Prevents segfaulting (see notes on segfault prevention)
 binmode $input_fh;

 # Position of the closing text-body tag inside a line
 my $pos;

 # Encoding of the input (replaced by the encoding of the XML declaration)
 my $input_enc = 'UTF-8';

 # Length of the beginning of the closing text-body tag plus 1
 my $l = length('</') + length($_TEXT_BODY) + 1;

 # ~ loop (reading input document) ~
 #
 # The input is read line by line:
 #  - a line with an XML declaration sets the input encoding
 #  - a line starting a corpus, document or text header is handed over to
 #    KorAP::XML::TEI::Header, and the parsed header is written to the zip
 #    stream; a text header additionally sets $dir, $text_id and $text_id_esc
 #  - the lines between the opening and the closing text-body tag are
 #    collected in a buffer, parsed and written as data, tokenization,
 #    structure and token files below $dir

 MAIN: while ( <$input_fh> ){

   $_ = remove_xml_comments( $input_fh, $_ ); # remove XML (multi-line) comments (<!--...-->)

   # Set input encoding from the XML declaration and skip this line
   # NOTE(review): inside a bracketed character class '\1' is not a
   #   backreference, so '[^\1]' does not exclude the quote character; the
   #   match ends at the quote because of the non-greedy quantifier and the
   #   following '\1' - TODO confirm that this is intended
   if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
     $input_enc = $2;
     next;
   };

   # The input handle is in binmode, so the line is decoded explicitly
   $_ = decode($input_enc, $_);
   $_ = replace_entities($_);

   if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){

     # ~ start of text body ~

     $pfx = $1;
     $sfx = $2;

     # The opening text-body tag has to be the only content of its line
     if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
       die $log->fatal("input line number $.: " .
                         "line with opening text-body tag '${_TEXT_BODY}' " .
                         "contains additional information ... => Aborting (line=$_)");
     };

     # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
     my $buf_in = '';

     # Iterate over all lines in the text body
     while (<$input_fh>) {

       $_ = remove_xml_comments( $input_fh, $_ );
       $_ = decode($input_enc, $_);
       $_ = replace_entities($_);

       # ~ end of text body ~
       if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

         # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)

         # Cut $l characters at the position of the closing text-body tag
         # out of the line ($l is the length of '</' plus the tag name plus 1);
         # the remaining content has to be whitespace only
         if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
           die $log->fatal("input line number $.: " .
                             "line with closing text-body tag '${_TEXT_BODY}'".
                             " contains additional information ... => Aborting (line=$_)");
         };

         # $dir is only set, if a text header was parsed before this text body
         if ($dir ne "") {

           # The buffered lines are wrapped into a single root element for parsing
           $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );

           # See notes on whitespace handling
           my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

           # XCT_LINE_NUMBERS is only needed for debugging
           # (see XML::CompactTree::XS)
           $param |= XCT_LINE_NUMBERS if DEBUG;
           $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);

           $structures->reset;

           $tokens->reset if $_TOKENS_PROC;

           # ~ whitespace related issue ~
           $add_one = 0;
           %ws = ();

           # ~ recursion ~
           # (fills $data, $structures and $tokens)
           retr_info(1, \$tree_data->[2] ); # parse input data

           if (DEBUG) {
             $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
           };

           # ~ write data.xml ~
           $data->to_zip(
             $zipper->new_stream("$dir/${_data_file}"),
             $text_id_esc
           );

           # ~ tokenization ~
           # (external tokenizer, if requested by --tokenizer-call or --tokenizer-korap)
           if ($_GEN_TOK_EXT) {

             # Tokenize and output
             $ext_tok->tokenize($data->data)->to_zip(
               $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
               $text_id_esc
             );
           };

           # (internal tokenizers, if requested by --tokenizer-internal)
           if ($_GEN_TOK_INT) {

             # Tokenize and output
             $cons_tok->tokenize($data->data)->to_zip(
               $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
               $text_id_esc
             );

             $aggr_tok->tokenize($data->data)->to_zip(
               $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
               $text_id_esc
             );

             $aggr_tok->reset;
             $cons_tok->reset;
           };

           # Has to follow the external tokenization above, as the method name
           # refers to the previous input of the tokenizer
           # (presumably adds the sentence boundaries to $structures - TODO confirm)
           if ($use_tokenizer_sentence_splits) {
             $ext_tok->sentencize_from_previous_input($structures);
           }

           # ~ write structures ~
           if (!$structures->empty) {
             $structures->to_zip(
               $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
               $text_id_esc,
               2 # = structure serialization
             );
           };

           # ~ write tokens ~
           if ($_TOKENS_PROC && !$tokens->empty) {
             $tokens->to_zip(
               $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
               $text_id_esc,
               $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
             );
           };

           $dir = ""; # reinit.

           # Maybe not necessary
           $data->reset;

         } else { # $dir eq ""

           # NOTE(review): $data is the KorAP::XML::TEI::Data object, so it is
           #   interpolated as an object here, not as the collected text
           #   (unless the class overloads stringification) - TODO confirm
           $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
         }

         # Continue with the line following the closing text-body tag
         next MAIN;
       };

       # ~ inside text body ~

       # ~ whitespace handling ~

       # Fix whitespaces (see notes on whitespace fixing)

       # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
       #   an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).

       # Remove consecutive whitespace at beginning and end (mostly one newline)
       s/^\s+//; s/\s+$//;

       ### NOTE: this is only relevant, if a text consists of more than one line
       ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
       ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
       if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents

         $tl++; # counter for text lines

         s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
       }
       ###

       # add line to buffer
       $buf_in .= $_;
     };

   } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {

     # ~ start of header ~
     $pfx = $1;
     my $content = "$2\n";

     # Only whitespace is accepted before the opening header tag
     if ($pfx !~ /^\s*$/) {
       die $log->fatal("input line number $.: " .
                         "line with opening header tag" .
                         " is not in expected format ... => Aborting (line=$_)");
     };

     # Parse header
     # (the parser gets the rest of the current line and the input handle,
     #  presumably to read the following lines of the header - TODO confirm)
     my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

     # Header was parseable
     if ($header) {

       # Write header to zip
       my $file = $header->dir . '/' . $_header_file;

       $log->debug("Writing file $file") if DEBUG;

       $header->to_zip($zipper->new_stream($file));

       # Header is for text level
       if ($header->type eq 'text') {

         # Remember dir and sigles
         # (a non-empty $dir enables the processing of the following text body)
         $dir         = $header->dir;
         $text_id     = $header->id;
         $text_id_esc = $header->id_esc;

         # log output for seeing progression
         $log->notice("$0: main(): text_id=$text_id");

         $tl = 0; # reset (needed for ~ whitespace handling ~)
       }
     }
   }
 } #end: while

 # All texts are processed: finish the zip output
 $zipper->close;

 # Close the external tokenizer, if one was used
 if ($_GEN_TOK_EXT) {
   $ext_tok->close;
 }

 exit 0;


 # Recursively called function to handle XML tree data.
 #
 # Processes all nodes of one level of the tree created by
 # XML::CompactTree::XS::readSubtreeToPerl:
 #  - for an element node an annotation is created, its attributes are
 #    added, its child nodes are processed recursively, and its from- and
 #    to-values are set based on the position of $data before and after
 #    the child nodes were processed
 #  - text nodes and significant whitespace nodes are appended to $data
 #
 # Parameters:
 #  1. recursion level
 #     (1 = topmost level inside retr_info() = should always be level of
 #     tag $_TEXT_BODY)
 #  2. reference to an array reference containing the nodes of this level
 #
 # Dies on nodes of any other type, and if the from-value of an element
 # is 2 or more greater than its to-value.
 #
 # See notes on how 'XML::CompactTree::XS' works and
 # see 'NODE TYPES' in manpage of XML::LibXML::Reader
 sub retr_info {
   my ($rl, $nodes_ref) = @_;

   # If sentence splits are taken from the tokenizer, elements named 's'
   # get this annotation instead of one added via add_new_annotation()
   my $dummy_anno;
   if ($use_tokenizer_sentence_splits) {
     $dummy_anno = $structures->new_dummy_annotation;
   }

   # Iteration through all array elements.
   # The loop variables are scoped to this call, so nested calls can not
   # interfere with them.
   foreach my $node (@{${$nodes_ref}}) {

     # Element node
     if ($node->[0] == XML_READER_TYPE_ELEMENT) {

       #~~~~
       # from here: tag-node (opening)
       #~~~~

       my $anno;

       # $node->[1] represents the tag name
       if ($use_tokenizer_sentence_splits && $node->[1] eq "s") {
         $anno = $dummy_anno;
       } else {
         $anno = $structures->add_new_annotation($node->[1]);
       }

       # Add element also to token list
       if ($_TOKENS_PROC && $node->[1] eq $_TOKENS_TAG) {
         $tokens->add_annotation($anno);
       }

       # Handle attributes (if attributes exist)
       if (defined $node->[3]) {

         # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
         #  [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
         # note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
         my $attrs = $node->[3];

         # '$i' references the 'key' and '$i+1' the 'value'
         for (my $i = 0; $i < @{$attrs}; $i += 2) {
           $anno->add_attribute(@{$attrs}[$i, $i + 1]);
         }
       }

       # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
       # ($add_one is 1, if the last appended node was a text node)
       $anno->set_from($data->position + $add_one);

       #~~~~
       # until here: tag-node (opening)
       #~~~~

       # Call function recursively with the array of child-nodes.
       # There is no recursion, if $node->[$_IDX] is not defined
       # (because we have no array of child-nodes, e.g.: <back/>)
       if (defined $node->[$_IDX]) {
         retr_info($rl + 1, \$node->[$_IDX]);
       }

       #~~~~~
       # from here: tag-node (closing)
       #~~~~~

       # NOTE: use $pos, because the offsets are _between_ the characters
       # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
       my $pos = $data->position;

       # Handle structures and tokens

       my $from = $anno->from;

       # ~ whitespace related issue ~
       # No whitespace node starts at 'from - 1', so the from-value
       # is one too high and has to be decreased
       if ($from > 0 && not exists $ws{$from - 1}) {

         # ~ previous node was a text-node ~
         $anno->set_from($from - 1);
       }

       # in case this fails, check input
       if (($from - 1) > $pos) {
         die $log->fatal("text_id='$text_id', " .
                           "processing of structures: " .
                           "from-value ($from) is 2 or more greater " .
                           "than to-value ($pos) => please check. Aborting");
       }

       # TODO: find example for which this case applies
       #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
       #
       # TODO: check, if it's better to remove this line and change above check to 'if ($from - 1) >= $pos;
       #   do testing with bigger corpus excerpt (wikipedia?)
       $anno->set_from($pos) if $from == $pos + 1;
       $anno->set_to($pos);
       $anno->set_level($rl);

       # Clean up whitespace
       delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};

       #~~~~
       # until here: tag-node (closing)
       #~~~~
     }

     # Text node
     elsif ($node->[0] == XML_READER_TYPE_TEXT) {

       $add_one = 1;
       $data->append($node->[1]);
     }

     # Whitespace node
     # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
     elsif ($node->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

       # state, that this from-index belongs to a whitespace-node
       #  ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
       $ws{$data->position}++;

       $add_one = 0;
       $data->append($node->[1]);
     }

     # not yet handled type
     else {

       die $log->fatal('Not yet handled type ($e->[0]=' . $node->[0] . ') ... => Aborting');
     }
   }
 }


 __END__

 =pod

 =encoding utf8

 =head1 NAME

 tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML

 =head1 SYNOPSIS

   cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip

 =head1 DESCRIPTION

 C<tei2korapxml> is a script to convert TEI P5 and
 L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
 based documents to the
 L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
 If no specific input is defined, data is
 read from C<STDIN>. If no specific output is defined, data is written
 to C<STDOUT>.

 This program is usually called from inside another script.

 =head1 FORMATS

 =head2 Input restrictions

 =over 2

 =item

 TEI P5 formatted input with certain restrictions:

 =over 4

 =item

 B<mandatory>: text-header with integrated textsigle, text-body

 =item

 B<optional>: corp-header with integrated corpsigle,
 doc-header with integrated docsigle

 =back

 =item

 All tokens inside the primary text may not be
 newline separated, because newlines are removed
 (see L<KorAP::XML::TEI::Data>) and a conversion of newlines
 into blanks between 2 tokens could lead to additional blanks,
 where there should be none (e.g.: punctuation characters like C<,> or
 C<.> should not be separated from their predecessor token).
 (see also code section C<~ whitespace handling ~>).

 =back

 =head2 Notes on the output

 =over 2

 =item

 zip file output (default on C<stdout>) with utf8 encoded entries
 (which together form the KorAP-XML format)

 =back

 =head1 INSTALLATION

 C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
 these bindings are available, the preferred way to install the script is
 to use L<cpanm|App::cpanminus>.

   $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git

 In case everything went well, the C<tei2korapxml> tool will
 be available on your command line immediately.

 Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.

 =head1 OPTIONS

 =over 2

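 =item B<--root|-r>

 The root directory for output. Defaults to C<.>.

 =item B<--input|-i>

 The input file to process. If no specific input is defined,
 data is read from C<STDIN>.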

 =item B<--help|-h>

 Print help information.

 =item B<--version|-v>

 Print version information.

 =item B<--tokenizer-call|-tc>

 Call an external tokenizer process, that will tokenize
 a single line from STDIN and outputs one token per line.

 =item B<--tokenizer-korap|-tk>

 Use the standard KorAP/DeReKo tokenizer.

 =item B<--tokenizer-internal|-ti>

 Tokenize the data using two embedded tokenizers,
 that will take an I<aggressive> and a I<conservative>
 approach.

 =item B<--inline-tokens> <foundry>#[<file>]

 Define the foundry and file (without extension)
 to store inline token information in.
 If L</KORAPXMLTEI_INLINE> is set, this will contain
 annotations as well.
 Defaults to C<tokens> and C<morpho>.

 =item B<--use-tokenizer-sentence-splits|-s>

 Replace existing sentence boundary information with, or add, the
 sentence boundaries provided by the KorAP tokenizer (currently the
 only tokenizer supporting this).

 =item B<--log|-l>

 Loglevel for I<Log::Any>. Defaults to C<notice>.

 =back

 =head1 ENVIRONMENT VARIABLES

 =over 2

 =item B<KORAPXMLTEI_DEBUG>

 Activate minimal debugging.
 Defaults to C<false>.

 =item B<KORAPXMLTEI_INLINE>

 Process inline annotations, if present.
 Defaults to C<false>.

 =back

 =head1 COPYRIGHT AND LICENSE

 Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>

 Author: Peter Harders

 Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober

 L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
 Corpus Analysis Platform at the
 L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
 member of the
 L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.

 This program is free software published under the
 L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.

 =cut

 # NOTES

 ##  Notes on how 'XML::CompactTree::XS' works

 Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>

 Print out name of 'node2' for the above example:

 echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'

 Exploring the structure of $data ( = reference to below array ):

 [ 0: XML_READER_TYPE_DOCUMENT,
   1: ?
   2: [ 0: [ 0: XML_READER_TYPE_ELEMENT                     <- start recursion with array '$data->[2]' (see main(): retr_info(1, \$tree_data->[2] ))
             1: 'node'
             2: ?
             3: HASH (attributes)
             4: 1 (line number)
             5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
                       1: 'node1'
                       2: ?
                       3: undefined (no attributes)
                       4: 1 (line number)
                       5: [ 0: [ 0: XML_READER_TYPE_TEXT
                                 1: 'some '
                               ]
                            1: [ 0: XML_READER_TYPE_ELEMENT
                                 1: 'n'
                                 2: ?
                                 3: undefined (no attributes)
                                 4: 1 (line number)
                                 5: undefined (no child-nodes)
                               ]
                            2: [ 0: XML_READER_TYPE_TEXT
                                 1: ' text'
                               ]
                          ]
                     ]
                  1: [ 0: XML_READER_TYPE_ELEMENT
                       1: 'node2'
                       2: ?
                       3: undefined (no attributes)
                       4: 1 (line number)
                       5: [ 0: [ 0: XML_READER_TYPE_TEXT
                                 1: 'more-text'
                               ]
                          ]
                     ]
                ]
           ]
      ]
 ]

 $data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)

 ref($data->[2])                                                         == ARRAY (with 1 element for 'node')
 ref($data->[2]->[0])                                                    == ARRAY (with 6 elements)

 $data->[2]->[0]->[0]                                                    == 1 (=> type == XML_READER_TYPE_ELEMENT)
 $data->[2]->[0]->[1]                                                    == 'node'
 ref($data->[2]->[0]->[3])                                               == HASH  (=> ${$data->[2]->[0]->[3]}{a} == 'v')
 $data->[2]->[0]->[4]                                                    == 1 (line number)
 ref($data->[2]->[0]->[5])                                               == ARRAY (with 2 elements for 'node1' and 'node2')
                                                                                    # child-nodes of actual node (see $_IDX)

 ref($data->[2]->[0]->[5]->[0])                                          == ARRAY (with 6 elements)
 $data->[2]->[0]->[5]->[0]->[0]                                          == 1 (=> type == XML_READER_TYPE_ELEMENT)
 $data->[2]->[0]->[5]->[0]->[1]                                          == 'node1'
 $data->[2]->[0]->[5]->[0]->[3]                                          == undefined (=> no attribute)
 $data->[2]->[0]->[5]->[0]->[4]                                          == 1 (line number)
 ref($data->[2]->[0]->[5]->[0]->[5])                                     == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')

 ref($data->[2]->[0]->[5]->[0]->[5]->[0])                                == ARRAY (with 2 elements)
 $data->[2]->[0]->[5]->[0]->[5]->[0]->[0]                                == 3 (=> type ==  XML_READER_TYPE_TEXT)
 $data->[2]->[0]->[5]->[0]->[5]->[0]->[1]                                == 'some '

 ref($data->[2]->[0]->[5]->[0]->[5]->[1])                                == ARRAY (with 5 elements)
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[0]                                == 1 (=> type == XML_READER_TYPE_ELEMENT)
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[1]                                == 'n'
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[3]                                == undefined (=> no attribute)
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[4]                                == 1 (line number)
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[5]                                == undefined (=> no child-nodes)

 ref($data->[2]->[0]->[5]->[0]->[5]->[2])                                == ARRAY (with 2 elements)
 $data->[2]->[0]->[5]->[0]->[5]->[2]->[0]                                == 3 (=> type ==  XML_READER_TYPE_TEXT)
 $data->[2]->[0]->[5]->[0]->[5]->[2]->[1]                                == ' text'


 retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
 Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
 ${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).


 ## Notes on whitespace handling

 Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
 (see function 'retr_info()').

 Definition of significant and insignificant whitespace
 (source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):

 Significant whitespace is part of the document content and should be preserved.
 Insignificant whitespace is used when editing XML documents for readability.
 These whitespaces are typically not intended for inclusion in the delivery of the document.

 ### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE

 The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
  'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.

 When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
  '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
  (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):

 echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'


 Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'

 Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
  'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).

 The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
  its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').

 The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
  enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:

 When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
  So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
  the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
  the last read 'non-tag'-node has to be corrected (see [1]).

 For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
  additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).

 [1]
 Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
  In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
  (see above code fragment '... not exists $ws{ $fval - 1 } ...').

 [2]
 Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a
  whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).

 The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
  (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?

 Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
  and to-indices for the tags 'a' and 'b' both 12, which is the start-index of the token 'tok3'.


 ## Notes on whitespace fixing

 The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
  into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.

 It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
  example further down and notes on 'Input restrictions' in the manpage).

 Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.

 Examples (how primary text with linebreaks would be converted by below code):

   '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
   '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.

 Blanks are inserted before the 1st character:

  NOTE: not stringent ('...' stands for text):

    beg1............................end1  => no blank before 'beg1'
    beg2....<pb/>...................end2  => no blank before 'beg2'
    beg3....<info attr1="val1"/>....end3  => no blank before 'beg3'
    beg4....<test>ok</test>.........end4  =>    blank before 'beg4'

      =>  beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
                                                                             ^
                                                                             |_blank between 'end3' and 'beg4'


 ## Notes on segfault prevention

 binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside 'main()'
 (see notes on 'PerlIO layers' in  'man XML::LibXML'),
 removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
 see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
 see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.