script/tei2korapxml - KorAP/KorAP-XML-TEI - Gitiles

 #!/usr/bin/env perl
 use strict;
 use warnings;

 use Log::Any '$log';
 use Log::Any::Adapter;
 use Pod::Usage;
 use Getopt::Long qw(GetOptions :config no_auto_abbrev);

 use File::Basename qw(dirname);

 use Encode qw(decode);

 use XML::CompactTree::XS;
 use XML::LibXML::Reader;

 use FindBin;
 BEGIN {
   unshift @INC, "$FindBin::Bin/../lib";
 };

 use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
 use KorAP::XML::TEI::Tokenizer::External;
 use KorAP::XML::TEI::Tokenizer::Conservative;
 use KorAP::XML::TEI::Tokenizer::Aggressive;
 use KorAP::XML::TEI::Annotations::Collector;
 use KorAP::XML::TEI::Data;
 use KorAP::XML::TEI::Zipper;
 use KorAP::XML::TEI::Header;

 # Optionally load the KorAP tokenizer. A failing require is swallowed
 # by the eval, so the script still starts when the module is missing.
 eval {
   require KorAP::XML::TEI::Tokenizer::KorAP;
   1;
 };

 # Version of the script (part of the message below)
 our $VERSION = '1.00';

 # Message passed to pod2usage for --help and --version
 our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

 # Minimal debug output is activated by setting the environment variable
 # KORAPXMLTEI_DEBUG to a true value (defaults to 0 when it is not defined;
 # intentionally not a command line parameter)
 use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

 # Parse options from the command line
 # Parse options from the command line.
 # Every option is stored in a lexical that is declared (with its default)
 # right where it is bound, so the variables are visible in the whole file.
 # GetOptions() returns false if it detected an error (e.g. an unknown
 # option or a missing value); in that case print a short usage message
 # to STDERR (STDOUT may carry the zip output) and exit with status 2
 # instead of starting a conversion with unintended settings.
 GetOptions(
   "root|r=s"              => \(my $root_dir = '.'),
   "input|i=s"             => \(my $input_fname = ''),
   'tokenizer-call|tc=s'   => \(my $tokenizer_call),
   'tokenizer-korap|tk'    => \(my $tokenizer_korap),
   'tokenizer-internal|ti' => \(my $tokenizer_intern),
   'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
   'inline-tokens=s'       => \(my $inline_tokens = 'tokens#morpho'),
   'inline-structures=s'   => \(my $inline_structures = 'struct#structure'),
   'skip-inline-tokens'    => \(my $skip_inline_tokens = 0),
   'base-foundry=s'        => \(my $base_dir = 'base'),
   'data-file=s'           => \(my $data_file = 'data'),
   'header-file=s'         => \(my $header_file = 'header'),
   'tokens-file=s'         => \(my $tokens_file = 'tokens'),
   'log|l=s'               => \(my $log_level = 'notice'),

   # Print the full help to STDOUT ('-') and exit
   'help|h' => sub {
     pod2usage(
       -verbose => 99,
       -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
       -msg => $VERSION_MSG,
       -output => '-'
     )
   },

   # Print the version message (and synopsis) to STDOUT ('-') and exit
   'version|v' => sub {
     pod2usage(
       -verbose => 0,
       -msg => $VERSION_MSG,
       -output => '-'
     )
   }
 ) or pod2usage(
   -verbose => 0,
   -exitval => 2,
   -output  => \*STDERR
 );

 # Establish logger
 # Log messages go to STDERR (UTF-8 encoded), filtered by the level
 # given with --log
 binmode STDERR, ':encoding(UTF-8)';
 Log::Any::Adapter->set('Stderr', log_level => $log_level);

 if (DEBUG) {
   $log->notice('Debugging is activated');
 };

 # Names of the tags the converter looks for:
 #   $_TEXT_BODY  - tag (without attributes), which contains the primary text
 #   $_HEADER_TAG - tag of the headers (TODO: IDS-specific (and redundant))
 #   $_TOKENS_TAG - tag containing all information stored in $_tokens_file
 my ($_TEXT_BODY, $_HEADER_TAG, $_TOKENS_TAG) = ('text', 'idsHeader', 'w');


 # Sentence splits are taken from the KorAP tokenizer,
 # so -s without -tk is rejected right away
 if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
   die $log->fatal(
     'Sentence splitting is currently only supported by KorAP tokenizer ' .
       '(use -tk to activate it)'
   );
 };

 # Tokenizer producing the token file in the base foundry
 # (stays undefined, if neither -tc nor -tk is given)
 my $ext_tok;
 if ($tokenizer_call) {
   $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
 }

 elsif ($tokenizer_korap) {

   # The KorAP tokenizer module is only loaded optionally (inside an eval),
   # so state clearly, if it is requested but not available
   unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
     die $log->fatal(
       'KorAP tokenizer requested (-tk), but ' .
         'KorAP::XML::TEI::Tokenizer::KorAP could not be loaded'
     );
   };

   $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
 };
 ##


 #
 # ~~~ constants ~~~
 #


 ## intern tokenization
 # Internal tokenizers (applied to each text, if --tokenizer-internal is set)
 my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
 my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;

 # Directory and file for all inline structure information
 # (i.e. everything except for the $_TOKENS_TAG information).
 # The appended '#structure' supplies the file name, if the option
 # value contains no file part.
 my ($_structure_dir, $_structure_file) = split /#/, "$inline_structures#structure";

 # Directory and file for all inline token information
 # (i.e. the elements named $_TOKENS_TAG, unless --skip-inline-tokens is set).
 # The appended '#morpho' supplies the file name, if the option
 # value contains no file part.
 my ($_tokens_dir, $_tokens_file) = split /#/, "$inline_tokens#morpho";

 # Serialize inline annotations (inside $_TOKENS_TAG) only if
 # the environment variable KORAPXMLTEI_INLINE is true
 my $_INLINE_ANNOT = 0;
 $_INLINE_ANNOT = 1 if $ENV{KORAPXMLTEI_INLINE};

 # Collector for inline tokens
 my $tokens = KorAP::XML::TEI::Annotations::Collector->new;

 # Collector for inline structures
 my $structures = KorAP::XML::TEI::Annotations::Collector->new;

 # Collector for the primary data
 my $data = KorAP::XML::TEI::Data->new;

 # Zipper creating the output streams below $root_dir
 my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);


 #
 # ~~~ variables ~~~
 #

 # text directory (below $root_dir)
 # Text directory (below $root_dir); set from the text header and
 # reset to '' after a text was written
 my $dir = '';

 # Escaped version of text id (taken from the text header)
 my $text_id_esc;

 # Element from $tree_data (loop variable inside descend())
 my $e;

 # Keeping track of the current positions in the text
 my $pos;

 # Default encoding of the text
 # (overwritten by the encoding given in the XML declaration)
 my $input_enc = 'UTF-8';

 # Variables for handling ~ whitespace related issue ~
 # (it is sometimes necessary to correct the from-values for some tags)
 my $add_one;
 my $from = 0;

 # Counter for text lines (needed for whitespace handling)
 my $text_line = 0;

 # Hash for indices of whitespace-nodes
 # (needed to recorrect from-values)
 # IDEA:
 #   When closing an element, check if its from-index minus 1 refers to
 #   a whitespace-node (means: 'from-index - 1' is a key in %ws).
 #   If this is _not_ the case, then the from-value is one
 #   too high => correct it by subtracting 1
 my %ws;


 #
 # ~~~ main ~~~
 #

 # ~ read input and write output (text by text) ~

 # Input file handle (default: stdin)
 # Input file handle (default: stdin)
 my $input_fh = *STDIN;

 # Read from the file given with --input instead of stdin.
 # The system error ($!) is part of the message, so it is visible
 # why the file could not be opened (missing, no permission, ...)
 if ($input_fname ne '') {
   unless (open($input_fh, '<', $input_fname)) {
     die $log->fatal("File '$input_fname' could not be opened: $!");
   };
 };

 # Prevents segfaulting (see notes on segfault prevention)
 binmode $input_fh;


 # Reading input document
 # Main loop: the input is read line by line.
 #  - A line opening a header hands the parsing of that header over to
 #    KorAP::XML::TEI::Header; a header of type 'text' defines the
 #    directory ($dir) and the id ($text_id_esc) of the following text.
 #  - A line opening the text body starts the collection of all body lines
 #    in a buffer. As soon as the closing tag is read, the buffer is parsed,
 #    converted by descend() and written to the zip stream.
 MAIN: while ( <$input_fh> ){

   # remove XML (multi-line) comments (<!--...-->)
   $_ = remove_xml_comments( $input_fh, $_ );

   # Set input encoding based on the XML declaration.
   # The value is everything up to the quote character that opened it.
   if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
     $input_enc = $2;
     next;
   };

   $_ = decode($input_enc, $_);
   $_ = replace_entities($_);

   # Start of Text body
   if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#){

     my $suffix = $2;

     # The opening tag has to be the only content of its line
     if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
       die $log->fatal("input line number $.: " .
                         "line with opening text-body tag '${_TEXT_BODY}' " .
                         "contains additional information ... => Aborting (line=$_)");
     };

     # Line number of the opening tag
     # (reported, if the text body is never closed)
     my $body_start = $.;

     # Text body data extracted from input document ($input_fh),
     # further processed by XML::LibXML::Reader
     my $text_buffer = '';

     # Iterate over all lines in the text body
     while (<$input_fh>) {

       $_ = remove_xml_comments( $input_fh, $_ );
       $_ = decode($input_enc, $_);
       $_ = replace_entities($_);

       # End of text body
       if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

         # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

         # The closing tag has to be the only content of its line
         if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
           die $log->fatal("input line number $.: " .
                             "line with closing text-body tag '${_TEXT_BODY}'".
                             " contains additional information ... => Aborting (line=$_)");
         };

         # No text header preceded this text body.
         # Show the beginning of the collected body to help identifying
         # the text ($data is not filled before descend() is called).
         if ($dir eq '') {
           $log->warn(
             "Maybe empty textSigle => skipping this text ...\n" .
               'data=' . substr($text_buffer, 0, 200)
             );
           next MAIN;
         };

         my $reader = XML::LibXML::Reader->new(
           string => "<text>$text_buffer</text>",
           huge => 1
         );

         # See notes on whitespace handling
         my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

         # XCT_LINE_NUMBERS is only needed for debugging
         # (see XML::CompactTree::XS)
         $param |= XCT_LINE_NUMBERS if DEBUG;
         my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

         # ~ whitespace related issue ~
         $add_one = 0;
         %ws = ();

         # ~ recursion ~
         descend(1, $tree_data->[2]); # parse input data

         if (DEBUG) {
           $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
         };

         # Write data.xml
         $data->to_zip(
           $zipper->new_stream("$dir/${data_file}.xml"),
           $text_id_esc
         );

         # Tokenize with external tokenizer
         if ($ext_tok) {

           # Tokenize and output
           $ext_tok->tokenize($data->data)->to_zip(
             $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
             $text_id_esc
           );

           # Has to happen before the structures are written below
           if ($use_tokenizer_sentence_splits) {
             $ext_tok->sentencize_from_previous_input($structures);
           };
         };

         # Tokenize with internal tokenizer
         if ($tokenizer_intern) {

           # Tokenize and output
           $cons_tok->tokenize($data->data)->to_zip(
             $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
             $text_id_esc
           )->reset;

           $aggr_tok->tokenize($data->data)->to_zip(
             $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
             $text_id_esc
           )->reset;
         };

         # ~ write structures ~
         if (!$structures->empty) {
           $structures->to_zip(
             $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
             $text_id_esc,
             2 # = structure serialization
           )->reset;
         };

         # ~ write tokens ~
         unless ($skip_inline_tokens || $tokens->empty) {
           $tokens->to_zip(
             $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
             $text_id_esc,
             $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
           )->reset;
         };

         # reinit.
         $dir = '';

         # Maybe not necessary
         $data->reset;

         next MAIN;
       };


       # ~ whitespace handling ~

       # Fix whitespaces (see notes on whitespace fixing)

       # TODO:
       #   Maybe it's best, to keep the stripping of whitespace and
       #   to just remove the if-clause and to insert a blank by default
       #   (with possibly an option on how newlines in primary text should
       #   be handled (stripped or replaced by a whitespace)).

       # Remove consecutive whitespace at beginning and end (mostly one newline)
       s/^\s+//; s/\s+$//;

       # NOTE:
       #   this is only relevant, if a text consists of more than one line

       # TODO:
       #   find a better solution, or create a warning, if a text has more
       #   than one line ($text_line > 1)

       # TODO:
       #   do testing with 2 different corpora
       #   (one with only one-line texts, the other with several lines per text)

       # line contains at least one tag with at least one character contents
       if (m/<[^>]+>[^<]/) {

         # Increment counter for text lines
         $text_line++;

         # insert blank before 1st character
         #(for 2nd line and consecutive lines)
         s/^(.)/ $1/ if $text_line > 1;
       }

       # add line to buffer
       $text_buffer .= $_;
     };

     # Only reached at the end of the input, because a closed text body
     # leaves the loop above with 'next MAIN': the collected lines are
     # dropped, so at least report the incomplete text.
     $log->warn(
       "input line number $body_start: " .
         "text body opened with tag '${_TEXT_BODY}' is not closed " .
         'before the end of the input => skipping this text'
     );

   } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

     # ~ start of header ~
     my $content = "$2\n";

     # Nothing but whitespace is allowed in front of the header tag
     if ($1 !~ /^\s*$/) {
       die $log->fatal(
         "input line number $.: " .
           'line with opening header tag is not in expected format ... ' .
           "=> Aborting (line=$_)");
     };

     # Parse header (reads the remaining header lines from $input_fh)
     my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

     # Header was parseable
     if ($header) {

       # Write header to zip
       my $file = $header->dir . '/' . $header_file . '.xml';

       $log->debug("Writing file $file") if DEBUG;

       $header->to_zip($zipper->new_stream($file));

       # Header is for text level
       if ($header->type eq 'text') {

         # Remember dir and sigles
         $dir         = $header->dir;
         $text_id_esc = $header->id_esc;

         # log output for seeing progression
         $log->notice("$0: text_id=$text_id_esc");

         # Reset counter for text lines
         # (needed for whitespace handling)
         $text_line = 0;
       };
     };
   };
 };

 # All texts are processed: finish the zip output first,
 # then shut down the tokenizer (if one was started) and the input
 $zipper->close;

 if ($ext_tok) {
   $ext_tok->close;
 };

 close($input_fh);

 exit 0;


 # Recursively called function to handle XML tree data
 # Walks the tree created by XML::CompactTree::XS and
 #  - appends the content of text and whitespace nodes to $data,
 #  - creates an annotation in $structures for every element
 #    (and also adds it to $tokens, if it is a $_TOKENS_TAG element and
 #    inline tokens are not skipped),
 #  - sets the from-, to- and level-values of each annotation.
 #
 # Arguments:
 #   1. recursion level
 #      (1 = topmost level inside descend() = should always be level of tag $_TEXT_BODY)
 #   2. array reference of sibling nodes
 #      (see notes on how 'XML::CompactTree::XS' works)
 #
 # Dies on node types other than element, text and significant whitespace.
 sub descend {
   my ($depth, $nodes) = @_;

   # Iteration through all sibling nodes
   # See 'NODE TYPES' in manpage of XML::LibXML::Reader
   foreach my $e (@{$nodes}) {

     # Element node
     if ($e->[0] == XML_READER_TYPE_ELEMENT) {

       #~~~~
       # from here: tag-node (opening)
       #~~~~

       # Get the child index depending on the debug state
       # (with XCT_LINE_NUMBERS, which is only set for debugging,
       # the line number precedes the children).
       # This is likely to be optimized away by the compiler.
       my $children = $e->[DEBUG ? 5 : 4];

       # $e->[1] represents the tag name
       # Skip sentences (they are provided by the tokenizer instead),
       # but still process their content
       if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
         descend($depth+1, $children) if defined $children;
         next;
       }

       my $anno = $structures->add_new_annotation($e->[1]);

       # Add element also to token list
       if (!$skip_inline_tokens && $e->[1] eq $_TOKENS_TAG) {
         $tokens->add_annotation($anno);
       };

       # Handle attributes (if attributes exist)
       if (defined $e->[3]) {

         # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
         #  [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
         # note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
         my $attributes = $e->[3];

         # '$i' is the index of the 'key' and '$i + 1' the index of the 'value'
         for (my $i = 0; $i < @{$attributes}; $i += 2) {
           $anno->add_attribute(
             @{$attributes}[$i, $i + 1]
           );
         };
       };

       # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
       $anno->set_from($data->position + $add_one);


       #~~~~
       # until here: tag-node (opening)
       #~~~~


       # Call function recursively
       # do no recursion, if $children is not defined
       # (because we have no array of child-nodes, e.g.: <back/>)
       descend($depth+1, $children) if defined $children;


       #~~~~~
       # from here: tag-node (closing)
       #~~~~~

       # NOTE: use $pos, because the offsets are _between_ the characters
       # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
       my $pos = $data->position;

       # Handle structures and tokens

       # From-value as set when the tag was opened
       # (all checks below refer to this uncorrected value)
       my $from = $anno->from;

       # ~ whitespace related issue ~
       if ($from > 0 && not exists $ws{$from - 1}) {

         # ~ previous node was a text-node ~
         $anno->set_from($from - 1);
       };

       # in case this fails, check input
       if (($from - 1) > $pos) {
         die $log->fatal(
           "text_id='$text_id_esc', " .
             'processing of structures: ' .
             "from-value ($from) is 2 or more greater " .
             "than to-value ($pos) => please check. Aborting"
           );
       };

       # TODO:
       #   find example for which this case applies
       #   maybe this is not necessary anymore, because the
       #   above recorrection of the from-value suffices
       #
       # TODO:
       #   check, if it's better to remove this line and
       #   change above check to 'if ($from - 1) >= $pos;
       #   do testing with bigger corpus excerpt (wikipedia?)
       $anno->set_from($pos) if $from == $pos + 1;
       $anno->set_to($pos);
       $anno->set_level($depth);

       # Clean up whitespace
       delete $ws{$from  - 1} if $from > 0 && exists $ws{$from - 1};


       #~~~~
       # until here: tag-node (closing)
       #~~~~
     }

     # Text node
     elsif ($e->[0] == XML_READER_TYPE_TEXT){

       # The next opened tag starts one position behind the current one
       $add_one = 1;
       $data->append($e->[1]);
     }

     # Whitespace node
     # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
     elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

       # state, that this from-index belongs to a whitespace-node
       #  ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
       $ws{$data->position}++;

       $add_one = 0;
       $data->append($e->[1]);
     }

     # not yet handled type
     else {

       die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
     };
   };

   return;
 };


 __END__

 =pod

 =encoding utf8

 =head1 NAME

 tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML

 =head1 SYNOPSIS

   cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip

 =head1 DESCRIPTION

 C<tei2korapxml> is a script to convert TEI P5 and
 L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
 based documents to the
 L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
 If no specific input is defined, data is
 read from C<STDIN>. If no specific output is defined, data is written
 to C<STDOUT>.

 This program is usually called from inside another script.

 =head1 FORMATS

 =head2 Input restrictions

 =over 2

 =item

 TEI P5 formatted input with certain restrictions:

 =over 4

 =item

 B<mandatory>: text-header with integrated textsigle, text-body

 =item

 B<optional>: corp-header with integrated corpsigle,
 doc-header with integrated docsigle

 =back

 =item

 All tokens inside the primary text may not be
 newline separated, because newlines are removed
 (see L<KorAP::XML::TEI::Data>) and a conversion of newlines
 into blanks between 2 tokens could lead to additional blanks,
 where there should be none (e.g.: punctuation characters like C<,> or
 C<.> should not be separated from their predecessor token).
 (see also code section C<~ whitespace handling ~>).

 =back

 =head2 Notes on the output

 =over 2

 =item

 zip file output (default on C<stdout>) with utf8 encoded entries
 (which together form the KorAP-XML format)

 =back

 =head1 INSTALLATION

 C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
 these bindings are available, the preferred way to install the script is
 to use L<cpanm|App::cpanminus>.

   $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git

 In case everything went well, the C<tei2korapxml> tool will
 be available on your command line immediately.

 Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.

 =head1 OPTIONS

 =over 2

 =item B<--root|-r>

 The root directory for output. Defaults to C<.>.

 =item B<--help|-h>

 Print help information.

 =item B<--version|-v>

 Print version information.

 =item B<--tokenizer-call|-tc>

 Call an external tokenizer process, that will tokenize
 a single line from STDIN and outputs one token per line.

 =item B<--tokenizer-korap|-tk>

 Use the standard KorAP/DeReKo tokenizer.

 =item B<--tokenizer-internal|-ti>

 Tokenize the data using two embedded tokenizers,
 that will take an I<aggressive> and a I<conservative>
 approach.

 =item B<--skip-inline-tokens>

 Boolean flag indicating that inline tokens should not
 be processed. Defaults to false (meaning inline tokens will be processed).

 =item B<--inline-tokens> <foundry>#[<file>]

 Define the foundry and file (without extension)
 to store inline token information in.
 If L</KORAPXMLTEI_INLINE> is set, this will contain
 annotations as well.
 Defaults to C<tokens> and C<morpho>.

 =item B<--inline-structures> <foundry>#[<file>]

 Define the foundry and file (without extension)
 to store inline structure information in.
 Defaults to C<struct> and C<structure>.

 =item B<--base-foundry> <foundry>

 Define the base foundry to store newly generated
 token information in.
 Defaults to C<base>.

 =item B<--data-file> <file>

 Define the file (without extension)
 to store primary data information in.
 Defaults to C<data>.

 =item B<--header-file> <file>

 Define the file name (without extension)
 to store header information on
 the corpus, document, and text level in.
 Defaults to C<header>.

 =item B<--use-tokenizer-sentence-splits|-s>

 Replace existing sentence boundary information with, or add new
 sentence boundary information from, the KorAP tokenizer (currently the
 only tokenizer supporting this, so C<--tokenizer-korap> is required).

 =item B<--tokens-file> <file>

 Define the file (without extension)
 to store generated token information in
 (either from the KorAP tokenizer or an externally called tokenizer).
 Defaults to C<tokens>.

 =item B<--log|-l>

 Loglevel for I<Log::Any>. Defaults to C<notice>.

 =back

 =head1 ENVIRONMENT VARIABLES

 =over 2

 =item B<KORAPXMLTEI_DEBUG>

 Activate minimal debugging.
 Defaults to C<false>.

 =item B<KORAPXMLTEI_INLINE>

 Process inline annotations, if present.
 Defaults to C<false>.

 =back

 =head1 COPYRIGHT AND LICENSE

 Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>

 Author: Peter Harders

 Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober

 L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
 Corpus Analysis Platform at the
 L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
 member of the
 L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.

 This program is free software published under the
 L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.

 =cut

 # NOTES

 ##  Notes on how 'XML::CompactTree::XS' works

 Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>

 Print out name of 'node2' for the above example:

 echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'

 Exploring the structure of $data ( = reference to below array ):

 [ 0: XML_READER_TYPE_DOCUMENT,
   1: ?
   2: [ 0: [ 0: XML_READER_TYPE_ELEMENT                     <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
             1: 'node'
             2: ?
             3: HASH (attributes)
             4: 1 (line number)
             5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
                       1: 'node1'
                       2: ?
                       3: undefined (no attributes)
                       4: 1 (line number)
                       5: [ 0: [ 0: XML_READER_TYPE_TEXT
                                 1: 'some '
                               ]
                            1: [ 0: XML_READER_TYPE_ELEMENT
                                 1: 'n'
                                 2: ?
                                 3: undefined (no attributes)
                                 4: 1 (line number)
                                 5: undefined (no child-nodes)
                               ]
                            2: [ 0: XML_READER_TYPE_TEXT
                                 1: ' text'
                               ]
                          ]
                     ]
                  1: [ 0: XML_READER_TYPE_ELEMENT
                       1: 'node2'
                       2: ?
                       3: undefined (not attributes)
                       4: 1 (line number)
                       5: [ 0: [ 0: XML_READER_TYPE_TEXT
                                 1: 'more-text'
                               ]
                          ]
                     ]
                ]
           ]
      ]
 ]

 $data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)

 ref($data->[2])                                                         == ARRAY (with 1 element for 'node')
 ref($data->[2]->[0])                                                    == ARRAY (with 6 elements)

 $data->[2]->[0]->[0]                                                    == 1 (=> type == XML_READER_TYPE_ELEMENT)
 $data->[2]->[0]->[1]                                                    == 'node'
 ref($data->[2]->[0]->[3])                                               == HASH  (=> ${$data->[2]->[0]->[3]}{a} == 'v')
 $data->[2]->[0]->[4]                                                    == 1 (line number)
 ref($data->[2]->[0]->[5])                                               == ARRAY (with 2 elements for 'node1' and 'node2')
                                                                                    # child-nodes of actual node (see $children)

 ref($data->[2]->[0]->[5]->[0])                                          == ARRAY (with 6 elements)
 $data->[2]->[0]->[5]->[0]->[0]                                          == 1 (=> type == XML_READER_TYPE_ELEMENT)
 $data->[2]->[0]->[5]->[0]->[1]                                          == 'node1'
 $data->[2]->[0]->[5]->[0]->[3]                                          == undefined (=> no attribute)
 $data->[2]->[0]->[5]->[0]->[4]                                          == 1 (line number)
 ref($data->[2]->[0]->[5]->[0]->[5])                                     == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')

 ref($data->[2]->[0]->[5]->[0]->[5]->[0])                                == ARRAY (with 2 elements)
 $data->[2]->[0]->[5]->[0]->[5]->[0]->[0]                                == 3 (=> type ==  XML_READER_TYPE_TEXT)
 $data->[2]->[0]->[5]->[0]->[5]->[0]->[1]                                == 'some '

 ref($data->[2]->[0]->[5]->[0]->[5]->[1])                                == ARRAY (with 5 elements)
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[0]                                == 1 (=> type == XML_READER_TYPE_ELEMENT)
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[1]                                == 'n'
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[3]                                == undefined (=> no attribute)
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[4]                                == 1 (line number)
 $data->[2]->[0]->[5]->[0]->[5]->[1]->[5]                                == undefined (=> no child-nodes)

 ref($data->[2]->[0]->[5]->[0]->[5]->[2])                                == ARRAY (with 2 elements)
 $data->[2]->[0]->[5]->[0]->[5]->[2]->[0]                                == 3 (=> type ==  XML_READER_TYPE_TEXT)
 $data->[2]->[0]->[5]->[0]->[5]->[2]->[1]                                == ' text'


 descend() is called with the recursion level and an array reference (initially $tree_data->[2]), which corresponds to
 $data->[2] in the above example. Hence, the list of nodes it iterates over corresponds to @{$data->[2]}, $e to
 $data->[2]->[0] and $e->[0] to $data->[2]->[0]->[0].


 ## Notes on whitespace handling

 Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
 (see function 'descend()').

 Definition of significant and insignificant whitespace
 (source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):

 Significant whitespace is part of the document content and should be preserved.
 Insignificant whitespace is used when editing XML documents for readability.
 These whitespaces are typically not intended for inclusion in the delivery of the document.

 ### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE

 The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
  'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.

 When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
  '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
  (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):

 echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'


 Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'

 Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
  'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).

 The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
  its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').

 The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
  enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:

 When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
  So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
  the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
  the last read 'non-tag'-node has to be corrected (see [1]).

 For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
  additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).

 [1]
 Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
  In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
  (see above code fragment '... not exists $ws{ $from - 1 } ...').

 [2]
 Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a
  whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).

 The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
  (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?

 Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
  and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.


 ## Notes on whitespace fixing

 The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
  into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.

 It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
  example further down and notes on 'Input restrictions' in the manpage).

 Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.

 Examples (how primary text with linebreaks would be converted by below code):

   '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
   '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,<w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.

 Blanks are inserted before the 1st character:

  NOTE: not stringent ('...' stands for text):

    beg1............................end1  => no blank before 'beg1'
    beg2....<pb/>...................end2  => no blank before 'beg2'
    beg3....<info attr1="val1"/>....end3  => no blank before 'beg3'
    beg4....<test>ok</test>.........end4  =>    blank before 'beg4'

      =>  beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
                                                                             ^
                                                                             |_blank between 'end3' and 'beg4'


 ## Notes on segfault prevention

 binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
 (see notes on 'PerlIO layers' in  'man XML::LibXML'),
 removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
 see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
 see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.