blob: 80b0596be01388739ac763d21faa65ec0ac56659 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);
use Encode qw(decode);
use XML::CompactTree::XS;
use XML::LibXML::Reader;
use FindBin;
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Annotations::Collector;
use KorAP::XML::TEI::Data;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
# Optional dependency: try to load the KorAP tokenizer wrapper.
# A failing 'require' is swallowed by the eval, so the script still starts
# when the module cannot be loaded. The class is only instantiated when
# the --tokenizer-korap option is given.
eval {
require KorAP::XML::TEI::Tokenizer::KorAP;
1;
};
# Script version and the banner that is handed to pod2usage()
our $VERSION     = '1.01';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set KORAPXMLTEI_DEBUG to a true value for minimal additional debug output
# (deliberately taken from the environment, no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Flags passed to XML::CompactTree::XS::readSubtreeToPerl().
# XCT_LINE_NUMBERS is only needed for debugging
# (see XML::CompactTree::XS)
use constant XCT_PARAM => (
  XCT_DOCUMENT_ROOT
    | XCT_IGNORE_COMMENTS
    | XCT_ATTRIBUTE_ARRAY
    | ($ENV{KORAPXMLTEI_DEBUG} ? XCT_LINE_NUMBERS : 0)
);
# Option values, initialised with their defaults
my $root_dir             = '.';
my $input_fname          = '';
my $tokenizer_call;
my $tokenizer_korap;
my $tokenizer_intern;
my $use_tokenizer_sentence_splits;
my $inline_tokens        = 'tokens#morpho';
my $inline_structures    = 'struct#structure';
my $skip_inline_tokens   = 0;
my $skip_inline_tags_str = '';
my $base_dir             = 'base';
my $data_file            = 'data';
my $header_file          = 'header';
my $tokens_file          = 'tokens';
my $log_level            = 'notice';

# Handler for --help: selected POD sections preceded by the version banner
my $show_help = sub {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Handler for --version: version banner with the least verbose usage output
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'root|r=s'                        => \$root_dir,
  'input|i=s'                       => \$input_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'skip-inline-tags=s'              => \$skip_inline_tags_str,
  'base-foundry=s'                  => \$base_dir,
  'data-file=s'                     => \$data_file,
  'header-file=s'                   => \$header_file,
  'tokens-file=s'                   => \$tokens_file,
  'log|l=s'                         => \$log_level,
  'help|h'                          => $show_help,
  'version|v'                       => $show_version
);
# Establish logger: messages are written to STDERR, encoded as UTF-8,
# filtered by the level given with --log
binmode STDERR, ':encoding(UTF-8)';
Log::Any::Adapter->set('Stderr', log_level => $log_level);
if (DEBUG) {
  $log->notice('Debugging is activated');
};

# Tag names steering the conversion:
#   $_TEXT_BODY  - tag (without attributes), which contains the primary text
#   $_HEADER_TAG - header tag; optional
#                  TODO: IDS-specific (and redundant)
#   $_TOKENS_TAG - name of the tag containing all information
#                  stored in $_tokens_file
my ($_TEXT_BODY, $_HEADER_TAG, $_TOKENS_TAG) = ('text', 'idsHeader', 'w');
# Define tokenizers
# Sentence splits can only be taken from the KorAP tokenizer,
# so -s without -tk cannot be honoured.
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
};

# Remember to skip certain inline tags
# (comma-separated list given with --skip-inline-tags)
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
  foreach my $tag (split /\s*,\s*/, $skip_inline_tags_str) {

    # The separator pattern does not consume blanks before the first
    # and after the last entry, so strip them explicitly
    $tag =~ s/^\s+|\s+$//g;

    # Ignore empty entries (e.g. from a leading or doubled comma)
    next if $tag eq '';

    $skip_inline_tags{$tag} = 1;
  };
};

# External tokenization
# (--tokenizer-call takes precedence over --tokenizer-korap)
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}
elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is loaded optionally at startup and a load
  # failure is swallowed there - report it here with a clear message
  # instead of failing on the constructor call below
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal(
      'KorAP tokenizer (KorAP::XML::TEI::Tokenizer::KorAP) could not be loaded'
    );
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);

  # Sentence boundaries come from the tokenizer,
  # so inline <s> elements are not recorded as structures
  if ($use_tokenizer_sentence_splits) {
    $skip_inline_tags{s} = 1;
  };
};
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;

# Name of the directory and the file containing all inline structure
# information except for $_TOKENS_TAG information.
# The option value has the form '<foundry>#[<file>]'; appending the
# default file name covers a value without '#'.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';

# A value with a trailing '#' ('<foundry>#') yields an empty file part,
# fall back to the default instead of writing a file named '.xml'
$_structure_file = 'structure' if ($_structure_file // '') eq '';

# Name of the directory and the file containing all inline token
# information, i.e. tokens of the $_TOKENS_TAG
# (written unless --skip-inline-tokens is set)
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';

# Same fallback as above for an omitted file part
$_tokens_file = 'morpho' if ($_tokens_file // '') eq '';

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;

# Initialize Token- and Structure-Collector
my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
# Text directory (below $root_dir);
# stays empty until a text-level header was read
my $dir = '';

# Escaped version of text id
my $text_id_esc;

# Element from $tree_data
my $e;

# Default encoding of the text
my $input_enc = 'UTF-8';

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;

# Text line counter (needed for whitespace handling)
my $text_line = 0;

# Hash for indices of whitespace-nodes
# (needed to recorrect from-values)
# IDEA:
#   When closing an element, check if its from-index minus 1 refers
#   to a whitespace-node (means: 'from-index - 1' is a key in %ws).
#   If this is _not_ the case, then the from-value is one
#   too high => correct it by subtracting 1
my %ws;

# Input file handle (default: stdin), replaced by --input if given
my $input_fh = *STDIN;
if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
};

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
# Reading input document
#
# The input is processed line by line. Three kinds of lines are recognized:
#   1. the XML declaration, from which the input encoding is taken,
#   2. a line opening the text body ($_TEXT_BODY): all following lines up to
#      the closing tag are collected in a buffer, parsed as a whole and
#      written to the zip stream (data, tokenization, structures, tokens),
#   3. a line opening a header ($_HEADER_TAG): the header is parsed and
#      written to the zip stream; a text-level header sets $dir and
#      $text_id_esc for the text body that follows.
# All other lines are ignored.
MAIN: while (<$input_fh>) {
# remove HTML (multi-line) comments (<!--...-->)
$_ = remove_xml_comments($input_fh, $_);
# Set input encoding
# NOTE(review): inside a bracketed character class '\1' is not a
# backreference but an octal escape, so '[^\1]+?' lazily matches
# (almost) any character up to the closing quote.
if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
$input_enc = $2;
next;
};
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
# Start of text body
# (the cheap index() check guards the more expensive regular expression)
if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
# $2 is copied before the check below, presumably because a successful
# match against /^\s*$/ replaces the capture variables
my $suffix = $2;
# The opening tag has to stand on a line of its own
if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with opening text-body tag '${_TEXT_BODY}' " .
"contains additional information ... => Aborting (line=$_)");
};
# Text body data extracted from input document ($input_fh),
# further processed by XML::LibXML::Reader
my $text_buffer = '';
# Iterate over all lines in the text body
while (<$input_fh>) {
$_ = remove_xml_comments($input_fh, $_);
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
# End of text body
if ((my $pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
# The closing tag has to stand on a line of its own as well:
# everything before and after the tag must be whitespace
if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with closing text-body tag '${_TEXT_BODY}'".
" contains additional information ... => Aborting (line=$_)");
};
# No text-level header was seen for this body ($dir is only set there),
# so there is no target directory - drop the buffered text
if ($dir eq '') {
$log->warn(
"Maybe empty textSigle => skipping this text ...\n" .
'data=' . substr($data->data, 0, 200)
);
next MAIN;
};
# The buffered body lines are wrapped in a single <text> root element
# to be parsed as one document
my $reader = XML::LibXML::Reader->new(
string => "<text>$text_buffer</text>",
huge => 1
);
my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, XCT_PARAM);
# ~ whitespace related issue ~
$add_one = 0;
%ws = ();
# Recursively parse all children
# (index 2 holds the child nodes, see notes on how 'XML::CompactTree::XS' works)
descend(1, $tree_data->[2]);
if (DEBUG) {
$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
};
# Write data.xml
$data->to_zip(
$zipper->new_stream("$dir/${data_file}.xml"),
$text_id_esc
);
# Tokenize with external tokenizer
if ($ext_tok) {
# Tokenize and output
$ext_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
$text_id_esc
);
# Let the tokenizer add its sentence boundaries to the structures,
# which are written further down
if ($use_tokenizer_sentence_splits) {
$ext_tok->sentencize_from_previous_input($structures);
};
};
# Tokenize with internal tokenizer
# (runs in addition to an external tokenizer, if both are requested)
if ($tokenizer_intern) {
# Tokenize and output
$cons_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
$text_id_esc
)->reset;
$aggr_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
$text_id_esc
)->reset;
};
# ~ write structures ~
if (!$structures->empty) {
$structures->to_zip(
$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
$text_id_esc,
2 # = structure serialization
)->reset;
};
# ~ write tokens ~
unless ($skip_inline_tokens || $tokens->empty) {
$tokens->to_zip(
$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
$text_id_esc,
$_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
)->reset;
};
# reinit.
# (an empty $dir marks that no text header is pending, see check above)
$dir = '';
# Maybe not necessary
$data->reset;
next MAIN;
};
# ~ whitespace handling ~
# Fix whitespaces (see notes on whitespace fixing)
# TODO:
# Maybe it's best, to keep the stripping of whitespace and
# to just remove the if-clause and to insert a blank by default
# (with possibly an option on how newlines in primary text should
# be handled (stripped or replaced by a whitespace)).
# Remove consecutive whitespace at beginning and end (mostly one newline)
s/^\s+//; s/\s+$//;
# NOTE:
# this is only relevant, if a text consists of more than one line
# TODO:
# find a better solution, or create a warning, if a text has more
# than one line ($text_line > 1)
# TODO:
# do testing with 2 different corpora
# (one with only one-line texts, the other with several lines per text)
# line contains at least one tag with at least one character contents
if (m/<[^>]+>[^<]/) {
# Increment counter for text lines
$text_line++;
# insert blank before 1st character
# (for 2nd line and consecutive lines)
$_ = ' ' . $_ if $text_line > 1;
}
# add line to buffer
$text_buffer .= $_;
};
}
# Start of header section
# ($1 = everything before the header tag, $2 = the header tag and the rest of the line)
elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
my $content = "$2\n";
# Only whitespace is accepted in front of the opening header tag
if ($1 !~ /^\s*$/) {
die $log->fatal(
"input line number $.: " .
'line with opening header tag is not in expected format ... ' .
"=> Aborting (line=$_)");
};
# Parse header
# (the parser is handed the input handle to read the remaining header lines)
my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
# Header was parseable
if ($header) {
# Write header to zip
my $file = $header->dir . '/' . $header_file . '.xml';
$log->debug("Writing file $file") if DEBUG;
$header->to_zip($zipper->new_stream($file));
# Header is for text level
if ($header->type eq 'text') {
# Remember dir and sigles
$dir = $header->dir;
$text_id_esc = $header->id_esc;
# log output for seeing progression
$log->notice("$0: text_id=$text_id_esc");
# Reset counter for text lines
# (needed for whitespace handling)
$text_line = 0;
};
};
};
};
# Teardown: finish the zip output, shut down an external tokenizer
# (if one was started) and close the input
$zipper->close;
$ext_tok->close if $ext_tok;
close $input_fh;
exit(0);
# Recursively called function to handle XML tree data.
#
# Walks the node list produced by XML::CompactTree::XS::readSubtreeToPerl(),
# appends text and whitespace nodes to the primary data ($data) and records
# every element as an annotation in $structures (and additionally in
# $tokens for $_TOKENS_TAG elements, unless inline tokens are skipped).
# Relies on the file-level state $add_one and %ws for the correction of
# from-offsets (see notes on whitespace handling).
#
# Parameters:
#   $depth - recursion level
#            (1 = topmost level inside descend() = should always be
#            level of tag $_TEXT_BODY)
#   $nodes - array reference holding the nodes of the current level
#
# Dies (after logging a fatal message) on inconsistent offsets and on
# node types that are not handled.
sub descend {
  my ($depth, $nodes) = @_;

  # Iteration through all array elements
  # See notes on how 'XML::CompactTree::XS' works and
  # see 'NODE TYPES' in manpage of XML::LibXML::Reader
  foreach my $e (@{$nodes}) {

    # $e->[1] represents the tag name of an element node
    # or the primary data of a text or ws node
    my $node_info = $e->[1];

    # Element node
    if ($e->[0] == XML_READER_TYPE_ELEMENT) {

      # Deal with opening tag

      # Get the child index depending on the debug state
      # (with line numbers activated the children move one slot back).
      # This is likely to be optimized away by the compiler.
      my $children = $e->[DEBUG ? 5 : 4];

      # Skip certain tags: no annotation is created for the tag itself,
      # but its content is still processed on the next level
      if ($skip_inline_tags{$node_info}) {
        descend($depth + 1, $children) if defined $children;
        next;
      };

      my $anno = $structures->add_new_annotation($node_info);

      # Add element also to token list
      if (!$skip_inline_tokens && $node_info eq $_TOKENS_TAG) {
        $tokens->add_annotation($anno);
      };

      # Handle attributes (if attributes exist)
      if (defined $e->[3]) {

        # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
        # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
        # NOTE:
        #   arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
        my $attributes = $e->[3];
        for (my $i = 0; $i < @{$attributes}; $i += 2) {
          $anno->add_attribute(
            @{$attributes}[$i, $i + 1]
          );
        };
      };

      # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);

      # Call function recursively
      # do no recursion, if $children is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      descend($depth + 1, $children) if defined $children;

      # Deal with closing tag

      # NOTE:
      #   use $pos, because the offsets are _between_ the characters
      #   (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $pos = $data->position;

      # Handle structures and tokens
      # ($from keeps the uncorrected value for all checks below)
      my $from = $anno->from;

      # ~ whitespace related issue ~
      if ($from > 0 && not exists $ws{$from - 1}) {

        # Previous node was a text-node
        $anno->set_from($from - 1);
      };

      # in case this fails, check input
      if (($from - 1) > $pos) {
        die $log->fatal(
          "text_id='$text_id_esc', " .
          'processing of structures: ' .
          "from-value ($from) is 2 or more greater " .
          "than to-value ($pos) => please check. Aborting"
        );
      };

      # TODO:
      #   find example for which this case applies
      #   maybe this is not necessary anymore, because the
      #   above recorrection of the from-value suffices
      #
      # TODO:
      #   check, if it's better to remove this line and
      #   change above check to 'if ($from - 1) >= $pos;
      #   do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($pos) if $from == $pos + 1;
      $anno->set_to($pos);
      $anno->set_level($depth);

      # Clean up whitespace
      delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};
    }

    # Text node
    elsif ($e->[0] == XML_READER_TYPE_TEXT) {
      $add_one = 1;
      $data->append($node_info);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;
      $add_one = 0;
      $data->append($node_info);
    }

    # not yet handled type
    else {
      die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
    };
  };

  return;
};
__END__
=pod
=encoding utf8
=head1 NAME
tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
=head1 SYNOPSIS
cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
=head1 DESCRIPTION
C<tei2korapxml> is a script to convert TEI P5 and
L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
based documents to the
L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
If no specific input is defined, data is
read from C<STDIN>. If no specific output is defined, data is written
to C<STDOUT>.
This program is usually called from inside another script.
=head1 FORMATS
=head2 Input restrictions
=over 2
=item
TEI P5 formatted input with certain restrictions:
=over 4
=item
B<mandatory>: text-header with integrated textsigle, text-body
=item
B<optional>: corp-header with integrated corpsigle,
doc-header with integrated docsigle
=back
=item
Tokens inside the primary text must not be
separated by newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token;
see also code section C<~ whitespace handling ~>).
=back
=head2 Notes on the output
=over 2
=item
zip file output (default on C<stdout>) with utf8 encoded entries
(which together form the KorAP-XML format)
=back
=head1 INSTALLATION
C<tei2korapxml> requires the C<libxml2-dev> bindings to build. When
these bindings are available, the preferred way to install the script is
to use L<cpanm|App::cpanminus>.
$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
In case everything went well, the C<tei2korapxml> tool will
be available on your command line immediately.
Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
=head1 OPTIONS
=over 2
=item B<--root|-r>
The root directory for output. Defaults to C<.>.
=item B<--help|-h>
Print help information.
=item B<--version|-v>
Print version information.
=item B<--tokenizer-call|-tc>
Call an external tokenizer process that tokenizes
a single line from STDIN and outputs one token per line.
=item B<--tokenizer-korap|-tk>
Use the standard KorAP/DeReKo tokenizer.
=item B<--tokenizer-internal|-ti>
Tokenize the data using two embedded tokenizers
that take an I<aggressive> and a I<conservative>
approach.
=item B<--skip-inline-tokens>
Boolean flag indicating that inline tokens should not
be processed. Defaults to false (meaning inline tokens will be processed).
=item B<--skip-inline-tags> <tags>
Expects a comma-separated list of tags to be ignored when the structure
is parsed. Content of these tags however will be processed.
=item B<--inline-tokens> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline token information in.
If L</KORAPXMLTEI_INLINE> is set, this will contain
annotations as well.
Defaults to C<tokens> and C<morpho>.
=item B<--inline-structures> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline structure information in.
Defaults to C<struct> and C<structure>.
=item B<--base-foundry> <foundry>
Define the base foundry to store newly generated
token information in.
Defaults to C<base>.
=item B<--data-file> <file>
Define the file (without extension)
to store primary data information in.
Defaults to C<data>.
=item B<--header-file> <file>
Define the file name (without extension)
to store header information on
the corpus, document, and text level in.
Defaults to C<header>.
=item B<--use-tokenizer-sentence-splits|-s>
Replace existing sentence boundary information with, or add new,
sentence boundary information provided by the KorAP tokenizer
(currently the only tokenizer supporting this).
=item B<--tokens-file> <file>
Define the file (without extension)
to store generated token information in
(either from the KorAP tokenizer or an externally called tokenizer).
Defaults to C<tokens>.
=item B<--log|-l>
Loglevel for I<Log::Any>. Defaults to C<notice>.
=back
=head1 ENVIRONMENT VARIABLES
=over 2
=item B<KORAPXMLTEI_DEBUG>
Activate minimal debugging.
Defaults to C<false>.
=item B<KORAPXMLTEI_INLINE>
Process inline annotations, if present.
Defaults to C<false>.
=back
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: Peter Harders
Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
=cut
# NOTES
## Notes on how 'XML::CompactTree::XS' works
Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
Print out name of 'node2' for the above example:
echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
Exploring the structure of $data ( = reference to below array ):
[ 0: XML_READER_TYPE_DOCUMENT,
1: ?
2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend(1, $tree_data->[2]))
1: 'node'
2: ?
3: HASH (attributes)
4: 1 (line number)
5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
1: 'node1'
2: ?
3: undefined (no attributes)
4: 1 (line number)
5: [ 0: [ 0: XML_READER_TYPE_TEXT
1: 'some '
]
1: [ 0: XML_READER_TYPE_ELEMENT
1: 'n'
2: ?
3: undefined (no attributes)
4: 1 (line number)
5: undefined (no child-nodes)
]
2: [ 0: XML_READER_TYPE_TEXT
1: ' text'
]
]
]
1: [ 0: XML_READER_TYPE_ELEMENT
1: 'node2'
2: ?
3: undefined (no attributes)
4: 1 (line number)
5: [ 0: [ 0: XML_READER_TYPE_TEXT
1: 'more-text'
]
]
]
]
]
]
]
$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
ref($data->[2]) == ARRAY (with 1 element for 'node')
ref($data->[2]->[0]) == ARRAY (with 6 elements)
$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
$data->[2]->[0]->[1] == 'node'
ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
$data->[2]->[0]->[4] == 1 (line number)
ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
# child-nodes of actual node (see $children)
ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
$data->[2]->[0]->[5]->[0]->[1] == 'node1'
$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
descend() receives, as its second argument, the array reference $tree_data->[2], which corresponds to $data->[2] in the above example.
Hence, the node list iterated in descend() corresponds to @{$data->[2]}, $e to $data->[2]->[0] and $e->[0] to
$data->[2]->[0]->[0].
## Notes on whitespace handling
Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
(see function 'descend()').
Definition of significant and insignificant whitespace
(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
Significant whitespace is part of the document content and should be preserved.
Insignificant whitespace is used when editing XML documents for readability.
These whitespaces are typically not intended for inclusion in the delivery of the document.
### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
the last read 'non-tag'-node has to be corrected (see [1]).
For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
[1]
Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
(see above code fragment '... not exists $ws{ $from - 1 } ...').
[2]
Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a
whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
## Notes on whitespace fixing
The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
example further down and notes on 'Input restrictions' in the manpage).
Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
Examples (how primary text with linebreaks would be converted by below code):
'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
Blanks are inserted before the 1st character:
NOTE: not stringent ('...' stands for text):
beg1............................end1 => no blank before 'beg1'
beg2....<pb/>...................end2 => no blank before 'beg2'
beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
beg4....<test>ok</test>.........end4 => blank before 'beg4'
=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
^
|_blank between 'end3' and 'beg4'
## Notes on segfault prevention
binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
(see notes on 'PerlIO layers' in 'man XML::LibXML'),
removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.