blob: 785621ea29c9fdffc77df5d5ec9b9e8b0f4dc917 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);
use Encode qw(decode);
use XML::CompactTree::XS;
use XML::LibXML::Reader;
use FindBin;
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Annotations::Collector;
use KorAP::XML::TEI::Data;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
eval {
require KorAP::XML::TEI::Tokenizer::KorAP;
1;
};
our $VERSION = '1.00';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
# Set to 1 for minimal more debug output (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
# Parse options from the command line
GetOptions(
"root|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
"input|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
'tokenizer-call|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
'tokenizer-korap|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
'tokenizer-internal|ti' => \(my $tokenizer_intern), # use intern tokenization (default = no)
'use-tokenizer-sentence-splits|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'log|l=s' => \(my $log_level = 'notice'),
'help|h' => sub {
pod2usage(
-verbose => 99,
-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
-msg => $VERSION_MSG,
-output => '-'
)
},
'version|v' => sub {
pod2usage(
-verbose => 0,
-msg => $VERSION_MSG,
-output => '-'
)
}
);
binmode(STDERR, ":encoding(UTF-8)");
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;
#
# ~~~ parameter (mandatory) ~~~
#
my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
# optional
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and evtl. add an '.*' between them
# optional
my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analog
# mandatory
my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analog
## extern tokenization
my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it");
}
my $ext_tok;
if ($tokenizer_call) {
$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}
elsif ($tokenizer_korap) {
$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};
my $_tok_file_ext = "tokens.xml";
##
#
# ~~~ constants ~~~
#
## intern tokenization
my $_GEN_TOK_INT = $tokenizer_intern; # simple tokenization (recommended for testing)
my $_tok_file_con = "tokens_conservative.xml";
my $_tok_file_agg = "tokens_aggressive.xml";
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##
my $_tok_dir = "base"; # name of directory for storing tokenization files
my $_header_file = "header.xml"; # name of files containing the text, document and corpus header
my $_data_file = "data.xml"; # name of file containing the primary text data (tokens)
my $_structure_dir = "struct"; # name of directory containing the $_structure_file
my $_structure_file = "structure.xml"; # name of file containing all tags (except ${_TOKEN_TAG}'s) related information
# (= their names and byte offsets in $_data)
## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKEN_TAG}'s (default: 1)
# Name of the directory and the file containing all inline token informations
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
$_tokens_file .= '.xml';
my $_TOKENS_TAG = "w"; # name of tag containing all information stored in $_tokens_file
# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
#
# ~~~ variables ~~~
#
# Initialize Token- and Structure-Collector
my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;
# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
my $input_fh; # input file handle (default: stdin)
my $dir; # text directory (below $_root_dir)
my ( $text_id,
$text_id_esc ); # '$text_id_esc' = escaped version of $text_id
my ( $reader, # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
$tree_data ); # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
# these are only used inside recursive function 'retr_info'
my ( $_IDX, # value is set dependent on DEBUG - for extracting array of child elements from element in $tree_data
$e, # element from $tree_data
## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
$add_one, # ...
$fval, # ...
%ws); # hash for indices of whitespace-nodes (needed to recorrect from-values)
# idea: when closing element, check if it's from-index minus 1 refers to a whitespace-node
# (means: 'from-index - 1' is a key in %ws).
# if this is _not_ the case, then the from-value is one to high => correct it by substracting 1
my $c; # index variables used in loops
#
# ~~~ main ~~~
#
# ~ initializations ~
# Include line numbers in elements of $tree_data for debugging
DEBUG ? ($_IDX = 5) : ($_IDX = 4);
$fval = 0;
# Normalize regex for header parsing
for ($_CORP_HEADER_BEG,
$_DOC_HEADER_BEG,
$_TEXT_HEADER_BEG) {
s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
};
# ~ read input and write output (text by text) ~
my ( $pfx, $sfx );
my $tl = 0; # text line (needed for whitespace handling)
$input_fh = *STDIN; # input file handle (default: stdin)
# Maybe not necessary
$data->reset;
$dir = "";
if ( $input_fname ne '' ){
unless (open($input_fh, '<', $input_fname)) {
die $log->fatal("File '$input_fname' could not be opened.");
};
}
# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
my $pos;
my $input_enc = 'UTF-8';
my $l = length('</' . $_TEXT_BODY) + 1;
# ~ loop (reading input document) ~
MAIN: while ( <$input_fh> ){
$_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
# Set input encoding
if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
$input_enc = $2;
next;
};
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
# ~ start of text body ~
$pfx = $1;
$sfx = $2;
if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with opening text-body tag '${_TEXT_BODY}' " .
"contains additional information ... => Aborting (line=$_)");
};
# text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
my $buf_in = '';
# Iterate over all lines in the text body
while (<$input_fh>) {
$_ = remove_xml_comments( $input_fh, $_ );
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
# ~ end of text body ~
if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
# write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)
if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with closing text-body tag '${_TEXT_BODY}'".
" contains additional information ... => Aborting (line=$_)");
};
if ($dir ne "") {
$reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );
# See notes on whitespace handling
my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
# XCT_LINE_NUMBERS is only needed for debugging
# (see XML::CompactTree::XS)
$param |= XCT_LINE_NUMBERS if DEBUG;
$tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
$structures->reset;
$tokens->reset if $_TOKENS_PROC;
# ~ whitespace related issue ~
$add_one = 0;
%ws = ();
# ~ recursion ~
retr_info(1, \$tree_data->[2] ); # parse input data
if (DEBUG) {
$log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
};
# ~ write data.xml ~
$data->to_zip(
$zipper->new_stream("$dir/${_data_file}"),
$text_id_esc
);
# ~ tokenization ~
if ($_GEN_TOK_EXT) {
# Tokenize and output
$ext_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
$text_id_esc
);
};
if ($_GEN_TOK_INT) {
# Tokenize and output
$cons_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
$text_id_esc
);
$aggr_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
$text_id_esc
);
$aggr_tok->reset;
$cons_tok->reset;
};
if ($use_tokenizer_sentence_splits) {
$ext_tok->sentencize_from_previous_input($structures);
}
# ~ write structures ~
if (!$structures->empty) {
$structures->to_zip(
$zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
$text_id_esc,
2 # = structure serialization
);
};
# ~ write tokens ~
if ($_TOKENS_PROC && !$tokens->empty) {
$tokens->to_zip(
$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
$text_id_esc,
$_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
);
};
$dir = ""; # reinit.
# Maybe not necessary
$data->reset;
} else { # $dir eq ""
$log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
}
next MAIN;
};
# ~ inside text body ~
# ~ whitespace handling ~
# Fix whitespaces (see notes on whitespace fixing)
# TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
# an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
# Remove consecutive whitespace at beginning and end (mostly one newline)
s/^\s+//; s/\s+$//;
### NOTE: this is only relevant, if a text consists of more than one line
### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents
$tl++; # counter for text lines
s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
}
###
# add line to buffer
$buf_in .= $_;
};
} elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {
# ~ start of header ~
$pfx = $1;
my $content = "$2\n";
if ($pfx !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with opening header tag" .
" is not in expected format ... => Aborting (line=$_)");
};
# Parse header
my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
# Header was parseable
if ($header) {
# Write header to zip
my $file = $header->dir . '/' . $_header_file;
$log->debug("Writing file $file") if DEBUG;
$header->to_zip($zipper->new_stream($file));
# Header is for text level
if ($header->type eq 'text') {
# Remember dir and sigles
$dir = $header->dir;
$text_id = $header->id;
$text_id_esc = $header->id_esc;
# log output for seeing progression
$log->notice("$0: main(): text_id=$text_id");
$tl = 0; # reset (needed for ~ whitespace handling ~)
}
}
}
} #end: while
$zipper->close;
$ext_tok->close if $_GEN_TOK_EXT;
exit(0);
# Recursively called function to handle XML tree data
sub retr_info {
# recursion level
# (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
my $rl = shift;
my $dummy_anno;
if ($use_tokenizer_sentence_splits) {
$dummy_anno = $structures->new_dummy_annotation;
}
# Iteration through all array elements
# ($_[0] is a reference to an array reference)
# See notes on how 'XML::CompactTree::XS' works and
# see 'NODE TYPES' in manpage of XML::LibXML::Reader
foreach $e (@{${$_[0]}}) {
# Element node
if ($e->[0] == XML_READER_TYPE_ELEMENT) {
#~~~~
# from here: tag-node (opening)
#~~~~
my $anno;
# $e->[1] represents the tag name
if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
$anno = $dummy_anno;
} else {
$anno = $structures->add_new_annotation($e->[1]);
}
# Add element also to token list
if ($_TOKENS_PROC && $e->[1] eq $_TOKENS_TAG) {
$tokens->add_annotation($anno);
};
# Handle attributes (if attributes exist)
if (defined $e->[3]) {
# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
for ($c = 0; $c < @{$e->[3]}; $c += 2) {
# '$c' references the 'key' and '$c+1' the 'value'
$anno->add_attribute(
@{$e->[3]}[$c, $c + 1]
);
};
};
# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
$anno->set_from($data->position + $add_one);
#~~~~
# until here: tag-node (opening)
#~~~~
# Call function recursively
# do no recursion, if $e->[$_IDX] is not defined
# (because we have no array of child-nodes, e.g.: <back/>)
if (defined $e->[$_IDX]) {
# Recursion with array of child-nodes
retr_info($rl+1, \$e->[$_IDX]);
}
#~~~~~
# from here: tag-node (closing)
#~~~~~
# NOTE: use $pos, because the offsets are _between_ the characters
# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
my $pos = $data->position;
# Handle structures and tokens
$fval = $anno->from;
# ~ whitespace related issue ~
if ($fval > 0 && not exists $ws{$fval - 1}) {
# ~ previous node was a text-node ~
$anno->set_from($fval - 1);
}
# in case this fails, check input
if (($fval - 1) > $pos) {
die $log->fatal("text_id='$text_id', " .
"processing of structures: " .
"from-value ($fval) is 2 or more greater " .
"than to-value ($pos) => please check. Aborting");
};
# TODO: find example for which this case applies
# maybe this is not necessary anymore, because the above recorrection of the from-value suffices
#
# TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
# do testing with bigger corpus excerpt (wikipedia?)
$anno->set_from($pos) if $fval == $pos + 1;
$anno->set_to($pos);
$anno->set_level($rl);
# Clean up whitespace
delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};
#~~~~
# until here: tag-node (closing)
#~~~~
}
# Text node
elsif ($e->[0] == XML_READER_TYPE_TEXT){
$add_one = 1;
$data->append($e->[1]);
}
# Whitespace node
# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
# state, that this from-index belongs to a whitespace-node
# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
$ws{$data->position}++;
$add_one = 0;
$data->append($e->[1]);
}
# not yet handled type
else {
die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
};
};
};
__END__
=pod
=encoding utf8
=head1 NAME
tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
=head1 SYNOPSIS
cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
=head1 DESCRIPTION
C<tei2korapxml> is a script to convert TEI P5 and
L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
based documents to the
L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
If no specific input is defined, data is
read from C<STDIN>. If no specific output is defined, data is written
to C<STDOUT>.
This program is usually called from inside another script.
=head1 FORMATS
=head2 Input restrictions
=over 2
=item
TEI P5 formatted input with certain restrictions:
=over 4
=item
B<mandatory>: text-header with integrated textsigle, text-body
=item
B<optional>: corp-header with integrated corpsigle,
doc-header with integrated docsigle
=back
=item
All tokens inside the primary text may not be
newline seperated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be seperated from their predecessor token).
(see also code section C<~ whitespace handling ~>).
=back
=head2 Notes on the output
=over 2
=item
zip file output (default on C<stdout>) with utf8 encoded entries
(which together form the KorAP-XML format)
=back
=head1 INSTALLATION
C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
these bindings are available, the preferred way to install the script is
to use L<cpanm|App::cpanminus>.
$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
In case everything went well, the C<tei2korapxml> tool will
be available on your command line immediately.
Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
=head1 OPTIONS
=over 2
=item B<--root|-r>
The root directory for output. Defaults to C<.>.
=item B<--help|-h>
Print help information.
=item B<--version|-v>
Print version information.
=item B<--tokenizer-call|-tc>
Call an external tokenizer process, that will tokenize
a single line from STDIN and outputs one token per line.
=item B<--tokenizer-korap|-tk>
Use the standard KorAP/DeReKo tokenizer.
=item B<--tokenizer-internal|-ti>
Tokenize the data using two embedded tokenizers,
that will take an I<Aggressive> and a I<conservative>
approach.
=item B<--inline-tokens> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline token information in.
If L</KORAPXMLTEI_INLINE> is set, this will contain
annotations as well.
Defaults to C<tokens> and C<morpho>.
=item B<--use-tokenizer-sentence-splits|-s>
Replace existing with, or add new, sentence boundary information
provided by the KorAP tokenizer (currently supported only).
=item B<--log|-l>
Loglevel for I<Log::Any>. Defaults to C<notice>.
=back
=head1 ENVIRONMENT VARIABLES
=over 2
=item B<KORAPXMLTEI_DEBUG>
Activate minimal debugging.
Defaults to C<false>.
=item B<KORAPXMLTEI_INLINE>
Process inline annotations, if present.
Defaults to C<false>.
=back
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: Peter Harders
Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
=cut
# NOTES
## Notes on how 'XML::CompactTree::XS' works
Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
Print out name of 'node2' for the above example:
echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
Exploring the structure of $data ( = reference to below array ):
[ 0: XML_READER_TYPE_DOCUMENT,
1: ?
2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see main(): retr_info( \$tree_data->[2] ))
1: 'node'
2: ?
3: HASH (attributes)
4: 1 (line number)
5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
1: 'node1'
2: ?
3: undefined (no attributes)
4: 1 (line number)
5: [ 0: [ 0: XML_READER_TYPE_TEXT
1: 'some '
]
1: [ 0: XML_READER_TYPE_ELEMENT
1: 'n'
2: ?
3: undefined (no attributes)
4: 1 (line number)
5: undefined (no child-nodes)
]
2: [ 0: XML_READER_TYPE_TEXT
1: ' text'
]
]
]
1: [ 0: XML_READER_TYPE_ELEMENT
1: 'node2'
2: ?
3: undefined (not attributes)
4: 1 (line number)
5: [ 0: [ 0: XML_READER_TYPE_TEXT
1: 'more-text'
]
]
]
]
]
]
]
$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
ref($data->[2]) == ARRAY (with 1 element for 'node')
ref($data->[2]->[0]) == ARRAY (with 6 elements)
$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
$data->[2]->[0]->[1] == 'node'
ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
$data->[2]->[0]->[4] == 1 (line number)
ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
# child-nodes of actual node (see $_IDX)
ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
$data->[2]->[0]->[5]->[0]->[1] == 'node1'
$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
## Notes on whitespace handling
Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
(see function 'retr_info()').
Definition of significant and insignificant whitespace
(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
Significant whitespace is part of the document content and should be preserved.
Insignificant whitespace is used when editing XML documents for readability.
These whitespaces are typically not intended for inclusion in the delivery of the document.
### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
When modifiying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and it's type is '14'
(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
it's from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
the last read 'non-tag'-node, was a actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
the last read 'non-tag'-node has to be corrected (see [1]),
For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
[1]
Now, what happens, when 2 text-nodes are _not_ seperated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
(see above code fragment '... not exists $ws{ $fval - 1 } ...').
[2]
Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a
whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
and to-indizes for the tags 'a' and 'b' both 12, which is the start-index of the token 'tok3'.
## Notes on whitespace fixing
The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks where inserted
into the text with the addition that maybe (or not) whitespace before those linebreaks was unintenionally stripped.
It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
example further down and notes on 'Input restrictions' in the manpage).
Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
Examples (how primary text with linebreaks would be converted by below code):
'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,<w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
Blanks are inserted before the 1st character:
NOTE: not stringent ('...' stands for text):
beg1............................end1 => no blank before 'beg1'
beg2....<pb/>...................end2 => no blank before 'beg2'
beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
beg4....<test>ok</test>.........end4 => blank before 'beg4'
=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
^
|_blank between 'end3' and 'beg4'
## Notes on segfault prevention
binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside 'main()'
(see notes on 'PerlIO layers' in 'man XML::LibXML'),
removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.