blob: 9a0cbf9aa586ad3c73022a04d626291254d90121 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
Marc Kupietza1421f02021-02-18 15:32:38 +010036our $VERSION = '1.00';
Peter Harders6f526a32020-06-29 21:44:41 +020037
Akrond949e182020-02-14 12:23:57 +010038our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
39
# Set the environment variable KORAPXMLTEI_DEBUG to 1 for slightly more debug output (no need to be parametrized)
41use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Hardersd892a582020-02-12 15:45:22 +010042
# Parse options from the command line.
# The option targets are declared in place, so the lexicals introduced here
# are visible to the rest of the script.
# GetOptions returns false if the command line could not be processed
# (e.g. unknown option or missing value); in that case a short usage
# message is printed and the script exits with status 2 instead of
# silently continuing with the defaults.
GetOptions(
  'root|r=s'               => \(my $_root_dir = '.'),      # name of root directory inside zip file
  'input|i=s'              => \(my $input_fname = ''),     # input file (yet only TEI I5 Format accepted)
  'tokenizer-call|tc=s'    => \(my $tokenizer_call),       # Temporary argument for testing purposes
  'tokenizer-korap|tk'     => \(my $tokenizer_korap),      # use KorAP-tokenizer
  'tokenizer-internal|ti'  => \(my $tokenizer_intern),     # use intern tokenization (default = no)
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
  'inline-tokens=s'        => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'    => \(my $inline_structures = 'struct#structure'),
  'log|l=s'                => \(my $log_level = 'notice'),
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -verbose => 0,
  -exitval => 2
);
70
# Log to STDERR (UTF-8 encoded), filtered by the level given with --log
binmode(STDERR, ":encoding(UTF-8)");
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;

#
# ~~~ parameter (mandatory) ~~~
#
my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
# optional
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and evtl. add an '.*' between them
# optional
my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analog
# mandatory
my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analog


## extern tokenization
# True, if either an external tokenizer call or the KorAP tokenizer was requested
my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;

if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
}

# External tokenizer object; --tokenizer-call takes precedence over --tokenizer-korap
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally at startup
  # (inside an eval), so make sure it is really available before using it
  unless ('KorAP::XML::TEI::Tokenizer::KorAP'->can('new')) {
    die $log->fatal("KorAP tokenizer is not available (KorAP::XML::TEI::Tokenizer::KorAP could not be loaded)");
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};

# Name of the file receiving the output of the external tokenizer
my $_tok_file_ext = "tokens.xml";
##
Peter Harders6f526a32020-06-29 21:44:41 +0200104##
105
Akron0c41ab32020-09-29 07:33:33 +0200106
#
# ~~~ constants ~~~
#

## intern tokenization

# Flag set by --tokenizer-internal: simple tokenization (recommended for testing)
my $_GEN_TOK_INT = $tokenizer_intern;

# Output file names of the conservative and the aggressive tokenizer
my ($_tok_file_con, $_tok_file_agg) = (
  'tokens_conservative.xml',
  'tokens_aggressive.xml'
);

# Tokenizer instances (reused for every text)
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##

# Name of directory for storing tokenization files
my $_tok_dir = 'base';

# Name of the files containing the text, document and corpus header, and
# name of the file containing the primary text data (tokens)
my ($_header_file, $_data_file) = ('header.xml', 'data.xml');

## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
# on/off: processing of $_TOKENS_TAG elements (default: 1)
my $_TOKENS_PROC = 1;
Akron1a5271a2021-02-18 13:18:15 +0100128
129
# Name of the directory and the file containing all inline structure informations
# except for $_TOKENS_TAG information.
# The option value has the form '<foundry>#[<file>]'. Appending the default
# file name guarantees that split() yields at least two fields; if the file
# part was given but left empty (e.g. 'struct#'), fall back to the default
# instead of producing a file that is only named '.xml'.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
$_structure_file = 'structure' if $_structure_file eq '';
$_structure_file .= '.xml';


# Name of the directory and the file containing all inline token informations
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
# (same format and same fallback as for the inline structures)
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
$_tokens_file = 'morpho' if $_tokens_file eq '';
$_tokens_file .= '.xml';

my $_TOKENS_TAG = "w"; # name of tag containing all information stored in $_tokens_file

# Handling inline annotations (inside $_TOKENS_TAG),
# activated by a true value of the environment variable KORAPXMLTEI_INLINE
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
Akron09e0b2c2020-07-28 15:57:01 +0200145
Peter Harders6f526a32020-06-29 21:44:41 +0200146
#
# ~~~ variables ~~~
#

# Collectors for the inline token and the inline structure annotations
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Collector for the primary text data
my $data = KorAP::XML::TEI::Data->new;

# Zipper, initialized with the name of the root directory
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);

# Input file handle (default: stdin)
my $input_fh;

# Text directory (below $_root_dir)
my $dir;

# Identifier of the current text and its escaped version
my $text_id;
my $text_id_esc;

# Instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
my $reader;

# Result of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
my $tree_data;

# The following variables are only used inside the recursive function 'retr_info'

# Index for extracting the array of child elements from an element
# in $tree_data (the value depends on DEBUG)
my $_IDX;

# Element from $tree_data
my $e;

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $fval;

# Hash for indices of whitespace-nodes (needed to correct from-values).
# Idea: when closing an element, check if its from-index minus 1 refers
# to a whitespace-node (means: 'from-index - 1' is a key in %ws).
# If this is _not_ the case, then the from-value is one too high
# => correct it by subtracting 1
my %ws;

# Index variable used in loops
my $c;
Peter Harders6f526a32020-06-29 21:44:41 +0200184
Peter Harders6f526a32020-06-29 21:44:41 +0200185
#
# ~~~ main ~~~
#

# ~ initializations ~

# Index of the child-node array inside an element of $tree_data.
# With DEBUG, line numbers are included in the elements of $tree_data,
# which moves the child-node array from index 4 to index 5
$_IDX = DEBUG ? 5 : 4;

$fval = 0;

# Normalize regex for header parsing:
# allow arbitrary attributes (everything except '>') directly after the tag name
for my $header_beg ($_CORP_HEADER_BEG, $_DOC_HEADER_BEG, $_TEXT_HEADER_BEG) {
  $header_beg =~ s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
};


# ~ read input and write output (text by text) ~

# Content before and after a recognized tag on the same input line
my ( $pfx, $sfx );

my $tl = 0; # text line (needed for whitespace handling)

$input_fh = *STDIN; # input file handle (default: stdin)

# Maybe not necessary
$data->reset;

# No text header seen yet
$dir = "";

# Read from the file given with --input instead of stdin
if ( $input_fname ne '' ) {

  # Report the system error ($!) as well, so the reason for the failure is visible
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened: $!");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

# Position of the closing text-body tag inside an input line
my $pos;

# Input encoding (may be overwritten by the XML declaration of the input)
my $input_enc = 'UTF-8';

# Length of the closing text-body tag including the closing '>'
my $l = length('</' . $_TEXT_BODY) + 1;
Peter Harders6f526a32020-06-29 21:44:41 +0200230
# ~ loop (reading input document) ~
#
# The input is processed line by line. Header lines (corpus, document, text)
# are handed over to KorAP::XML::TEI::Header and written to the zip stream.
# The lines between the opening and the closing text-body tag are collected,
# parsed with XML::LibXML::Reader and written as data, tokenization,
# structure and token files for the text announced by the last text header.

MAIN: while ( <$input_fh> ) {

  # Remove (multi-line) XML comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding from the XML declaration.
  # The value is everything up to the quote character that opened it.
  # (A backreference can not be negated inside a bracketed character class -
  # '\1' would be treated as an octal character escape there.)
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/ ) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ) {

    # ~ start of text body ~

    $pfx = $1;
    $sfx = $2;

    # The opening text-body tag has to stand alone on its line
    if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)

        # The closing text-body tag has to stand alone on its line
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # A text header was seen before this text body
        if ($dir ne "") {

          $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );

          # See notes on whitespace handling
          my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

          # XCT_LINE_NUMBERS is only needed for debugging
          # (see XML::CompactTree::XS)
          $param |= XCT_LINE_NUMBERS if DEBUG;
          $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);

          $structures->reset;

          $tokens->reset if $_TOKENS_PROC;

          # ~ whitespace related issue ~
          $add_one = 0;
          %ws = ();

          # ~ recursion ~
          retr_info(1, \$tree_data->[2] ); # parse input data

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
          };

          # ~ write data.xml ~
          $data->to_zip(
            $zipper->new_stream("$dir/${_data_file}"),
            $text_id_esc
          );

          # ~ tokenization ~
          if ($_GEN_TOK_EXT) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
              $text_id_esc
            );
          };

          if ($_GEN_TOK_INT) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
              $text_id_esc
            );

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
              $text_id_esc
            );

            $aggr_tok->reset;
            $cons_tok->reset;
          };

          # Let the external tokenizer contribute sentence boundaries to the structures
          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          }

          # ~ write structures ~
          if (!$structures->empty) {
            $structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          if ($_TOKENS_PROC && !$tokens->empty) {
            $tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
              $text_id_esc,
              $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
            );
          };

          $dir = ""; # reinit.

          # Maybe not necessary
          $data->reset;

        } else { # $dir eq ""

          # NOTE(review): $data is an object, so unless KorAP::XML::TEI::Data
          # overloads stringification this interpolates a reference string -
          # presumably the collected text was meant here; confirm before changing
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
        }

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
      # an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
      if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents

        $tl++; # counter for text lines

        s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    };

  } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {

    # ~ start of header ~
    $pfx = $1;
    my $content = "$2\n";

    # Nothing but whitespace is allowed in front of the opening header tag
    if ($pfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header (continues reading from the input file handle)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file;

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: main(): text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while

$zipper->close;

$ext_tok->close if $_GEN_TOK_EXT;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100449
Peter Hardersd892a582020-02-12 15:45:22 +0100450
# Recursively called function to handle XML tree data.
#
# Arguments:
#   1. the recursion level (1 = topmost level inside retr_info(),
#      should always be the level of tag $_TEXT_BODY)
#   2. a reference to an array reference of nodes, as created by
#      XML::CompactTree::XS
#
# Text and whitespace nodes are appended to $data; for every element node
# an annotation with from-value, to-value, level and attributes is recorded
# in $structures (and in $tokens for $_TOKENS_TAG elements).
# See notes on how 'XML::CompactTree::XS' works and
# see 'NODE TYPES' in manpage of XML::LibXML::Reader.
sub retr_info {

  my $rl = shift;

  # After the shift, the node list is the first remaining argument
  my $nodes_ref = $_[0];

  # With sentence splitting by the tokenizer, <s> elements of the input
  # are collected in a dummy annotation instead of the structures
  my $dummy_anno = $use_tokenizer_sentence_splits
    ? $structures->new_dummy_annotation
    : undef;

  foreach my $node (@{ $$nodes_ref }) {

    my $type = $node->[0];

    # Text node
    if ($type == XML_READER_TYPE_TEXT) {
      $add_one = 1;
      $data->append($node->[1]);
      next;
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    if ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # Remember, that this from-index belongs to a whitespace-node
      # (the counter value itself is not evaluated)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node->[1]);
      next;
    }

    # Everything else than an element node is not yet handled
    unless ($type == XML_READER_TYPE_ELEMENT) {
      die $log->fatal('Not yet handled type ($e->[0]=' . $type . ') ... => Aborting');
    }

    #~~~~
    # from here: tag-node (opening)
    #~~~~

    # The second field of an element node is the tag name
    my $tag = $node->[1];

    my $anno = ($use_tokenizer_sentence_splits && $tag eq "s")
      ? $dummy_anno
      : $structures->add_new_annotation($tag);

    # Add element also to token list
    $tokens->add_annotation($anno) if $_TOKENS_PROC && $tag eq $_TOKENS_TAG;

    # Handle attributes (if attributes exist).
    # With 'XCT_ATTRIBUTE_ARRAY', the attributes are an array reference
    # of the form [ name1, value1, name2, value2, ....]
    my $attrs = $node->[3];
    if (defined $attrs) {
      for (my $i = 0; $i < @$attrs; $i += 2) {

        # '$i' references the 'key' and '$i + 1' the 'value'
        $anno->add_attribute(@{$attrs}[$i, $i + 1]);
      }
    }

    # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
    $anno->set_from($data->position + $add_one);

    #~~~~
    # until here: tag-node (opening)
    #~~~~

    # Recursion with the array of child-nodes - skipped, if there is
    # no such array (e.g.: <back/>)
    retr_info($rl + 1, \$node->[$_IDX]) if defined $node->[$_IDX];

    #~~~~~
    # from here: tag-node (closing)
    #~~~~~

    # NOTE: use $pos, because the offsets are _between_ the characters
    # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
    my $pos = $data->position;

    # from-value as set when the tag was opened
    my $fval = $anno->from;

    # ~ whitespace related issue ~
    # If the position before the from-value is not a whitespace-node,
    # the previous node was a text-node and the from-value is one too high
    $anno->set_from($fval - 1) if $fval > 0 && !exists $ws{$fval - 1};

    # in case this fails, check input
    if (($fval - 1) > $pos) {
      die $log->fatal("text_id='$text_id', " .
                        "processing of structures: " .
                        "from-value ($fval) is 2 or more greater " .
                        "than to-value ($pos) => please check. Aborting");
    }

    # TODO: find example for which this case applies
    #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
    #
    # TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
    #   do testing with bigger corpus excerpt (wikipedia?)
    $anno->set_from($pos) if $fval == $pos + 1;
    $anno->set_to($pos);
    $anno->set_level($rl);

    # Clean up whitespace (deleting a non-existing key is a no-op)
    delete $ws{$fval - 1} if $fval > 0;

    #~~~~
    # until here: tag-node (closing)
    #~~~~
  }
}
596
Peter Harders6f526a32020-06-29 21:44:41 +0200597
Akrond949e182020-02-14 12:23:57 +0100598__END__
599
600=pod
601
602=encoding utf8
603
604=head1 NAME
605
606tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
607
608=head1 SYNOPSIS
609
610 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
611
612=head1 DESCRIPTION
613
Akronee434b12020-07-08 12:53:01 +0200614C<tei2korapxml> is a script to convert TEI P5 and
615L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
616based documents to the
617L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
618If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100619read from C<STDIN>. If no specific output is defined, data is written
620to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200621
Akrond949e182020-02-14 12:23:57 +0100622This program is usually called from inside another script.
623
Akronee434b12020-07-08 12:53:01 +0200624=head1 FORMATS
625
626=head2 Input restrictions
627
628=over 2
629
630=item
631
Akronee434b12020-07-08 12:53:01 +0200632TEI P5 formatted input with certain restrictions:
633
634=over 4
635
636=item
637
638B<mandatory>: text-header with integrated textsigle, text-body
639
640=item
641
642B<optional>: corp-header with integrated corpsigle,
643doc-header with integrated docsigle
644
645=back
646
647=item
648
Tokens inside the primary text must not be
newline-separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~>).
656
657=back
658
659=head2 Notes on the output
660
661=over 2
662
663=item
664
665zip file output (default on C<stdout>) with utf8 encoded entries
666(which together form the KorAP-XML format)
667
668=back
669
Akrond949e182020-02-14 12:23:57 +0100670=head1 INSTALLATION
671
672C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
673these bindings are available, the preferred way to install the script is
674to use L<cpanm|App::cpanminus>.
675
676 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
677
678In case everything went well, the C<tei2korapxml> tool will
679be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200680
Akrond949e182020-02-14 12:23:57 +0100681Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
682
683=head1 OPTIONS
684
685=over 2
686
Akron4e603a52020-07-27 14:23:49 +0200687=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100688
Akron4e603a52020-07-27 14:23:49 +0200689The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100690
691=item B<--help|-h>
692
693Print help information.
694
695=item B<--version|-v>
696
697Print version information.
698
Akron4e603a52020-07-27 14:23:49 +0200699=item B<--tokenizer-call|-tc>
700
Call an external tokenizer process that tokenizes
a single line from STDIN and outputs one token per line.
703
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200704=item B<--tokenizer-korap|-tk>
705
706Use the standard KorAP/DeReKo tokenizer.
707
Akron6d7b8e42020-09-29 07:37:41 +0200708=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200709
Tokenize the data using two embedded tokenizers
that take an I<aggressive> and a I<conservative>
approach, respectively.
713
Akron1a5271a2021-02-18 13:18:15 +0100714=item B<--inline-tokens> <foundry>#[<file>]
715
716Define the foundry and file (without extension)
717to store inline token information in.
718If L</KORAPXMLTEI_INLINE> is set, this will contain
719annotations as well.
720Defaults to C<tokens> and C<morpho>.
721
Akrondd0be8f2021-02-18 19:29:41 +0100722=item B<--inline-structures> <foundry>#[<file>]
723
724Define the foundry and file (without extension)
725to store inline structure information in.
726Defaults to C<struct> and C<structures>.
727
Marc Kupietz985da0c2021-02-15 19:29:50 +0100728=item B<--use-tokenizer-sentence-splits|-s>
729
Replace existing sentence boundary information with, or add new
sentence boundary information from, the output of the KorAP tokenizer
(currently the only tokenizer that supports this option).
732
Akron3378dfd2020-08-01 15:01:36 +0200733=item B<--log|-l>
734
735Loglevel for I<Log::Any>. Defaults to C<notice>.
736
Akrond949e182020-02-14 12:23:57 +0100737=back
738
Akronb3649472020-09-29 08:24:46 +0200739=head1 ENVIRONMENT VARIABLES
740
741=over 2
742
743=item B<KORAPXMLTEI_DEBUG>
744
745Activate minimal debugging.
746Defaults to C<false>.
747
748=item B<KORAPXMLTEI_INLINE>
749
750Process inline annotations, if present.
751Defaults to C<false>.
752
753=back
754
Akrond949e182020-02-14 12:23:57 +0100755=head1 COPYRIGHT AND LICENSE
756
Marc Kupietze955ecc2021-02-17 17:42:01 +0100757Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100758
759Author: Peter Harders
760
Akronaabd0952020-09-29 07:35:08 +0200761Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100762
763L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
764Corpus Analysis Platform at the
765L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
766member of the
767L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
768
769This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100770L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100771
772=cut
Akronf8088e62021-02-18 16:18:59 +0100773
774# NOTES
775
776## Notes on how 'XML::CompactTree::XS' works
777
778Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
779
780Print out name of 'node2' for the above example:
781
782echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
783
784Exploring the structure of $data ( = reference to below array ):
785
786[ 0: XML_READER_TYPE_DOCUMENT,
787 1: ?
788 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see main(): retr_info( \$tree_data->[2] ))
789 1: 'node'
790 2: ?
791 3: HASH (attributes)
792 4: 1 (line number)
793 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
794 1: 'node1'
795 2: ?
796 3: undefined (no attributes)
797 4: 1 (line number)
798 5: [ 0: [ 0: XML_READER_TYPE_TEXT
799 1: 'some '
800 ]
801 1: [ 0: XML_READER_TYPE_ELEMENT
802 1: 'n'
803 2: ?
804 3: undefined (no attributes)
805 4: 1 (line number)
806 5: undefined (no child-nodes)
807 ]
808 2: [ 0: XML_READER_TYPE_TEXT
809 1: ' text'
810 ]
811 ]
812 ]
813 1: [ 0: XML_READER_TYPE_ELEMENT
814 1: 'node2'
815 2: ?
 3: undefined (no attributes)
817 4: 1 (line number)
818 5: [ 0: [ 0: XML_READER_TYPE_TEXT
819 1: 'more-text'
820 ]
821 ]
822 ]
823 ]
824 ]
825 ]
826]
827
828$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
829
830ref($data->[2]) == ARRAY (with 1 element for 'node')
831ref($data->[2]->[0]) == ARRAY (with 6 elements)
832
833$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
834$data->[2]->[0]->[1] == 'node'
835ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
836$data->[2]->[0]->[4] == 1 (line number)
837ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
838 # child-nodes of actual node (see $_IDX)
839
840ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
841$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
842$data->[2]->[0]->[5]->[0]->[1] == 'node1'
843$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
844$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
845ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
846
847ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
848$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
849$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
850
851ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
852$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
853$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
854$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
855$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
856$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
857
858ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
859$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
860$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
861
862
863retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
864Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
865${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
866
867
868## Notes on whitespace handling
869
870Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
871(see function 'retr_info()').
872
873Definition of significant and insignificant whitespace
874(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
875
876Significant whitespace is part of the document content and should be preserved.
877Insignificant whitespace is used when editing XML documents for readability.
878These whitespaces are typically not intended for inclusion in the delivery of the document.
879
880### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
881
882The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
883 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
884
When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
888
889echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
890
891
892Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
893
894Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
895 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
896
The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
899
900The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
901 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
902
When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
 the last read 'non-tag'-node has to be corrected (see [1]).
907
908For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
909 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
910
911[1]
Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
913 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
914 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
915
916[2]
In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', the ' ' is handled as a
 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
919
920The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
921 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
922
Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
925
926
927## Notes on whitespace fixing
928
The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
931
932It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
933 example further down and notes on 'Input restrictions' in the manpage).
934
An early and rather crude approach has remained in the code; it is not consistent, but it also doesn't affect one-line text.
936
937Examples (how primary text with linebreaks would be converted by below code):
938
939 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
941
942Blanks are inserted before the 1st character:
943
944 NOTE: not stringent ('...' stands for text):
945
946 beg1............................end1 => no blank before 'beg1'
947 beg2....<pb/>...................end2 => no blank before 'beg2'
948 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
949 beg4....<test>ok</test>.........end4 => blank before 'beg4'
950
951 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
952 ^
953 |_blank between 'end3' and 'beg4'
954
955
956## Notes on segfault prevention
957
binmode on the input handle prevents segfaulting of 'XML::LibXML::Reader' inside 'main()'
(see notes on 'PerlIO layers' in 'man XML::LibXML').
Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
See in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.