#!/usr/bin/env perl
use strict;
use warnings;

use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);

use File::Basename qw(dirname);

use Encode qw(decode);

use XML::CompactTree::XS;
use XML::LibXML::Reader;

# Make the bundled library directory (relative to this script) loadable
use FindBin;
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Annotations::Collector;
use KorAP::XML::TEI::Data;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;

# The KorAP tokenizer is loaded on a best-effort basis: a failing
# 'require' is deliberately ignored here, so the script still works
# without it as long as the tokenizer is not requested.
eval {
  require KorAP::XML::TEI::Tokenizer::KorAP;
  1;
};

our $VERSION = '1.00';

# Banner printed by --help and --version
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set to 1 for minimal more debug output (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Parse options from the command line.
# Every option variable is declared in place and stays visible for the
# rest of the script. Unknown or malformed options abort the run with the
# usage synopsis (exit code 2) instead of being silently skipped.
GetOptions(
  "root|r=s"  => \(my $_root_dir = '.'),       # name of root directory inside zip file
  "input|i=s" => \(my $input_fname = ''),      # input file (yet only TEI I5 Format accepted)
  'tokenizer-call|tc=s'   => \(my $tokenizer_call),   # Temporary argument for testing purposes
  'tokenizer-korap|tk'    => \(my $tokenizer_korap),  # use KorAP-tokenizer
  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT),     # use intern tokenization (default = no)
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
  'inline-tokens=s'     => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
  'base-foundry=s' => \(my $_tok_dir = 'base'),
  'data-file=s'    => \(my $_data_file = 'data'),
  'header-file=s'  => \(my $_header_file = 'header'),
  'tokens-file=s'  => \(my $_tok_file_ext = 'tokens'),
  'log|l=s'        => \(my $log_level = 'notice'),
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -verbose => 0,
  -msg     => $VERSION_MSG,
  -exitval => 2
);

# Log messages may contain non-ASCII text (e.g. text sigles)
binmode(STDERR, ":encoding(UTF-8)");
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;

#
# ~~~ parameter (mandatory) ~~~
#
my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
# optional
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and possibly add an '.*' between them
# optional
my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analogous
# mandatory
my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analogous


## extern tokenization

# True as soon as any external tokenizer was requested
my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;

if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
}

# NOTE(review): if both --tokenizer-call and --tokenizer-korap are given,
# the external call wins below while sentence splitting stays enabled -
# confirm that combination is supported.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The optional module may have failed to load at startup; require it
  # again here so that the real reason ends up in the error message.
  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
    die $log->fatal("KorAP tokenizer could not be loaded: $@");
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};
##


#
# ~~~ constants ~~~
#


## intern tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##

## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKENS_TAG}'s (default: 1)


# Name of the directory and the file containing all inline structure information
# except for $_TOKENS_TAG information.
# The appended '#structure' supplies the default file name when the option
# value contains no '#'; the length check covers a value ending in '#'
# (e.g. 'struct#'), which would otherwise yield the file name '.xml'.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
$_structure_file = 'structure' unless length $_structure_file;
$_structure_file .= '.xml';


# Name of the directory and the file containing all inline token information
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set.
# Defaults are handled in the same way as for the structure file above.
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
$_tokens_file = 'morpho' unless length $_tokens_file;
$_tokens_file .= '.xml';

my $_TOKENS_TAG = "w"; # name of tag containing all information stored in $_tokens_file

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;


#
# ~~~ variables ~~~
#

# Initialize Token- and Structure-Collector
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;


# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;


# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
my $input_fh;         # input file handle (default: stdin)

my $dir;              # text directory (below $_root_dir)

my ( $text_id,
     $text_id_esc );  # '$text_id_esc' = escaped version of $text_id

# these are only used inside recursive function 'retr_info'
my ( $_IDX,           # value is set dependent on DEBUG - for extracting array of child elements from element in $tree_data
     $e,              # element from $tree_data
     ## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
     $add_one,        # ...
     $fval,           # ...
     %ws);            # hash for indices of whitespace-nodes (needed to recorrect from-values)
                      # idea: when closing element, check if its from-index minus 1 refers to a whitespace-node
                      #  (means: 'from-index - 1' is a key in %ws).
                      # if this is _not_ the case, then the from-value is one too high => correct it by subtracting 1

my $c;                # index variables used in loops


#
# ~~~ main ~~~
#

# ~ initializations ~

# Index of the child-node array inside an element of $tree_data.
# With DEBUG, line numbers are included in the elements of $tree_data,
# which moves the child array from index 4 to index 5.
$_IDX = DEBUG ? 5 : 4;

$fval = 0;

# Normalize regex for header parsing:
# allow arbitrary further attributes between the tag name and the
# attributes given in the header patterns
for ($_CORP_HEADER_BEG,
     $_DOC_HEADER_BEG,
     $_TEXT_HEADER_BEG) {
  s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
};


# ~ read input and write output (text by text) ~

my $tl = 0; # text line (needed for whitespace handling)

$input_fh = *STDIN;  # input file handle (default: stdin)

# Maybe not necessary
$data->reset;

$dir = "";

if ( $input_fname ne '' ){
  unless (open($input_fh, '<', $input_fname)) {
    # Report the system error as well, so the cause is visible
    die $log->fatal("File '$input_fname' could not be opened. ($!)");
  };
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

my $sfx;                                  # remainder of the line after the opening text-body tag
my $pos;                                  # position of the closing text-body tag in a line
my $input_enc = 'UTF-8';                  # input encoding (overridden by the XML declaration)
my $l = length('</' . $_TEXT_BODY) + 1;   # length of the closing text-body tag including '>'

# ~ loop (reading input document) ~
#
# The input is processed line by line. Header lines are handed to
# KorAP::XML::TEI::Header; the lines between the opening and the closing
# text-body tag are collected in a buffer, parsed as XML and written to
# the zip stream together with the derived annotation files.

MAIN: while ( <$input_fh> ){

  $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)

  # Set input encoding from the XML declaration and skip that line.
  # Inside a bracketed character class '\1' is not a backreference but an
  # octal escape, so the value is matched lazily up to the closing quote
  # (the backreference to the opening quote) instead.
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){

    # ~ start of text body ~

    $sfx = $2;

    # The opening tag has to stand alone on its line
    if ($1 !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    # NOTE(review): if the input ends before the closing text-body tag,
    # this loop just ends and the buffered text is dropped without a message.
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and possibly morpho.xml and/or tokenization files

        # The closing tag has to stand alone on its line as well
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        if ($dir ne "") {

          my $reader = XML::LibXML::Reader->new(
            string => "<text>$buf_in</text>",
            huge   => 1
          );

          # See notes on whitespace handling
          my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

          # XCT_LINE_NUMBERS is only needed for debugging
          # (see XML::CompactTree::XS)
          $param |= XCT_LINE_NUMBERS if DEBUG;
          my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

          $structures->reset;

          $tokens->reset if $_TOKENS_PROC;

          # ~ whitespace related issue ~
          $add_one = 0;
          %ws = ();

          # ~ recursion ~
          retr_info(1, \$tree_data->[2] ); # parse input data

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
          };

          # ~ write data.xml ~
          $data->to_zip(
            $zipper->new_stream("$dir/${_data_file}.xml"),
            $text_id_esc
          );

          # ~ tokenization ~
          if ($_GEN_TOK_EXT) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
              $text_id_esc
            );
          };

          if ($_GEN_TOK_INT) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            );

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            );

            $aggr_tok->reset;
            $cons_tok->reset;
          };

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          }

          # ~ write structures ~
          if (!$structures->empty) {
            $structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          if ($_TOKENS_PROC && !$tokens->empty) {
            $tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
              $text_id_esc,
              $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
            );
          };

          $dir = ""; # reinit.

          # Maybe not necessary
          $data->reset;

        } else { # $dir eq ""

          # NOTE(review): $data is an object, so this interpolates its
          # stringification rather than the text - confirm what should be logged.
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
        }

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
      #       an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
      if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents

        $tl++; # counter for text lines

        s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    };

  } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {

    # ~ start of header ~
    my $content = "$2\n";

    # Nothing but whitespace may precede the opening header tag
    if ($1 !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while

$zipper->close;

$ext_tok->close if $_GEN_TOK_EXT;

exit(0);


# Recursively called function to handle XML tree data.
#
# Walks the node list produced by XML::CompactTree::XS, appends text and
# whitespace nodes to the primary data ($data) and records one annotation
# (with attributes, from/to offsets and nesting level) per element node in
# $structures - and additionally in $tokens for $_TOKENS_TAG elements.
#
# Parameters:
#   $rl        - recursion level
#                (1 = topmost level inside retr_info() = should always be
#                level of tag $_TEXT_BODY)
#   $nodes_ref - reference to an array reference of nodes
#
# Uses the file-level state $data, $structures, $tokens, $add_one, %ws and
# $_IDX. Dies on node types that are not handled and on inconsistent offsets.
sub retr_info {
  my ($rl, $nodes_ref) = @_;

  # With tokenizer based sentence splitting, inline <s> elements are
  # collected in a dummy annotation instead of the structure list
  my $dummy_anno;
  if ($use_tokenizer_sentence_splits) {
    $dummy_anno = $structures->new_dummy_annotation;
  }

  # Iteration through all array elements
  # See notes on how 'XML::CompactTree::XS' works and
  # see 'NODE TYPES' in manpage of XML::LibXML::Reader
  foreach my $node (@{${$nodes_ref}}) {

    # Element node
    if ($node->[0] == XML_READER_TYPE_ELEMENT) {

      #~~~~
      # from here: tag-node (opening)
      #~~~~

      my $anno;

      # $node->[1] represents the tag name
      if ($use_tokenizer_sentence_splits && $node->[1] eq "s") {
        $anno = $dummy_anno;
      } else {
        $anno = $structures->add_new_annotation($node->[1]);
      }


      # Add element also to token list
      if ($_TOKENS_PROC && $node->[1] eq $_TOKENS_TAG) {
        $tokens->add_annotation($anno);
      };

      # Handle attributes (if attributes exist)
      if (defined $node->[3]) {

        # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
        # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
        my $attrs = $node->[3];

        for (my $i = 0; $i < @{$attrs}; $i += 2) {

          # '$i' references the 'key' and '$i+1' the 'value'
          $anno->add_attribute(
            @{$attrs}[$i, $i + 1]
          );
        };
      };

      # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);


      #~~~~
      # until here: tag-node (opening)
      #~~~~


      # Call function recursively
      # do no recursion, if $node->[$_IDX] is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      if (defined $node->[$_IDX]) {

        # Recursion with array of child-nodes
        retr_info($rl+1, \$node->[$_IDX]);
      }


      #~~~~~
      # from here: tag-node (closing)
      #~~~~~

      # NOTE: use $pos, because the offsets are _between_ the characters
      # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $pos = $data->position;

      # Handle structures and tokens

      my $from = $anno->from;

      # ~ whitespace related issue ~
      if ($from > 0 && not exists $ws{$from - 1}) {

        # ~ previous node was a text-node ~
        $anno->set_from($from - 1);
      }

      # in case this fails, check input
      if (($from - 1) > $pos) {
        die $log->fatal("text_id='$text_id', " .
                          "processing of structures: " .
                          "from-value ($from) is 2 or more greater " .
                          "than to-value ($pos) => please check. Aborting");
      };

      # TODO: find example for which this case applies
      #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
      #
      # TODO: check, if it's better to remove this line and change above check to 'if ($from - 1) >= $pos;
      #   do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($pos) if $from == $pos + 1;
      $anno->set_to($pos);
      $anno->set_level($rl);

      # Clean up whitespace
      delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};


      #~~~~
      # until here: tag-node (closing)
      #~~~~
    }

    # Text node
    elsif ($node->[0] == XML_READER_TYPE_TEXT){

      $add_one = 1;
      $data->append($node->[1]);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($node->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      #  ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node->[1]);
    }

    # not yet handled type
    else {

      die $log->fatal('Not yet handled type ($e->[0]=' . $node->[0] . ') ... => Aborting');
    };
  };

  return;
};


__END__

=pod

=encoding utf8

=head1 NAME

tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML

=head1 SYNOPSIS

  cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip

=head1 DESCRIPTION

C<tei2korapxml> is a script to convert TEI P5 and
L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
based documents to the
L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
If no specific input is defined, data is
read from C<STDIN>. If no specific output is defined, data is written
to C<STDOUT>.

This program is usually called from inside another script.

=head1 FORMATS

=head2 Input restrictions

=over 2

=item

TEI P5 formatted input with certain restrictions:

=over 4

=item

B<mandatory>: text-header with integrated textsigle, text-body

=item

B<optional>: corp-header with integrated corpsigle,
doc-header with integrated docsigle

=back

=item

Tokens inside the primary text must not be
separated by newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
See also the code section C<~ whitespace handling ~>.

=back

=head2 Notes on the output

=over 2

=item

zip file output (default on C<stdout>) with utf8 encoded entries
(which together form the KorAP-XML format)

=back

=head1 INSTALLATION

C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
these bindings are available, the preferred way to install the script is
to use L<cpanm|App::cpanminus>.

  $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git

In case everything went well, the C<tei2korapxml> tool will
be available on your command line immediately.

Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.

673=head1 OPTIONS
674
675=over 2
676
Akron4e603a52020-07-27 14:23:49 +0200677=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100678
Akron4e603a52020-07-27 14:23:49 +0200679The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100680
681=item B<--help|-h>
682
683Print help information.
684
685=item B<--version|-v>
686
687Print version information.
688
Akron4e603a52020-07-27 14:23:49 +0200689=item B<--tokenizer-call|-tc>
690
691Call an external tokenizer process that will tokenize
692a single line from STDIN and output one token per line.
693
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200694=item B<--tokenizer-korap|-tk>
695
696Use the standard KorAP/DeReKo tokenizer.
697
Akron6d7b8e42020-09-29 07:37:41 +0200698=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200699
700Tokenize the data using two embedded tokenizers,
701that will take an I<aggressive> and a I<conservative>
702approach.
703
Akron1a5271a2021-02-18 13:18:15 +0100704=item B<--inline-tokens> <foundry>#[<file>]
705
706Define the foundry and file (without extension)
707to store inline token information in.
708If L</KORAPXMLTEI_INLINE> is set, this will contain
709annotations as well.
710Defaults to C<tokens> and C<morpho>.
711
Akrondd0be8f2021-02-18 19:29:41 +0100712=item B<--inline-structures> <foundry>#[<file>]
713
714Define the foundry and file (without extension)
715to store inline structure information in.
716Defaults to C<struct> and C<structures>.
717
Akron26a71522021-02-19 10:27:37 +0100718=item B<--base-foundry> <foundry>
719
720Define the base foundry to store newly generated
721token information in.
722Defaults to C<base>.
723
724=item B<--data-file> <file>
725
726Define the file (without extension)
727to store primary data information in.
728Defaults to C<data>.
729
730=item B<--header-file> <file>
731
732Define the file name (without extension)
733to store header information on
734the corpus, document, and text level in.
735Defaults to C<header>.
736
Marc Kupietz985da0c2021-02-15 19:29:50 +0100737=item B<--use-tokenizer-sentence-splits|-s>
738
739Replace existing sentence boundary information with, or add, the sentence
740boundaries provided by the tokenizer (currently only supported for the KorAP tokenizer).
741
Akron91705d72021-02-19 10:59:45 +0100742=item B<--tokens-file> <file>
743
744Define the file (without extension)
745to store generated token information in
746(either from the KorAP tokenizer or an externally called tokenizer).
747Defaults to C<tokens>.
748
Akron3378dfd2020-08-01 15:01:36 +0200749=item B<--log|-l>
750
751Loglevel for I<Log::Any>. Defaults to C<notice>.
752
Akrond949e182020-02-14 12:23:57 +0100753=back
754
Akronb3649472020-09-29 08:24:46 +0200755=head1 ENVIRONMENT VARIABLES
756
757=over 2
758
759=item B<KORAPXMLTEI_DEBUG>
760
761Activate minimal debugging.
762Defaults to C<false>.
763
764=item B<KORAPXMLTEI_INLINE>
765
766Process inline annotations, if present.
767Defaults to C<false>.
768
769=back
770
Akrond949e182020-02-14 12:23:57 +0100771=head1 COPYRIGHT AND LICENSE
772
Marc Kupietze955ecc2021-02-17 17:42:01 +0100773Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100774
775Author: Peter Harders
776
Akronaabd0952020-09-29 07:35:08 +0200777Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100778
779L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
780Corpus Analysis Platform at the
781L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
782member of the
783L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
784
785This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100786L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100787
788=cut
Akronf8088e62021-02-18 16:18:59 +0100789
790# NOTES
791
792## Notes on how 'XML::CompactTree::XS' works
793
794Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
795
796Print out name of 'node2' for the above example:
797
798echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
799
800Exploring the structure of $data ( = reference to below array ):
801
802[ 0: XML_READER_TYPE_DOCUMENT,
803 1: ?
Akron91577922021-02-19 10:32:54 +0100804 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100805 1: 'node'
806 2: ?
807 3: HASH (attributes)
808 4: 1 (line number)
809 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
810 1: 'node1'
811 2: ?
812 3: undefined (no attributes)
813 4: 1 (line number)
814 5: [ 0: [ 0: XML_READER_TYPE_TEXT
815 1: 'some '
816 ]
817 1: [ 0: XML_READER_TYPE_ELEMENT
818 1: 'n'
819 2: ?
820 3: undefined (no attributes)
821 4: 1 (line number)
822 5: undefined (no child-nodes)
823 ]
824 2: [ 0: XML_READER_TYPE_TEXT
825 1: ' text'
826 ]
827 ]
828 ]
829 1: [ 0: XML_READER_TYPE_ELEMENT
830 1: 'node2'
831 2: ?
832 3: undefined (no attributes)
833 4: 1 (line number)
834 5: [ 0: [ 0: XML_READER_TYPE_TEXT
835 1: 'more-text'
836 ]
837 ]
838 ]
839 ]
840 ]
841 ]
842]
843
844$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
845
846ref($data->[2]) == ARRAY (with 1 element for 'node')
847ref($data->[2]->[0]) == ARRAY (with 6 elements)
848
849$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
850$data->[2]->[0]->[1] == 'node'
851ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
852$data->[2]->[0]->[4] == 1 (line number)
853ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
854 # child-nodes of actual node (see $_IDX)
855
856ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
857$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
858$data->[2]->[0]->[5]->[0]->[1] == 'node1'
859$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
860$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
861ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
862
863ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
864$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
865$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
866
867ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
868$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
869$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
870$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
871$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
872$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
873
874ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
875$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
876$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
877
878
879retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
880Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
881${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
882
883
884## Notes on whitespace handling
885
886Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
887(see function 'retr_info()').
888
889Definition of significant and insignificant whitespace
890(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
891
892Significant whitespace is part of the document content and should be preserved.
893Insignificant whitespace is used when editing XML documents for readability.
894These whitespaces are typically not intended for inclusion in the delivery of the document.
895
896### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
897
898The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
899 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
900
901When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
902 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
903 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
904
905echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
906
907
908Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
909
910Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
911 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
912
913The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
914 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
915
916The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
917 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
918
919When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
920 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
921 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
922 the last read 'non-tag'-node has to be corrected (see [1]).
923
924For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
925 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
926
927[1]
928Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
929 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
930 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
931
932[2]
933In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled as a
934 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
935
936The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
937 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
938
939Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
940 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
941
942
943## Notes on whitespace fixing
944
945The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
946 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
947
948It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
949 example further down and notes on 'Input restrictions' in the manpage).
950
951Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
952
953Examples (how primary text with linebreaks would be converted by below code):
954
955 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
956 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
957
958Blanks are inserted before the 1st character:
959
960 NOTE: not stringent ('...' stands for text):
961
962 beg1............................end1 => no blank before 'beg1'
963 beg2....<pb/>...................end2 => no blank before 'beg2'
964 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
965 beg4....<test>ok</test>.........end4 => blank before 'beg4'
966
967 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
968 ^
969 |_blank between 'end3' and 'beg4'
970
971
972## Notes on segfault prevention
973
Akron91577922021-02-19 10:32:54 +0100974binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100975(see notes on 'PerlIO layers' in 'man XML::LibXML'),
976removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
977see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
978see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.