blob: 152eaaca3ba0eb633b9242fb5e370e80e719377a [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
# Optional dependency: try to load the KorAP tokenizer.
# A failing 'require' is swallowed by the eval block, so the script
# still compiles and runs when the module is not installed
# (presumably on purpose; the module is only used for --tokenizer-korap).
eval {
  require KorAP::XML::TEI::Tokenizer::KorAP;
  1;
};

# Version of the script and the banner printed by --help and --version
our $VERSION     = '1.00';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set to 1 for minimal more debug output (no need to be parametrized).
# The value is taken from the environment variable KORAPXMLTEI_DEBUG
# and defaults to 0 if that variable is not defined.
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Hardersd892a582020-02-12 15:45:22 +010042
Peter Harders6f526a32020-06-29 21:44:41 +020043# Parse options from the command line
# Parse options from the command line.
#
# Every option is bound to a lexical variable that is declared in place,
# so these variables are visible in the remainder of the script.
# GetOptions returns false if the command line could not be parsed
# (e.g. unknown option or missing value); in that case the usage message
# is printed and the script exits with status 2 instead of silently
# continuing with a partially parsed configuration.
GetOptions(
  'root|r=s'  => \(my $_root_dir = '.'),    # name of root directory inside zip file
  'input|i=s' => \(my $input_fname = ''),   # input file (yet only TEI I5 Format accepted)

  # Tokenization
  'tokenizer-call|tc=s'   => \(my $tokenizer_call),   # Temporary argument for testing purposes
  'tokenizer-korap|tk'    => \(my $tokenizer_korap),  # use KorAP-tokenizer
  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT),     # use intern tokenization (default = no)
  'use-tokenizer-sentence-splits|s'
    => \(my $use_tokenizer_sentence_splits),          # use KorAP tokenizer to split s (default=no)

  # Names of foundries and files inside the zip archive
  'inline-tokens=s'     => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
  'base-foundry=s'      => \(my $_tok_dir = 'base'),
  'data-file=s'         => \(my $_data_file = 'data'),
  'header-file=s'       => \(my $_header_file = 'header'),
  'tokens-file=s'       => \(my $_tok_file_ext = 'tokens'),

  # Logging
  'log|l=s' => \(my $log_level = 'notice'),

  # Print the full help text to STDOUT
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version banner to STDOUT
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(2);
74
Marc Kupietz44b1f252020-11-26 16:31:40 +010075binmode(STDERR, ":encoding(UTF-8)");
Akron3378dfd2020-08-01 15:01:36 +020076Log::Any::Adapter->set('Stderr', log_level => $log_level);
77
Akronb3649472020-09-29 08:24:46 +020078$log->notice('Debugging is activated') if DEBUG;
79
Peter Harders6f526a32020-06-29 21:44:41 +020080#
81# ~~~ parameter (mandatory) ~~~
82#
# Tag (without attributes), which contains the primary text
my $_TEXT_BODY = "text";

# Start of the opening tags of the three header levels.
# Just keep the correct order of the attributes and evtl. add an '.*'
# between them. These strings are interpolated into regular expressions
# when the input is scanned for headers.

# optional
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\"";

# optional
my $_DOC_HEADER_BEG = "idsHeader type=\"document\"";

# mandatory
my $_TEXT_HEADER_BEG = "idsHeader type=\"text\"";


## extern tokenization

# 1 if any external tokenizer was requested on the command line, else 0
my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;

# Sentence splitting relies on the KorAP tokenizer
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
}

# Instantiate the requested external tokenizer.
# An explicit tokenizer call takes precedence over the KorAP tokenizer.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};
Peter Harders6f526a32020-06-29 21:44:41 +0200107##
108
Akron0c41ab32020-09-29 07:33:33 +0200109
Akron4e3c7e32021-02-18 15:19:53 +0100110#
111# ~~~ constants ~~~
112#
113
114
## intern tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##

## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKENS_TAG}'s (default: 1)


# Name of the directory and the file containing all inline structure
# information except for $_TOKENS_TAG information.
# The option value has the form '<foundry>#[<file>]'. The default file name
# is appended before splitting, so a value without '#' still yields a file.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';

# A value with a trailing '#' (file part omitted) yields an empty file name;
# fall back to the default instead of writing a file named '.xml'
$_structure_file = 'structure' unless length $_structure_file;
$_structure_file .= '.xml';


# Name of the directory and the file containing all inline token
# information, i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set.
# Same '<foundry>#[<file>]' handling as for the structures.
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
$_tokens_file = 'morpho' unless length $_tokens_file;
$_tokens_file .= '.xml';

# Name of tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = "w";

# Handling inline annotations (inside $_TOKENS_TAG):
# 1 if the environment variable KORAPXMLTEI_INLINE is set to a true value, else 0
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
Akron09e0b2c2020-07-28 15:57:01 +0200139
Peter Harders6f526a32020-06-29 21:44:41 +0200140
141#
142# ~~~ variables ~~~
143#
144
# Collectors for inline tokens and inline structures
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Collector for the primary data
my $data = KorAP::XML::TEI::Data->new;

# Maybe not necessary
$data->reset;

# Zipper, created with the root directory given on the command line
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);

# Text directory (below $_root_dir); the empty string means
# that no text header has been read for the current text
my $dir = "";

my $text_id;
my $text_id_esc;   # escaped version of $text_id

my $reader;        # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
my $tree_data;     # result of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')


# The following variables are only used inside the recursive function 'retr_info'

# Index of the array of child elements in an element of $tree_data.
# It depends on DEBUG, because line numbers are included in the elements
# of $tree_data for debugging.
my $_IDX = DEBUG ? 5 : 4;

my $e;             # element from $tree_data

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $fval = 0;

# Hash for indices of whitespace-nodes (needed to recorrect from-values).
# Idea: when closing an element, check if its from-index minus 1 refers to
# a whitespace-node (means: 'from-index - 1' is a key in %ws).
# If this is _not_ the case, then the from-value is one too high
# => correct it by subtracting 1
my %ws;

my $c;             # index variable used in loops


#
# ~~~ main ~~~
#

# Normalize regex for header parsing:
# allow arbitrary attributes between the tag name and the rest of the pattern
foreach my $header_beg ($_CORP_HEADER_BEG, $_DOC_HEADER_BEG, $_TEXT_HEADER_BEG) {
  $header_beg =~ s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
}


# ~ read input and write output (text by text) ~

# Parts of a line before and after a matched tag
my ( $pfx, $sfx );

# Text line (needed for whitespace handling)
my $tl = 0;

# Input file handle (default: stdin)
my $input_fh = *STDIN;

# Read from the given file instead of stdin, if one was passed
if ( $input_fname ne '' ) {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

# Offset of the closing text-body tag in the current line
my $pos;

# Encoding used for decoding the input lines
# (replaced by the value of the XML declaration, if there is one)
my $input_enc = 'UTF-8';

# Length of the closing text-body tag (including the closing bracket)
my $l = length('</' . $_TEXT_BODY) + 1;
Peter Harders6f526a32020-06-29 21:44:41 +0200224
Akron347be812020-09-29 07:52:52 +0200225# ~ loop (reading input document) ~
Peter Harders6f526a32020-06-29 21:44:41 +0200226
# ~ loop (reading input document) ~
#
# The input is processed line by line. Three kinds of lines are relevant:
#  - the XML declaration (sets the input encoding),
#  - the opening tag of a text body (all following lines up to the closing
#    tag are buffered, parsed and written to the zip archive),
#  - the opening tag of a corpus, document or text header
#    (the header is parsed and written to the zip archive).
# All other lines are skipped.
MAIN: while ( <$input_fh> ) {

  # remove HTML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding.
  # The value is everything up to the quote character that opened it
  # (a backreference can not be used inside a bracketed character class).
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/s ) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ) {

    # ~ start of text body ~

    $pfx = $1;
    $sfx = $2;

    # The opening tag has to be the only content of its line
    if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                      "line with opening text-body tag '${_TEXT_BODY}' " .
                      "contains additional information ... => Aborting (line=$_)");
    };

    # text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # The closing tag has to be the only content of its line
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                          "line with closing text-body tag '${_TEXT_BODY}'".
                          " contains additional information ... => Aborting (line=$_)");
        };

        if ($dir ne "") {

          $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );

          # See notes on whitespace handling
          my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

          # XCT_LINE_NUMBERS is only needed for debugging
          # (see XML::CompactTree::XS)
          $param |= XCT_LINE_NUMBERS if DEBUG;
          $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);

          $structures->reset;

          $tokens->reset if $_TOKENS_PROC;

          # ~ whitespace related issue ~
          $add_one = 0;
          %ws = ();

          # ~ recursion ~
          retr_info(1, \$tree_data->[2] ); # parse input data

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
          };

          # ~ write data.xml ~
          $data->to_zip(
            $zipper->new_stream("$dir/${_data_file}.xml"),
            $text_id_esc
          );

          # ~ tokenization ~
          if ($_GEN_TOK_EXT) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
              $text_id_esc
            );
          };

          if ($_GEN_TOK_INT) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            );

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            );

            $aggr_tok->reset;
            $cons_tok->reset;
          };

          # Has to follow the external tokenization of this text,
          # as it refers to the previous input of the tokenizer
          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          }

          # ~ write structures ~
          if (!$structures->empty) {
            $structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          if ($_TOKENS_PROC && !$tokens->empty) {
            $tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
              $text_id_esc,
              $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
            );
          };

          $dir = ""; # reinit.

          # Maybe not necessary
          $data->reset;

        } else { # $dir eq ""

          # No text header was read for this text body, so the body was
          # never parsed into $data. Log the beginning of the buffered
          # text body to help locating the text in the input.
          # NOTE(review): the message previously interpolated the $data
          # object itself - confirm that no caller relies on that output.
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" .
                     substr($buf_in, 0, 200));
        }

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best, to keep the stripping of whitespace and to just
      # remove the if-clause and to insert a blank by default (with possibly
      # an option on how newlines in primary text should be handled
      # (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
      if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents

        $tl++; # counter for text lines

        s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    };

  } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {

    # ~ start of header ~
    $pfx = $1;
    my $content = "$2\n";

    # Nothing but whitespace may precede the opening header tag
    if ($pfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                      "line with opening header tag" .
                      " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while

# Finish the zip archive
$zipper->close;

# Close the external tokenizer, if one was used
$ext_tok->close if $_GEN_TOK_EXT;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100443
Peter Hardersd892a582020-02-12 15:45:22 +0100444
# Recursively called function to handle XML tree data.
#
# Parameters:
#   1. recursion level
#      (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
#   2. reference to an array reference of nodes
#      (see notes on how 'XML::CompactTree::XS' works and
#      see 'NODE TYPES' in manpage of XML::LibXML::Reader)
#
# Text and whitespace nodes are appended to $data, element nodes are
# registered as annotations in $structures (and $tokens) with their
# from- and to-offsets in the primary data.
# Dies on node types, that are not handled yet.
sub retr_info {
  my ($depth, $nodes) = @_;

  # Shared annotation for all 's' elements of this level,
  # if the sentence splits are taken from the tokenizer
  my $dummy_anno = $use_tokenizer_sentence_splits
    ? $structures->new_dummy_annotation()
    : undef;

  # Iteration through all array elements
  foreach my $node (@{$$nodes}) {

    my $type = $node->[0];

    # Text node
    if ($type == XML_READER_TYPE_TEXT) {
      $add_one = 1;
      $data->append($node->[1]);
      next;
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    if ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node->[1]);
      next;
    }

    # not yet handled type
    if ($type != XML_READER_TYPE_ELEMENT) {
      die $log->fatal('Not yet handled type ($e->[0]=' . $node->[0] . ') ... => Aborting');
    }

    #~~~~
    # from here: tag-node (opening)
    #~~~~

    # $node->[1] represents the tag name
    my $tag = $node->[1];

    my $anno = ($use_tokenizer_sentence_splits && $tag eq "s")
      ? $dummy_anno
      : $structures->add_new_annotation($tag);

    # Add element also to token list
    $tokens->add_annotation($anno) if $_TOKENS_PROC && $tag eq $_TOKENS_TAG;

    # Handle attributes (if attributes exist).
    # With 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
    # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
    if (defined(my $attrs = $node->[3])) {

      # '$i' references the 'key' and '$i+1' the 'value'
      for (my $i = 0; $i < @{$attrs}; $i += 2) {
        $anno->add_attribute(@{$attrs}[$i, $i + 1]);
      }
    }

    # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
    $anno->set_from($data->position + $add_one);

    #~~~~
    # until here: tag-node (opening)
    #~~~~

    # Recursion with array of child-nodes.
    # No recursion, if $node->[$_IDX] is not defined
    # (because we have no array of child-nodes, e.g.: <back/>)
    retr_info($depth + 1, \$node->[$_IDX]) if defined $node->[$_IDX];

    #~~~~~
    # from here: tag-node (closing)
    #~~~~~

    # NOTE: use $pos, because the offsets are _between_ the characters
    # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
    my $pos = $data->position;

    # Handle structures and tokens
    $fval = $anno->from;

    # ~ whitespace related issue ~
    # Is the position in front of the from-value registered as a whitespace-node?
    my $ws_precedes = ($fval > 0 && exists $ws{$fval - 1}) ? 1 : 0;

    # ~ previous node was a text-node ~
    $anno->set_from($fval - 1) if $fval > 0 && !$ws_precedes;

    # in case this fails, check input
    if (($fval - 1) > $pos) {
      die $log->fatal("text_id='$text_id', " .
                      "processing of structures: " .
                      "from-value ($fval) is 2 or more greater " .
                      "than to-value ($pos) => please check. Aborting");
    }

    # TODO: find example for which this case applies
    #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
    #
    # TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
    #  do testing with bigger corpus excerpt (wikipedia?)
    $anno->set_from($pos) if $fval == $pos + 1;
    $anno->set_to($pos);
    $anno->set_level($depth);

    # Clean up whitespace
    delete $ws{$fval - 1} if $ws_precedes;

    #~~~~
    # until here: tag-node (closing)
    #~~~~
  }
}
589
Peter Harders6f526a32020-06-29 21:44:41 +0200590
Akrond949e182020-02-14 12:23:57 +0100591__END__
592
593=pod
594
595=encoding utf8
596
597=head1 NAME
598
599tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
600
601=head1 SYNOPSIS
602
603 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
604
605=head1 DESCRIPTION
606
Akronee434b12020-07-08 12:53:01 +0200607C<tei2korapxml> is a script to convert TEI P5 and
608L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
609based documents to the
610L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
611If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100612read from C<STDIN>. If no specific output is defined, data is written
613to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200614
Akrond949e182020-02-14 12:23:57 +0100615This program is usually called from inside another script.
616
Akronee434b12020-07-08 12:53:01 +0200617=head1 FORMATS
618
619=head2 Input restrictions
620
621=over 2
622
623=item
624
Akronee434b12020-07-08 12:53:01 +0200625TEI P5 formatted input with certain restrictions:
626
627=over 4
628
629=item
630
631B<mandatory>: text-header with integrated textsigle, text-body
632
633=item
634
635B<optional>: corp-header with integrated corpsigle,
636doc-header with integrated docsigle
637
638=back
639
640=item
641
All tokens inside the primary text may not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~>).
649
650=back
651
652=head2 Notes on the output
653
654=over 2
655
656=item
657
658zip file output (default on C<stdout>) with utf8 encoded entries
659(which together form the KorAP-XML format)
660
661=back
662
Akrond949e182020-02-14 12:23:57 +0100663=head1 INSTALLATION
664
665C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
666these bindings are available, the preferred way to install the script is
667to use L<cpanm|App::cpanminus>.
668
669 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
670
671In case everything went well, the C<tei2korapxml> tool will
672be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200673
Akrond949e182020-02-14 12:23:57 +0100674Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
675
676=head1 OPTIONS
677
678=over 2
679
Akron4e603a52020-07-27 14:23:49 +0200680=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100681
Akron4e603a52020-07-27 14:23:49 +0200682The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100683
684=item B<--help|-h>
685
686Print help information.
687
688=item B<--version|-v>
689
690Print version information.
691
Akron4e603a52020-07-27 14:23:49 +0200692=item B<--tokenizer-call|-tc>
693
694Call an external tokenizer process, that will tokenize
695a single line from STDIN and output one token per line.
696
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200697=item B<--tokenizer-korap|-tk>
698
699Use the standard KorAP/DeReKo tokenizer.
700
Akron6d7b8e42020-09-29 07:37:41 +0200701=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200702
703Tokenize the data using two embedded tokenizers,
704that will take an I<aggressive> and a I<conservative>
705approach.
706
Akron1a5271a2021-02-18 13:18:15 +0100707=item B<--inline-tokens> <foundry>#[<file>]
708
709Define the foundry and file (without extension)
710to store inline token information in.
711If L</KORAPXMLTEI_INLINE> is set, this will contain
712annotations as well.
713Defaults to C<tokens> and C<morpho>.
714
Akrondd0be8f2021-02-18 19:29:41 +0100715=item B<--inline-structures> <foundry>#[<file>]
716
717Define the foundry and file (without extension)
718to store inline structure information in.
719Defaults to C<struct> and C<structures>.
720
Akron26a71522021-02-19 10:27:37 +0100721=item B<--base-foundry> <foundry>
722
723Define the base foundry to store newly generated
724token information in.
725Defaults to C<base>.
726
727=item B<--data-file> <file>
728
729Define the file (without extension)
730to store primary data information in.
731Defaults to C<data>.
732
733=item B<--header-file> <file>
734
735Define the file name (without extension)
736to store header information on
737the corpus, document, and text level in.
738Defaults to C<header>.
739
Marc Kupietz985da0c2021-02-15 19:29:50 +0100740=item B<--use-tokenizer-sentence-splits|-s>
741
742Replace existing with, or add new, sentence boundary information
743provided by the KorAP tokenizer (currently the only tokenizer that supports this).
744
Akron91705d72021-02-19 10:59:45 +0100745=item B<--tokens-file> <file>
746
747Define the file (without extension)
748to store generated token information in
749(either from the KorAP tokenizer or an externally called tokenizer).
750Defaults to C<tokens>.
751
Akron3378dfd2020-08-01 15:01:36 +0200752=item B<--log|-l>
753
754Loglevel for I<Log::Any>. Defaults to C<notice>.
755
Akrond949e182020-02-14 12:23:57 +0100756=back
757
Akronb3649472020-09-29 08:24:46 +0200758=head1 ENVIRONMENT VARIABLES
759
760=over 2
761
762=item B<KORAPXMLTEI_DEBUG>
763
764Activate minimal debugging.
765Defaults to C<false>.
766
767=item B<KORAPXMLTEI_INLINE>
768
769Process inline annotations, if present.
770Defaults to C<false>.
771
772=back
773
Akrond949e182020-02-14 12:23:57 +0100774=head1 COPYRIGHT AND LICENSE
775
Marc Kupietze955ecc2021-02-17 17:42:01 +0100776Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100777
778Author: Peter Harders
779
Akronaabd0952020-09-29 07:35:08 +0200780Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100781
782L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
783Corpus Analysis Platform at the
784L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
785member of the
786L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
787
788This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100789L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100790
791=cut
Akronf8088e62021-02-18 16:18:59 +0100792
793# NOTES
794
795## Notes on how 'XML::CompactTree::XS' works
796
797Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
798
799Print out name of 'node2' for the above example:
800
801echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
802
803Exploring the structure of $data ( = reference to below array ):
804
805[ 0: XML_READER_TYPE_DOCUMENT,
806 1: ?
Akron91577922021-02-19 10:32:54 +0100807 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100808 1: 'node'
809 2: ?
810 3: HASH (attributes)
811 4: 1 (line number)
812 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
813 1: 'node1'
814 2: ?
815 3: undefined (no attributes)
816 4: 1 (line number)
817 5: [ 0: [ 0: XML_READER_TYPE_TEXT
818 1: 'some '
819 ]
820 1: [ 0: XML_READER_TYPE_ELEMENT
821 1: 'n'
822 2: ?
823 3: undefined (no attributes)
824 4: 1 (line number)
825 5: undefined (no child-nodes)
826 ]
827 2: [ 0: XML_READER_TYPE_TEXT
828 1: ' text'
829 ]
830 ]
831 ]
832 1: [ 0: XML_READER_TYPE_ELEMENT
833 1: 'node2'
834 2: ?
835 3: undefined (no attributes)
836 4: 1 (line number)
837 5: [ 0: [ 0: XML_READER_TYPE_TEXT
838 1: 'more-text'
839 ]
840 ]
841 ]
842 ]
843 ]
844 ]
845]
846
847$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
848
849ref($data->[2]) == ARRAY (with 1 element for 'node')
850ref($data->[2]->[0]) == ARRAY (with 6 elements)
851
852$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
853$data->[2]->[0]->[1] == 'node'
854ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
855$data->[2]->[0]->[4] == 1 (line number)
856ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
857 # child-nodes of actual node (see $_IDX)
858
859ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
860$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
861$data->[2]->[0]->[5]->[0]->[1] == 'node1'
862$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
863$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
864ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
865
866ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
867$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
868$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
869
870ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
871$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
872$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
873$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
874$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
875$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
876
877ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
878$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
879$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
880
881
882retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
883Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
884${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
885
886
887## Notes on whitespace handling
888
889Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
890(see function 'retr_info()').
891
892Definition of significant and insignificant whitespace
893(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
894
895Significant whitespace is part of the document content and should be preserved.
896Insignificant whitespace is used when editing XML documents for readability.
897These whitespaces are typically not intended for inclusion in the delivery of the document.
898
899### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
900
901The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
902 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
903
904When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
905 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
906 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
907
908echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
909
910
911Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
912
913Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
914 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
915
916The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
917 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
918
919The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
920 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
921
922When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
923 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
924 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
925 the last read 'non-tag'-node has to be corrected (see [1]).
926
927For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
928 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
929
930[1]
931Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
932 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
933 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
934
935[2]
936Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled in both cases as a
937 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
938
939The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
940 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
941
942Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
943 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
944
945
946## Notes on whitespace fixing
947
948The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
949 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
950
951It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
952 example further down and notes on 'Input restrictions' in the manpage).
953
954Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
955
956Examples (how primary text with linebreaks would be converted by below code):
957
958 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
959 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
960
961Blanks are inserted before the 1st character:
962
963 NOTE: not stringent ('...' stands for text):
964
965 beg1............................end1 => no blank before 'beg1'
966 beg2....<pb/>...................end2 => no blank before 'beg2'
967 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
968 beg4....<test>ok</test>.........end4 => blank before 'beg4'
969
970 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
971 ^
972 |_blank between 'end3' and 'beg4'
973
974
975## Notes on segfault prevention
976
Akron91577922021-02-19 10:32:54 +0100977binmode on the input handle prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100978(see notes on 'PerlIO layers' in 'man XML::LibXML'),
979removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
980see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
981see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.