blob: 785621ea29c9fdffc77df5d5ec9b9e8b0f4dc917 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
# The KorAP tokenizer is optional: try to load it, and carry on
# silently if the module cannot be required.
eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 };

our $VERSION     = '1.00';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Minimal additional debug output, switched on through the environment
# variable KORAPXMLTEI_DEBUG (deliberately not a command line option).
use constant DEBUG => ($ENV{KORAPXMLTEI_DEBUG} // 0);
Peter Hardersd892a582020-02-12 15:45:22 +010042
Peter Harders6f526a32020-06-29 21:44:41 +020043# Parse options from the command line
# Targets of the command line options, initialized with their defaults
my $_root_dir   = '.';                # name of root directory inside zip file
my $input_fname = '';                 # input file (so far only the TEI I5 format is accepted)
my $tokenizer_call;                   # temporary argument for testing purposes
my $tokenizer_korap;                  # use KorAP tokenizer
my $tokenizer_intern;                 # use internal tokenization (default = no)
my $use_tokenizer_sentence_splits;    # use KorAP tokenizer to split s (default = no)
my $inline_tokens = 'tokens#morpho';  # <foundry>#<file> for inline token information
my $log_level     = 'notice';         # log level handed to Log::Any

# --help: print the selected POD sections to STDOUT
my $print_help = sub {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  )
};

# --version: print the version message (plus synopsis) to STDOUT
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  )
};

# NOTE(review): the return value of GetOptions is not checked, so the
# script continues after an unknown or malformed option - confirm that
# this is intended.
GetOptions(
  'root|r=s'                        => \$_root_dir,
  'input|i=s'                       => \$input_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'log|l=s'                         => \$log_level,
  'help|h'                          => $print_help,
  'version|v'                       => $print_version
);

# Log messages go to STDERR, encoded as UTF-8
binmode(STDERR, ":encoding(UTF-8)");
Log::Any::Adapter->set('Stderr', log_level => $log_level);

if (DEBUG) {
  $log->notice('Debugging is activated');
};
74
Peter Harders6f526a32020-06-29 21:44:41 +020075#
76# ~~~ parameter (mandatory) ~~~
77#
# Tag (without attributes) that contains the primary text (mandatory)
my $_TEXT_BODY = "text";

# Opening tags of the three header levels. Keep the attributes in the
# order in which they occur in the input (add a '.*' between them, if
# necessary). These strings are used as regular expressions.
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\"";   # optional
my $_DOC_HEADER_BEG  = "idsHeader type=\"document\""; # optional
my $_TEXT_HEADER_BEG = "idsHeader type=\"text\"";     # mandatory


## extern tokenization

# True, if any external tokenizer (generic call or KorAP) was requested
my $_GEN_TOK_EXT = $tokenizer_call || $tokenizer_korap ? 1 : 0;

if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
}

my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally, so report a
  # missing module explicitly instead of failing on the method call.
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal("KorAP tokenizer (KorAP::XML::TEI::Tokenizer::KorAP) could not be loaded");
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};

# Name of the file the external tokenization is written to
my $_tok_file_ext = "tokens.xml";
##
104
Akron0c41ab32020-09-29 07:33:33 +0200105
Akron4e3c7e32021-02-18 15:19:53 +0100106#
107# ~~~ constants ~~~
108#
109
110
## intern tokenization
my $_GEN_TOK_INT  = $tokenizer_intern; # simple tokenization (recommended for testing)
my $_tok_file_con = "tokens_conservative.xml";
my $_tok_file_agg = "tokens_aggressive.xml";
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##


my $_tok_dir = "base"; # name of directory for storing tokenization files

my $_header_file    = "header.xml";    # name of files containing the text, document and corpus header
my $_data_file      = "data.xml";      # name of file containing the primary text data (tokens)
my $_structure_dir  = "struct";        # name of directory containing the $_structure_file
my $_structure_file = "structure.xml"; # name of file containing all information related to tags
                                       # (= their names and byte offsets in $_data),
                                       # except for the tags named $_TOKENS_TAG

## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
my $_TOKENS_PROC = 1; # on/off: processing of $_TOKENS_TAG elements (default: 1)


# Name of the directory and the file containing all inline token information,
# i.e. the tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set.
# The appended '#morpho' supplies the file name, if the option value
# contains no '#' at all.
my ($_tokens_dir, $_tokens_file) = split /#/, $inline_tokens . '#morpho';

# A value with a trailing '#' (e.g. 'tokens#') yields an empty file part;
# fall back to the default instead of writing to a file named '.xml'.
$_tokens_file = 'morpho' if $_tokens_file eq '';
$_tokens_file .= '.xml';

my $_TOKENS_TAG = "w"; # name of tag containing all information stored in $_tokens_file

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
Akron09e0b2c2020-07-28 15:57:01 +0200140
Peter Harders6f526a32020-06-29 21:44:41 +0200141
142#
143# ~~~ variables ~~~
144#
145
# Collector for the inline tokens and collector for the structures
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Collector for the primary text data
my $data = KorAP::XML::TEI::Data->new;

# Zipper, writing below the root directory
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);

# Input file handle (default: stdin)
my $input_fh;

# Text directory (below $_root_dir)
my $dir;

# Id of the current text and its escaped version
my $text_id;
my $text_id_esc;

# Instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
my $reader;

# Result of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
my $tree_data;

# The following variables are only used inside the recursive function 'retr_info'

# Index of the array of child elements of an element in $tree_data
# (the value depends on DEBUG)
my $_IDX;

# Element from $tree_data
my $e;

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values of some tags)
my $add_one;
my $fval;

# Hash for the indices of whitespace nodes (needed to correct from-values).
# Idea: when closing an element, check if its from-index minus 1 refers to
# a whitespace node (meaning: 'from-index - 1' is a key in %ws).
# If this is _not_ the case, then the from-value is one too high
# => correct it by subtracting 1.
my %ws;

# Index variable used in loops
my $c;
Peter Harders6f526a32020-06-29 21:44:41 +0200179
Peter Harders6f526a32020-06-29 21:44:41 +0200180
181#
182# ~~~ main ~~~
183#
184
185# ~ initializations ~
186
# With DEBUG, the elements of $tree_data additionally carry line numbers,
# which moves the array of child elements one index further.
$_IDX = DEBUG ? 5 : 4;

$fval = 0;

# Normalize the header expressions: allow arbitrary attributes ('[^>]*')
# between the tag name and the rest of the expression
s!^([^\s]+)(.*)$!$1\[\^>\]*$2!
  for ($_CORP_HEADER_BEG, $_DOC_HEADER_BEG, $_TEXT_HEADER_BEG);


# ~ read input and write output (text by text) ~

# Content before and after a matched tag on the current input line
my $pfx;
my $sfx;

# Text line counter (needed for whitespace handling)
my $tl = 0;

# Read from stdin, unless an input file is given
$input_fh = *STDIN;

# Maybe not necessary
$data->reset;

$dir = "";

if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

# Position of the closing text body tag in the current line
my $pos;

# Encoding of the input (may be overwritten by the XML declaration)
my $input_enc = 'UTF-8';

# Length of the closing text body tag, including the closing '>'
my $l = 1 + length('</' . $_TEXT_BODY);
Peter Harders6f526a32020-06-29 21:44:41 +0200225
Akron347be812020-09-29 07:52:52 +0200226# ~ loop (reading input document) ~
Peter Harders6f526a32020-06-29 21:44:41 +0200227
# Main loop: read the input line by line. Headers are parsed and written
# to the zip stream as soon as they occur; the lines of a text body are
# buffered and converted as a whole, when the closing tag of the text
# body is reached.
MAIN: while ( <$input_fh> ){

  # remove (multi-line) XML comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding from the XML declaration.
  # (Backreferences are not available inside a bracketed character class,
  # so the value is matched as a run of non-quote characters.)
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^'"]+)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){

    # ~ start of text body ~

    $pfx = $1;
    $sfx = $2;

    # The opening tag has to stand on a line of its own
    if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and possibly morpho.xml and/or
        # tokenization files (see $_tok_file_ext, $_tok_file_con, $_tok_file_agg)

        # The closing tag has to stand on a line of its own:
        # everything before and after the tag has to be whitespace
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # $dir is only set, if a text header preceded this text body
        if ($dir ne "") {

          $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );

          # See notes on whitespace handling
          my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

          # XCT_LINE_NUMBERS is only needed for debugging
          # (see XML::CompactTree::XS)
          $param |= XCT_LINE_NUMBERS if DEBUG;
          $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);

          $structures->reset;

          $tokens->reset if $_TOKENS_PROC;

          # ~ whitespace related issue ~
          $add_one = 0;
          %ws = ();

          # ~ recursion ~
          retr_info(1, \$tree_data->[2] ); # parse input data

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
          };

          # ~ write data.xml ~
          $data->to_zip(
            $zipper->new_stream("$dir/${_data_file}"),
            $text_id_esc
          );

          # ~ tokenization ~
          if ($_GEN_TOK_EXT) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
              $text_id_esc
            );
          };

          if ($_GEN_TOK_INT) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
              $text_id_esc
            );

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
              $text_id_esc
            );

            $aggr_tok->reset;
            $cons_tok->reset;
          };

          # Take the sentence boundaries from the external tokenizer
          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          }

          # ~ write structures ~
          if (!$structures->empty) {
            $structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          if ($_TOKENS_PROC && !$tokens->empty) {
            $tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
              $text_id_esc,
              $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
            );
          };

          $dir = ""; # reinit.

          # Maybe not necessary
          $data->reset;

        } else { # $dir eq ""

          # NOTE(review): $data is an object here, so the interpolation yields
          # its stringification - confirm whether the buffered text was meant.
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
        }

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best to keep the stripping of whitespace, to remove the if-clause
      #       and to insert a blank by default (possibly with an option on how newlines in
      #       primary text should be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ###       do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
      if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents

        $tl++; # counter for text lines

        s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    };

  } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {

    # ~ start of header ~
    $pfx = $1;
    my $content = "$2\n";

    # Nothing but whitespace may precede the opening header tag
    if ($pfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file;

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: main(): text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while

$zipper->close;

$ext_tok->close if $_GEN_TOK_EXT;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100444
Peter Hardersd892a582020-02-12 15:45:22 +0100445
# Recursively called function to handle XML tree data.
#
# Arguments:
#   $rl        - recursion level (1 = topmost level inside retr_info(),
#                which should always be the level of the tag $_TEXT_BODY)
#   $nodes_ref - reference to an array reference of nodes, as created by
#                XML::CompactTree::XS
#
# Text and whitespace nodes are appended to $data; for every element node
# an annotation with its attributes, offsets and level is recorded in
# $structures (and additionally in $tokens for $_TOKENS_TAG elements).
# The file-scoped variables $add_one and %ws carry the whitespace state
# across recursion levels. Dies on node types that are not handled.
sub retr_info {
  my ($rl, $nodes_ref) = @_;

  # With sentence splits taken from the tokenizer, 's' elements of the input
  # are attached to a dummy annotation instead of being added to $structures
  my $dummy_anno;
  if ($use_tokenizer_sentence_splits) {
    $dummy_anno = $structures->new_dummy_annotation;
  }

  # Iteration through all array elements
  # See notes on how 'XML::CompactTree::XS' works and
  # see 'NODE TYPES' in manpage of XML::LibXML::Reader
  foreach my $e (@{$$nodes_ref}) {

    # Element node
    if ($e->[0] == XML_READER_TYPE_ELEMENT) {

      #~~~~
      # from here: tag-node (opening)
      #~~~~

      my $anno;

      # $e->[1] represents the tag name
      if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
        $anno = $dummy_anno;
      } else {
        $anno = $structures->add_new_annotation($e->[1]);
      }

      # Add element also to token list
      if ($_TOKENS_PROC && $e->[1] eq $_TOKENS_TAG) {
        $tokens->add_annotation($anno);
      };

      # Handle attributes (if attributes exist)
      if (defined $e->[3]) {

        # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
        # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
        my $attrs = $e->[3];

        # '$i' references the 'key' and '$i + 1' the 'value'
        for (my $i = 0; $i < @$attrs; $i += 2) {
          $anno->add_attribute(@{$attrs}[$i, $i + 1]);
        };
      };

      # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);

      #~~~~
      # until here: tag-node (opening)
      #~~~~

      # Call function recursively.
      # Do no recursion, if $e->[$_IDX] is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      if (defined $e->[$_IDX]) {

        # Recursion with array of child-nodes
        retr_info($rl + 1, \$e->[$_IDX]);
      }

      #~~~~~
      # from here: tag-node (closing)
      #~~~~~

      # NOTE: use $pos, because the offsets are _between_ the characters
      # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $pos = $data->position;

      # Handle structures and tokens

      my $fval = $anno->from;

      # ~ whitespace related issue ~
      if ($fval > 0 && !exists $ws{$fval - 1}) {

        # ~ previous node was a text-node ~
        $anno->set_from($fval - 1);
      }

      # in case this fails, check input
      if (($fval - 1) > $pos) {
        die $log->fatal("text_id='$text_id', " .
                          "processing of structures: " .
                          "from-value ($fval) is 2 or more greater " .
                          "than to-value ($pos) => please check. Aborting");
      };

      # TODO: find example for which this case applies
      #  maybe this is not necessary anymore, because the above correction of the from-value suffices
      #
      # TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
      #  do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($pos) if $fval == $pos + 1;
      $anno->set_to($pos);
      $anno->set_level($rl);

      # Clean up whitespace
      delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};

      #~~~~
      # until here: tag-node (closing)
      #~~~~
    }

    # Text node
    elsif ($e->[0] == XML_READER_TYPE_TEXT) {

      $add_one = 1;
      $data->append($e->[1]);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($e->[1]);
    }

    # not yet handled type
    else {

      die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
    };
  };
};
591
Peter Harders6f526a32020-06-29 21:44:41 +0200592
Akrond949e182020-02-14 12:23:57 +0100593__END__
594
595=pod
596
597=encoding utf8
598
599=head1 NAME
600
601tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
602
603=head1 SYNOPSIS
604
605 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
606
607=head1 DESCRIPTION
608
Akronee434b12020-07-08 12:53:01 +0200609C<tei2korapxml> is a script to convert TEI P5 and
610L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
611based documents to the
612L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
613If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100614read from C<STDIN>. If no specific output is defined, data is written
615to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200616
Akrond949e182020-02-14 12:23:57 +0100617This program is usually called from inside another script.
618
Akronee434b12020-07-08 12:53:01 +0200619=head1 FORMATS
620
621=head2 Input restrictions
622
623=over 2
624
625=item
626
Akronee434b12020-07-08 12:53:01 +0200627TEI P5 formatted input with certain restrictions:
628
629=over 4
630
631=item
632
633B<mandatory>: text-header with integrated textsigle, text-body
634
635=item
636
637B<optional>: corp-header with integrated corpsigle,
638doc-header with integrated docsigle
639
640=back
641
642=item
643
Tokens inside the primary text must not be
separated by newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between two tokens could lead to additional blanks
where there should be none (e.g. punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(See also the code section C<~ whitespace handling ~>.)
651
652=back
653
654=head2 Notes on the output
655
656=over 2
657
658=item
659
660zip file output (default on C<stdout>) with utf8 encoded entries
661(which together form the KorAP-XML format)
662
663=back
664
Akrond949e182020-02-14 12:23:57 +0100665=head1 INSTALLATION
666
667C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
668these bindings are available, the preferred way to install the script is
669to use L<cpanm|App::cpanminus>.
670
671 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
672
673In case everything went well, the C<tei2korapxml> tool will
674be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200675
Akrond949e182020-02-14 12:23:57 +0100676Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
677
678=head1 OPTIONS
679
680=over 2
681
Akron4e603a52020-07-27 14:23:49 +0200682=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100683
Akron4e603a52020-07-27 14:23:49 +0200684The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100685
686=item B<--help|-h>
687
688Print help information.
689
690=item B<--version|-v>
691
692Print version information.
693
Akron4e603a52020-07-27 14:23:49 +0200694=item B<--tokenizer-call|-tc>
695
696Call an external tokenizer process that tokenizes
697a single line from STDIN and outputs one token per line.
698
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200699=item B<--tokenizer-korap|-tk>
700
701Use the standard KorAP/DeReKo tokenizer.
702
Akron6d7b8e42020-09-29 07:37:41 +0200703=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200704
705Tokenize the data using two embedded tokenizers
706that take an I<aggressive> and a I<conservative>
707approach, respectively.
708
Akron1a5271a2021-02-18 13:18:15 +0100709=item B<--inline-tokens> <foundry>#[<file>]
710
711Define the foundry and file (without extension)
712to store inline token information in.
713If L</KORAPXMLTEI_INLINE> is set, this will contain
714annotations as well.
715Defaults to C<tokens> and C<morpho>.
716
Marc Kupietz985da0c2021-02-15 19:29:50 +0100717=item B<--use-tokenizer-sentence-splits|-s>
718
719Replace existing sentence boundary information with (or, if none exists, add)
720the sentence boundaries provided by the tokenizer (currently only supported for the KorAP tokenizer).
721
Akron3378dfd2020-08-01 15:01:36 +0200722=item B<--log|-l>
723
724Loglevel for I<Log::Any>. Defaults to C<notice>.
725
Akrond949e182020-02-14 12:23:57 +0100726=back
727
Akronb3649472020-09-29 08:24:46 +0200728=head1 ENVIRONMENT VARIABLES
729
730=over 2
731
732=item B<KORAPXMLTEI_DEBUG>
733
734Activate minimal debugging.
735Defaults to C<false>.
736
737=item B<KORAPXMLTEI_INLINE>
738
739Process inline annotations, if present.
740Defaults to C<false>.
741
742=back
743
Akrond949e182020-02-14 12:23:57 +0100744=head1 COPYRIGHT AND LICENSE
745
Marc Kupietze955ecc2021-02-17 17:42:01 +0100746Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100747
748Author: Peter Harders
749
Akronaabd0952020-09-29 07:35:08 +0200750Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100751
752L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
753Corpus Analysis Platform at the
754L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
755member of the
756L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
757
758This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100759L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100760
761=cut
Akronf8088e62021-02-18 16:18:59 +0100762
763# NOTES
764
765## Notes on how 'XML::CompactTree::XS' works
766
767Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
768
769Print out name of 'node2' for the above example:
770
771echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
772
773Exploring the structure of $data ( = reference to below array ):
774
775[ 0: XML_READER_TYPE_DOCUMENT,
776 1: ?
777 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see main(): retr_info( \$tree_data->[2] ))
778 1: 'node'
779 2: ?
780 3: HASH (attributes)
781 4: 1 (line number)
782 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
783 1: 'node1'
784 2: ?
785 3: undefined (no attributes)
786 4: 1 (line number)
787 5: [ 0: [ 0: XML_READER_TYPE_TEXT
788 1: 'some '
789 ]
790 1: [ 0: XML_READER_TYPE_ELEMENT
791 1: 'n'
792 2: ?
793 3: undefined (no attributes)
794 4: 1 (line number)
795 5: undefined (no child-nodes)
796 ]
797 2: [ 0: XML_READER_TYPE_TEXT
798 1: ' text'
799 ]
800 ]
801 ]
802 1: [ 0: XML_READER_TYPE_ELEMENT
803 1: 'node2'
804 2: ?
805 3: undefined (no attributes)
806 4: 1 (line number)
807 5: [ 0: [ 0: XML_READER_TYPE_TEXT
808 1: 'more-text'
809 ]
810 ]
811 ]
812 ]
813 ]
814 ]
815]
816
817$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
818
819ref($data->[2]) == ARRAY (with 1 element for 'node')
820ref($data->[2]->[0]) == ARRAY (with 6 elements)
821
822$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
823$data->[2]->[0]->[1] == 'node'
824ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
825$data->[2]->[0]->[4] == 1 (line number)
826ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
827 # child-nodes of actual node (see $_IDX)
828
829ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
830$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
831$data->[2]->[0]->[5]->[0]->[1] == 'node1'
832$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
833$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
834ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
835
836ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
837$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
838$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
839
840ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
841$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
842$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
843$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
844$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
845$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
846
847ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
848$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
849$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
850
851
852retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
853Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
854${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
855
856
857## Notes on whitespace handling
858
859Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
860(see function 'retr_info()').
861
862Definition of significant and insignificant whitespace
863(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
864
865Significant whitespace is part of the document content and should be preserved.
866Insignificant whitespace is used when editing XML documents for readability.
867These whitespaces are typically not intended for inclusion in the delivery of the document.
868
869### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
870
871The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
872 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
873
874When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
875 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
876 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
877
878echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
879
880
881Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
882
883Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
884 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
885
886The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
887 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
888
889The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
890 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
891
892When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
893 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
894 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
895 the last read 'non-tag'-node has to be corrected (see [1]).
896
897For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
898 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
899
900[1]
901Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
902 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
903 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
904
905[2]
906Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is in both cases handled as a
907 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
908
909The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
910 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
911
912Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
913 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
914
915
916## Notes on whitespace fixing
917
918The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
919 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
920
921It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
922 example further down and notes on 'Input restrictions' in the manpage).
923
924Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
925
926Examples (how primary text with linebreaks would be converted by below code):
927
928 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
929 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
930
931Blanks are inserted before the 1st character:
932
933 NOTE: not stringent ('...' stands for text):
934
935 beg1............................end1 => no blank before 'beg1'
936 beg2....<pb/>...................end2 => no blank before 'beg2'
937 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
938 beg4....<test>ok</test>.........end4 => blank before 'beg4'
939
940 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
941 ^
942 |_blank between 'end3' and 'beg4'
943
944
945## Notes on segfault prevention
946
947binmode on the input handle prevents segfaulting of 'XML::LibXML::Reader' inside 'main()'
948(see notes on 'PerlIO layers' in 'man XML::LibXML').
949Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
950See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
951See in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.