blob: ce31dddcd5ee1fb4bb248d89638576a0d5b03b87 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
# Program version and the banner printed by --help and --version
our $VERSION = '1.00';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Compile-time debug switch, taken from the environment.
# Set KORAPXMLTEI_DEBUG to 1 for minimal additional debug output
# (no need to be parametrized).
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Option variables together with their defaults
my $_root_dir          = '.';
my $input_fname        = '';
my $tokenizer_call;
my $tokenizer_korap;
my $_GEN_TOK_INT;
my $use_tokenizer_sentence_splits;
my $inline_tokens      = 'tokens#morpho';
my $inline_structures  = 'struct#structure';
my $skip_inline_tokens = 0;
my $_tok_dir           = 'base';
my $_data_file         = 'data';
my $_header_file       = 'header';
my $_tok_file_ext      = 'tokens';
my $log_level          = 'notice';

# Parse options from the command line
GetOptions(
  'root|r=s'                        => \$_root_dir,
  'input|i=s'                       => \$input_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$_GEN_TOK_INT,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'base-foundry=s'                  => \$_tok_dir,
  'data-file=s'                     => \$_data_file,
  'header-file=s'                   => \$_header_file,
  'tokens-file=s'                   => \$_tok_file_ext,
  'log|l=s'                         => \$log_level,

  # Print the selected POD sections to STDOUT
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version banner to STDOUT
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);

# Establish logger (log messages are written UTF-8 encoded to STDERR)
binmode(STDERR, ":encoding(UTF-8)");
Log::Any::Adapter->set('Stderr', log_level => $log_level);

if (DEBUG) {
  $log->notice('Debugging is activated');
};
81
# Tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# Tag that opens a header in the input
# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# Sentence splits can only be taken from the KorAP tokenizer
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
};

# Optional external tokenizer:
# either a user-supplied command (-tc) or the KorAP tokenizer (-tk).
# It stays undefined if neither option was given.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally (inside an eval,
  # so a failure to load is silent). Fail with a clear message here
  # instead of an obscure "Can't locate object method" error.
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal("KorAP tokenizer requested (-tk), but KorAP::XML::TEI::Tokenizer::KorAP could not be loaded");
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};
Peter Harders6f526a32020-06-29 21:44:41 +0200101##
102
Akron0c41ab32020-09-29 07:33:33 +0200103
Akron4e3c7e32021-02-18 15:19:53 +0100104#
105# ~~~ constants ~~~
106#
107
108
## Internal tokenization (used with --tokenizer-internal)
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##

# Name of the directory and the file containing all inline structure information
# except for $_TOKENS_TAG information.
# The option value has the form '<foundry>#[<file>]'; the default file name
# is appended, so that it is used if no '#<file>' part was given.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';

# '<foundry>#' (separator given, but file omitted) splits into an empty
# file name - fall back to the default in that case as well
$_structure_file = 'structure' unless length $_structure_file;

# Name of the directory and the file containing all inline token information,
# i.e. tokens of the $_TOKENS_TAG (unless --skip-inline-tokens is set)
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';

# Same fallback as for the structure file
$_tokens_file = 'morpho' unless length $_tokens_file;

# Name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = "w";

# Handling inline annotations (inside $_TOKENS_TAG):
# 1 if KORAPXMLTEI_INLINE is set to a true value, 0 otherwise
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;

# Initialize Token- and Structure-Collector
my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
Akron09e0b2c2020-07-28 15:57:01 +0200137
Peter Harders6f526a32020-06-29 21:44:41 +0200138
139#
140# ~~~ variables ~~~
141#
142
Akron09e0b2c2020-07-28 15:57:01 +0200143
# Input file handle (default: stdin)
my $input_fh = *STDIN;

# Text directory (below $_root_dir); empty as long as no text header was read
my $dir = '';

# Id of the current text and its escaped version
my ($text_id, $text_id_esc);

# Index of the array of child elements in an element of $tree_data.
# With DEBUG, line numbers are included in the elements of $tree_data,
# which shifts the child array by one position.
my $_IDX = DEBUG ? 5 : 4;

# The following variables are only used inside the recursive function 'retr_info'
my $e;        # element from $tree_data
my $c;        # index variable used in loops

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $fval = 0;

# Hash for indices of whitespace-nodes (needed to recorrect from-values).
# Idea: when closing an element, check if its from-index minus 1 refers
# to a whitespace-node (means: 'from-index - 1' is a key in %ws).
# If this is _not_ the case, then the from-value is one too high
# => correct it by subtracting 1
my %ws;

# Text line counter (needed for whitespace handling)
my $tl = 0;

# Maybe not necessary
$data->reset;

# Read from the given file instead of stdin, if an input file name was passed
if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

# State used by the main loop
my $sfx;                                     # remainder of the line after the opening text-body tag
my $pos;                                     # position of the closing text-body tag in a line
my $input_enc = 'UTF-8';                     # input encoding (may be overwritten by the XML declaration)
my $l = length('</' . $_TEXT_BODY) + 1;      # length of the closing text-body tag
Peter Harders6f526a32020-06-29 21:44:41 +0200200
Akron347be812020-09-29 07:52:52 +0200201# ~ loop (reading input document) ~
Peter Harders6f526a32020-06-29 21:44:41 +0200202
# Main loop: read the input document line by line.
# - A line with an XML declaration sets the input encoding.
# - A line with an opening header tag is handed over to
#   KorAP::XML::TEI::Header, which continues reading from $input_fh.
# - A line with the opening text-body tag starts the collection of the
#   text body, which is parsed and written to the zip stream as soon as
#   the closing text-body tag is found.
MAIN: while ( <$input_fh> ){

  $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)

  # Set input encoding.
  # The value is everything between the matching quotes
  # (a backreference is not available inside a bracketed character class,
  # therefore both quote characters are excluded explicitly).
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^'"]+)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){

    # ~ start of text body ~

    $sfx = $2;

    # The opening tag has to be on a line of its own
    if ($1 !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # The closing tag has to be on a line of its own
        # ($l is the length of the closing tag)
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was read before this text body.
        # The primary data collector is still empty at this point
        # (it is only filled by retr_info), so the start of the buffered
        # text body is logged to help identifying the text.
        if ($dir eq '') {
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($buf_in, 0, 200));
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$buf_in</text>",
          huge => 1
        );

        # See notes on whitespace handling
        my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

        # XCT_LINE_NUMBERS is only needed for debugging
        # (see XML::CompactTree::XS)
        $param |= XCT_LINE_NUMBERS if DEBUG;
        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

        $structures->reset;

        $tokens->reset unless $skip_inline_tokens;

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # ~ recursion ~
        retr_info(1, \$tree_data->[2] ); # parse input data

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
        };

        # ~ write data.xml ~
        $data->to_zip(
          $zipper->new_stream("$dir/${_data_file}.xml"),
          $text_id_esc
        );

        # ~ tokenization ~
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
            $text_id_esc
          );
        };

        if ($_GEN_TOK_INT) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->reset;
          $cons_tok->reset;
        };

        # Take sentence boundaries from the tokenizer
        if ($use_tokenizer_sentence_splits) {
          $ext_tok->sentencize_from_previous_input($structures);
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          );
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
      # an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
      if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents

        $tl++; # counter for text lines

        s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    };

  } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # ~ start of header ~
    my $content = "$2\n";

    # Nothing but whitespace is allowed before the opening header tag
    if ($1 !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header (continues reading from the input file handle)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while

# Finish the zip output
$zipper->close;

# Shut down the external tokenizer, if one was used
$ext_tok->close if $ext_tok;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100419
Peter Hardersd892a582020-02-12 15:45:22 +0100420
# Recursively called function to handle XML tree data.
#
# Walks over the nodes of one level of the tree created by
# XML::CompactTree::XS, appends text and whitespace to the primary data
# ($data) and records an annotation with from/to offsets and nesting
# level for each element in $structures (and in $tokens for
# $_TOKENS_TAG elements, unless inline tokens are skipped).
#
# Parameters:
#   1. recursion level
#      (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
#   2. reference to an array reference of nodes
#
# Shared state: $add_one and %ws carry whitespace information between
# nodes and recursion levels and are reset by the caller for each text.
# Dies (after logging a fatal message) on unknown node types and on
# inconsistent offsets.
sub retr_info {
  my $rl = shift;

  # Iteration through all array elements
  # ($_[0] is a reference to an array reference)
  # See notes on how 'XML::CompactTree::XS' works and
  # see 'NODE TYPES' in manpage of XML::LibXML::Reader
  # (the loop variable is lexical, so every recursion level has its own)
  foreach my $e (@{${$_[0]}}) {

    # Element node
    if ($e->[0] == XML_READER_TYPE_ELEMENT) {

      #~~~~
      # from here: tag-node (opening)
      #~~~~

      # $e->[1] represents the tag name
      # Skip sentences (but not their content), as the sentence
      # boundaries are taken from the tokenizer in that case
      if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
        if (defined $e->[$_IDX]) {
          retr_info($rl+1, \$e->[$_IDX]);
        }
        next;
      }

      my $anno = $structures->add_new_annotation($e->[1]);

      # Add element also to token list
      if (!$skip_inline_tokens && $e->[1] eq $_TOKENS_TAG) {
        $tokens->add_annotation($anno);
      };

      # Handle attributes (if attributes exist)
      if (defined $e->[3]) {

        # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
        # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
        # note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
        for (my $c = 0; $c < @{$e->[3]}; $c += 2) {

          # '$c' references the 'key' and '$c+1' the 'value'
          $anno->add_attribute(
            @{$e->[3]}[$c, $c + 1]
          );
        };
      };

      # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);


      #~~~~
      # until here: tag-node (opening)
      #~~~~


      # Call function recursively
      # do no recursion, if $e->[$_IDX] is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      if (defined $e->[$_IDX]) {

        # Recursion with array of child-nodes
        retr_info($rl+1, \$e->[$_IDX]);
      }


      #~~~~~
      # from here: tag-node (closing)
      #~~~~~

      # NOTE: use $pos, because the offsets are _between_ the characters
      # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $pos = $data->position;

      # Handle structures and tokens

      # from-value as set on opening the tag (read after the recursion)
      my $fval = $anno->from;

      # ~ whitespace related issue ~
      if ($fval > 0 && not exists $ws{$fval - 1}) {

        # ~ previous node was a text-node ~
        $anno->set_from($fval - 1);
      }

      # in case this fails, check input
      if (($fval - 1) > $pos) {
        die $log->fatal("text_id='$text_id', " .
                          "processing of structures: " .
                          "from-value ($fval) is 2 or more greater " .
                          "than to-value ($pos) => please check. Aborting");
      };

      # TODO: find example for which this case applies
      #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
      #
      # TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
      #  do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($pos) if $fval == $pos + 1;
      $anno->set_to($pos);
      $anno->set_level($rl);

      # Clean up whitespace
      delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};


      #~~~~
      # until here: tag-node (closing)
      #~~~~
    }

    # Text node
    elsif ($e->[0] == XML_READER_TYPE_TEXT){

      $add_one = 1;
      $data->append($e->[1]);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      #  ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($e->[1]);
    }

    # not yet handled type
    else {

      die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
    };
  };
};
561
Peter Harders6f526a32020-06-29 21:44:41 +0200562
Akrond949e182020-02-14 12:23:57 +0100563__END__
564
565=pod
566
567=encoding utf8
568
569=head1 NAME
570
571tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
572
573=head1 SYNOPSIS
574
575 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
576
577=head1 DESCRIPTION
578
Akronee434b12020-07-08 12:53:01 +0200579C<tei2korapxml> is a script to convert TEI P5 and
580L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
581based documents to the
582L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
583If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100584read from C<STDIN>. If no specific output is defined, data is written
585to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200586
Akrond949e182020-02-14 12:23:57 +0100587This program is usually called from inside another script.
588
Akronee434b12020-07-08 12:53:01 +0200589=head1 FORMATS
590
591=head2 Input restrictions
592
593=over 2
594
595=item
596
Akronee434b12020-07-08 12:53:01 +0200597TEI P5 formatted input with certain restrictions:
598
599=over 4
600
601=item
602
603B<mandatory>: text-header with integrated textsigle, text-body
604
605=item
606
607B<optional>: corp-header with integrated corpsigle,
608doc-header with integrated docsigle
609
610=back
611
612=item
613
Tokens inside the primary text must not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(See also code section C<~ whitespace handling ~>.)
621
622=back
623
624=head2 Notes on the output
625
626=over 2
627
628=item
629
630zip file output (default on C<stdout>) with utf8 encoded entries
631(which together form the KorAP-XML format)
632
633=back
634
Akrond949e182020-02-14 12:23:57 +0100635=head1 INSTALLATION
636
637C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
638these bindings are available, the preferred way to install the script is
639to use L<cpanm|App::cpanminus>.
640
641 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
642
643In case everything went well, the C<tei2korapxml> tool will
644be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200645
Akrond949e182020-02-14 12:23:57 +0100646Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
647
648=head1 OPTIONS
649
650=over 2
651
Akron4e603a52020-07-27 14:23:49 +0200652=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100653
Akron4e603a52020-07-27 14:23:49 +0200654The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100655
656=item B<--help|-h>
657
658Print help information.
659
660=item B<--version|-v>
661
662Print version information.
663
Akron4e603a52020-07-27 14:23:49 +0200664=item B<--tokenizer-call|-tc>
665
666Call an external tokenizer process, that will tokenize
667a single line from STDIN and outputs one token per line.
668
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200669=item B<--tokenizer-korap|-tk>
670
671Use the standard KorAP/DeReKo tokenizer.
672
Akron6d7b8e42020-09-29 07:37:41 +0200673=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200674
675Tokenize the data using two embedded tokenizers,
676that will take an I<Aggressive> and a I<conservative>
677approach.
678
Akron75d63142021-02-23 18:40:56 +0100679=item B<--skip-inline-tokens>
680
681Boolean flag indicating that inline tokens should not
682be processed. Defaults to false (meaning inline tokens will be processed).
683
Akron1a5271a2021-02-18 13:18:15 +0100684=item B<--inline-tokens> <foundry>#[<file>]
685
686Define the foundry and file (without extension)
687to store inline token information in.
688If L</KORAPXMLTEI_INLINE> is set, this will contain
689annotations as well.
690Defaults to C<tokens> and C<morpho>.
691
Akrondd0be8f2021-02-18 19:29:41 +0100692=item B<--inline-structures> <foundry>#[<file>]
693
694Define the foundry and file (without extension)
695to store inline structure information in.
Defaults to C<struct> and C<structure>.
697
Akron26a71522021-02-19 10:27:37 +0100698=item B<--base-foundry> <foundry>
699
700Define the base foundry to store newly generated
701token information in.
702Defaults to C<base>.
703
704=item B<--data-file> <file>
705
706Define the file (without extension)
707to store primary data information in.
708Defaults to C<data>.
709
710=item B<--header-file> <file>
711
712Define the file name (without extension)
713to store header information on
714the corpus, document, and text level in.
715Defaults to C<header>.
716
Marc Kupietz985da0c2021-02-15 19:29:50 +0100717=item B<--use-tokenizer-sentence-splits|-s>
718
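Replace existing sentence boundary information with, or add new
sentence boundary information from, the sentence splits
provided by the KorAP tokenizer (currently the only tokenizer that
supports this, so C<--tokenizer-korap> is required).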
721
Akron91705d72021-02-19 10:59:45 +0100722=item B<--tokens-file> <file>
723
724Define the file (without extension)
725to store generated token information in
726(either from the KorAP tokenizer or an externally called tokenizer).
727Defaults to C<tokens>.
728
Akron3378dfd2020-08-01 15:01:36 +0200729=item B<--log|-l>
730
731Loglevel for I<Log::Any>. Defaults to C<notice>.
732
Akrond949e182020-02-14 12:23:57 +0100733=back
734
Akronb3649472020-09-29 08:24:46 +0200735=head1 ENVIRONMENT VARIABLES
736
737=over 2
738
739=item B<KORAPXMLTEI_DEBUG>
740
741Activate minimal debugging.
742Defaults to C<false>.
743
744=item B<KORAPXMLTEI_INLINE>
745
746Process inline annotations, if present.
747Defaults to C<false>.
748
749=back
750
Akrond949e182020-02-14 12:23:57 +0100751=head1 COPYRIGHT AND LICENSE
752
Marc Kupietze955ecc2021-02-17 17:42:01 +0100753Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100754
755Author: Peter Harders
756
Akronaabd0952020-09-29 07:35:08 +0200757Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100758
759L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
760Corpus Analysis Platform at the
761L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
762member of the
763L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
764
765This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100766L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100767
768=cut
Akronf8088e62021-02-18 16:18:59 +0100769
770# NOTES
771
772## Notes on how 'XML::CompactTree::XS' works
773
774Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
775
776Print out name of 'node2' for the above example:
777
778echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
779
780Exploring the structure of $data ( = reference to below array ):
781
782[ 0: XML_READER_TYPE_DOCUMENT,
783 1: ?
Akron91577922021-02-19 10:32:54 +0100784 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100785 1: 'node'
786 2: ?
787 3: HASH (attributes)
788 4: 1 (line number)
789 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
790 1: 'node1'
791 2: ?
792 3: undefined (no attributes)
793 4: 1 (line number)
794 5: [ 0: [ 0: XML_READER_TYPE_TEXT
795 1: 'some '
796 ]
797 1: [ 0: XML_READER_TYPE_ELEMENT
798 1: 'n'
799 2: ?
800 3: undefined (no attributes)
801 4: 1 (line number)
802 5: undefined (no child-nodes)
803 ]
804 2: [ 0: XML_READER_TYPE_TEXT
805 1: ' text'
806 ]
807 ]
808 ]
809 1: [ 0: XML_READER_TYPE_ELEMENT
810 1: 'node2'
811 2: ?
812 3: undefined (no attributes)
813 4: 1 (line number)
814 5: [ 0: [ 0: XML_READER_TYPE_TEXT
815 1: 'more-text'
816 ]
817 ]
818 ]
819 ]
820 ]
821 ]
822]
823
824$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
825
826ref($data->[2]) == ARRAY (with 1 element for 'node')
827ref($data->[2]->[0]) == ARRAY (with 6 elements)
828
829$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
830$data->[2]->[0]->[1] == 'node'
831ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
832$data->[2]->[0]->[4] == 1 (line number)
833ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
834 # child-nodes of actual node (see $_IDX)
835
836ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
837$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
838$data->[2]->[0]->[5]->[0]->[1] == 'node1'
839$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
840$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
841ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
842
843ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
844$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
845$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
846
847ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
848$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
849$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
850$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
851$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
852$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
853
854ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
855$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
856$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
857
858
859retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
860Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
861${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
862
863
864## Notes on whitespace handling
865
866Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
867(see function 'retr_info()').
868
869Definition of significant and insignificant whitespace
870(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
871
872Significant whitespace is part of the document content and should be preserved.
873Insignificant whitespace is used when editing XML documents for readability.
874These whitespaces are typically not intended for inclusion in the delivery of the document.
875
876### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
877
878The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
879 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
880
881When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
882 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
883 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
884
885echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
886
887
888Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
889
890Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
891 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
892
893The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
894 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
895
896The assumption here is that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
897 provides a way to check whether this really _was_ the case for the last 2 'non-tag'-nodes when closing a tag:
898
899When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
900 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
901 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
902 the last read 'non-tag'-node has to be corrected (see [1]).
903
904For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
905 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
906
907[1]
908Now, what happens when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
909 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
910 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
911
912[2]
913In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', the ' ' is handled as a
914 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
915
916The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
917 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
918
919Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
920 and to-indices of the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
921
922
923## Notes on whitespace fixing
924
925The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
926 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
927
928It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
929 example further down and notes on 'Input restrictions' in the manpage).
930
931An old, rather crude first approach has nevertheless remained; it is not rigorous, but it also doesn't affect one-line text.
932
933Examples (how primary text with linebreaks would be converted by below code):
934
935 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
936 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
937
938Blanks are inserted before the 1st character:
939
940 NOTE: not stringent ('...' stands for text):
941
942 beg1............................end1 => no blank before 'beg1'
943 beg2....<pb/>...................end2 => no blank before 'beg2'
944 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
945 beg4....<test>ok</test>.........end4 => blank before 'beg4'
946
947 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
948 ^
949 |_blank between 'end3' and 'beg4'
950
951
952## Notes on segfault prevention
953
Akron91577922021-02-19 10:32:54 +0100954binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100955(see notes on 'PerlIO layers' in 'man XML::LibXML').
956Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
957See perluniintro: You can switch encodings on an already opened stream by using "binmode()".
958See perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.