blob: 046b98bec39c5587497d45d656856daaacbf64e4 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
our $VERSION = '1.00';

# Banner shown by --help and --version
our $VERSION_MSG = "\n" . 'tei2korapxml - v' . $VERSION . "\n";

# Set to 1 for minimal more debug output (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Targets of the command line options, preset with their defaults
my $_root_dir   = '.';              # name of root directory inside zip file
my $input_fname = '';               # input file (yet only TEI I5 Format accepted)
my $tokenizer_call;                 # Temporary argument for testing purposes
my $tokenizer_korap;                # use KorAP-tokenizer
my $_GEN_TOK_INT;                   # use intern tokenization (default = no)
my $use_tokenizer_sentence_splits;  # use KorAP tokenizer to split s (default=no)
my $inline_tokens     = 'tokens#morpho';
my $inline_structures = 'struct#structure';
my $_tok_dir          = 'base';
my $_data_file        = 'data';
my $_header_file      = 'header';
my $_tok_file_ext     = 'tokens';
my $log_level         = 'notice';

# Parse options from the command line
GetOptions(
  "root|r=s"                        => \$_root_dir,
  "input|i=s"                       => \$input_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$_GEN_TOK_INT,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'base-foundry=s'                  => \$_tok_dir,
  'data-file=s'                     => \$_data_file,
  'header-file=s'                   => \$_header_file,
  'tokens-file=s'                   => \$_tok_file_ext,
  'log|l=s'                         => \$log_level,

  # Print the selected POD sections together with the version banner
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version banner (with the synopsis only)
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);

# Log messages are written UTF-8 encoded to STDERR
binmode(STDERR, ":encoding(UTF-8)");
Log::Any::Adapter->set('Stderr', log_level => $log_level);

if (DEBUG) {
  $log->notice('Debugging is activated');
}
79
#
# ~~~ parameter (mandatory) ~~~
#

# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# TODO: IDS-specific (and redundant)
# Name of the header tag the main loop looks for
my $_HEADER_TAG = 'idsHeader';

# Sentence splits can only be requested together with the KorAP tokenizer
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
}

# Optional external tokenizer.
# An explicit tokenizer call takes precedence over the KorAP tokenizer.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}
elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally (inside an eval),
  # so a failed load would otherwise surface here as an obscure
  # "can't locate object method" error.
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal("KorAP tokenizer requested, but KorAP::XML::TEI::Tokenizer::KorAP could not be loaded");
  }

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};
Peter Harders6f526a32020-06-29 21:44:41 +0200102##
103
Akron0c41ab32020-09-29 07:33:33 +0200104
#
# ~~~ constants ~~~
#

## intern tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##

## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
# on/off: processing of ${_TOKENS_TAG}'s (default: 1)
my $_TOKENS_PROC = 1;

# Directory and file that receive all inline structure information
# (everything except the $_TOKENS_TAG information).
# The default file name is appended, so that a value without '#'
# still yields a file name.
my ($_structure_dir, $_structure_file) = split(/#/, $inline_structures . '#structure');
$_structure_file = $_structure_file . '.xml';

# Directory and file that receive all inline token information,
# i.e. the tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
my ($_tokens_dir, $_tokens_file) = split(/#/, $inline_tokens . '#morpho');
$_tokens_file = $_tokens_file . '.xml';

# name of tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = "w";

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = 0;
$_INLINE_ANNOT = 1 if $ENV{KORAPXMLTEI_INLINE};


#
# ~~~ variables ~~~
#

# Initialize Token- and Structure-Collector
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);

# input file handle (default: stdin)
my $input_fh = *STDIN;

# text directory (below $_root_dir)
my $dir = '';

# text sigle and its escaped version
my $text_id;
my $text_id_esc;

# The following are only used inside the recursive function 'retr_info'

# Index of the array of child elements in an element of $tree_data.
# The index is one higher in debug mode, because line numbers are
# included in the elements of $tree_data then.
my $_IDX = DEBUG ? 5 : 4;

# element from $tree_data
my $e;

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $fval = 0;

# Hash for indices of whitespace-nodes (needed to correct from-values).
# Idea: when closing an element, check if its from-index minus 1 refers
# to a whitespace-node (means: 'from-index - 1' is a key in %ws).
# If this is _not_ the case, then the from-value is one too high
# => correct it by subtracting 1
my %ws;

# index variable used in loops
my $c;


#
# ~~~ main ~~~
#

# ~ read input and write output (text by text) ~

# text line (needed for whitespace handling)
my $tl = 0;

# Maybe not necessary
$data->reset;

# Read from the given file instead of STDIN
if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

my $sfx;
my $pos;

# Encoding of the input (may be overwritten by the XML declaration)
my $input_enc = 'UTF-8';

# Length of the closing text-body tag (including the closing '>')
my $l = 1 + length('</' . $_TEXT_BODY);
Peter Harders6f526a32020-06-29 21:44:41 +0200207
# ~ loop (reading input document) ~
#
# The input is read line by line and three kinds of lines are acted upon:
#   * an XML declaration with an encoding (sets $input_enc),
#   * a line opening a header ($_HEADER_TAG), which is parsed by
#     KorAP::XML::TEI::Header and written to the zip stream,
#   * a line opening a text body ($_TEXT_BODY), whose following lines
#     are buffered up to the closing tag and then converted and written.
MAIN: while ( <$input_fh> ) {

  # remove HTML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding.
  # The value is matched non-greedily up to the closing quote
  # (a "\1" inside a bracketed character class would be an octal escape,
  # not a backreference).
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/ ) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ) {

    # ~ start of text body ~

    # Copy the captures at once - a later successful match resets them
    my ($before_tag, $after_tag) = ($1, $2);

    if ($before_tag !~ /^\s*$/ || $after_tag !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
        "line with opening text-body tag '${_TEXT_BODY}' " .
        "contains additional information ... => Aborting (line=$_)");
    };

    # text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      my $end_pos = index($_, '</' . $_TEXT_BODY);

      # ~ end of text body ~
      if ($end_pos >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # Nothing but whitespace is accepted around the closing tag
        if ((substr($_, 0, $end_pos) . substr($_, $l + $end_pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
            "line with closing text-body tag '${_TEXT_BODY}'".
            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was seen for this body
        # NOTE(review): $data is only filled by retr_info() below, so the
        # data excerpt in this message looks like it is always empty - confirm
        if ($dir eq '') {
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($data->data, 0, 200));
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$buf_in</text>",
          huge => 1
        );

        # See notes on whitespace handling
        my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

        # XCT_LINE_NUMBERS is only needed for debugging
        # (see XML::CompactTree::XS)
        $param |= XCT_LINE_NUMBERS if DEBUG;
        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

        $structures->reset;

        $tokens->reset if $_TOKENS_PROC;

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # ~ recursion ~
        retr_info(1, \$tree_data->[2] ); # parse input data

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
        };

        # ~ write data.xml ~
        $data->to_zip(
          $zipper->new_stream("$dir/${_data_file}.xml"),
          $text_id_esc
        );

        # ~ tokenization ~
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
            $text_id_esc
          );
        };

        if ($_GEN_TOK_INT) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->reset;
          $cons_tok->reset;
        };

        if ($use_tokenizer_sentence_splits) {
          $ext_tok->sentencize_from_previous_input($structures);
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        if ($_TOKENS_PROC && !$tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          );
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best to keep the stripping of whitespace, to remove
      # the if-clause and to insert a blank by default (possibly with an
      # option on how newlines in primary text should be handled - stripped
      # or replaced by a whitespace).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        $tl++; # counter for text lines

        # insert blank before 1st character (for 2nd line and consecutive lines)
        s/^(.)/ $1/ if $tl > 1;
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    };

  } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # ~ start of header ~

    # Copy the captures at once - a later successful match resets them
    my ($before_tag, $header_start) = ($1, $2);

    my $content = "$header_start\n";

    if ($before_tag !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
        "line with opening header tag" .
        " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while

# Finish the zip output and shut down the external tokenizer (if any)
$zipper->close;

$ext_tok->close if $ext_tok;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100426
Peter Hardersd892a582020-02-12 15:45:22 +0100427
# Recursively called function to handle XML tree data
#
# Walks a list of nodes as created by XML::CompactTree::XS and
#   * appends the content of text and whitespace nodes to $data,
#   * creates an annotation in $structures for every element node
#     (and registers it in $tokens, if it is a $_TOKENS_TAG element),
#     with from/to offsets taken from the position in $data.
#
# Arguments:
#   1. recursion level
#      (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
#   2. reference to an array reference containing the nodes to process
#
# Besides $data, $structures and $tokens, the file-level variables
# $add_one and %ws are modified. Dies on node types not yet handled
# and on inconsistent offsets.
sub retr_info {
  my ($rl, $nodes_ref) = @_;

  # With sentence splits from the tokenizer, 's' elements are collected
  # in a dummy annotation instead of being added to the structures
  my $dummy_anno;
  if ($use_tokenizer_sentence_splits) {
    $dummy_anno = $structures->new_dummy_annotation;
  }

  # Iteration through all array elements
  # See notes on how 'XML::CompactTree::XS' works and
  # see 'NODE TYPES' in manpage of XML::LibXML::Reader
  foreach my $e (@{${$nodes_ref}}) {

    # Element node
    if ($e->[0] == XML_READER_TYPE_ELEMENT) {

      #~~~~
      # from here: tag-node (opening)
      #~~~~

      my $anno;

      # $e->[1] represents the tag name
      if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
        $anno = $dummy_anno;
      } else {
        $anno = $structures->add_new_annotation($e->[1]);
      }

      # Add element also to token list
      if ($_TOKENS_PROC && $e->[1] eq $_TOKENS_TAG) {
        $tokens->add_annotation($anno);
      };

      # Handle attributes (if attributes exist)
      if (defined $e->[3]) {

        # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
        # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
        # note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
        for (my $c = 0; $c < @{$e->[3]}; $c += 2) {

          # '$c' references the 'key' and '$c+1' the 'value'
          $anno->add_attribute(
            @{$e->[3]}[$c, $c + 1]
          );
        };
      };

      # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);

      #~~~~
      # until here: tag-node (opening)
      #~~~~


      # Call function recursively
      # do no recursion, if $e->[$_IDX] is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      if (defined $e->[$_IDX]) {

        # Recursion with array of child-nodes
        retr_info($rl+1, \$e->[$_IDX]);
      }


      #~~~~~
      # from here: tag-node (closing)
      #~~~~~

      # NOTE: use $pos, because the offsets are _between_ the characters
      # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $pos = $data->position;

      # Handle structures and tokens

      # From-value as set on opening the tag
      # (kept unchanged for all checks below)
      my $fval = $anno->from;

      # ~ whitespace related issue ~
      if ($fval > 0 && not exists $ws{$fval - 1}) {

        # ~ previous node was a text-node ~
        $anno->set_from($fval - 1);
      }

      # in case this fails, check input
      if (($fval - 1) > $pos) {
        die $log->fatal("text_id='$text_id', " .
          "processing of structures: " .
          "from-value ($fval) is 2 or more greater " .
          "than to-value ($pos) => please check. Aborting");
      };

      # TODO: find example for which this case applies
      # maybe this is not necessary anymore, because the above recorrection of the from-value suffices
      #
      # TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
      # do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($pos) if $fval == $pos + 1;
      $anno->set_to($pos);
      $anno->set_level($rl);

      # Clean up whitespace
      delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};

      #~~~~
      # until here: tag-node (closing)
      #~~~~
    }

    # Text node
    elsif ($e->[0] == XML_READER_TYPE_TEXT) {

      $add_one = 1;
      $data->append($e->[1]);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($e->[1]);
    }

    # not yet handled type
    else {

      die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
    };
  };

  return;
}
572
Peter Harders6f526a32020-06-29 21:44:41 +0200573
Akrond949e182020-02-14 12:23:57 +0100574__END__
575
576=pod
577
578=encoding utf8
579
580=head1 NAME
581
582tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
583
584=head1 SYNOPSIS
585
586 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
587
588=head1 DESCRIPTION
589
Akronee434b12020-07-08 12:53:01 +0200590C<tei2korapxml> is a script to convert TEI P5 and
591L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
592based documents to the
593L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
594If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100595read from C<STDIN>. If no specific output is defined, data is written
596to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200597
Akrond949e182020-02-14 12:23:57 +0100598This program is usually called from inside another script.
599
Akronee434b12020-07-08 12:53:01 +0200600=head1 FORMATS
601
602=head2 Input restrictions
603
604=over 2
605
606=item
607
Akronee434b12020-07-08 12:53:01 +0200608TEI P5 formatted input with certain restrictions:
609
610=over 4
611
612=item
613
614B<mandatory>: text-header with integrated textsigle, text-body
615
616=item
617
618B<optional>: corp-header with integrated corpsigle,
619doc-header with integrated docsigle
620
621=back
622
623=item
624
Tokens inside the primary text must not be
separated by newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between two tokens could lead to additional blanks
where there should be none (e.g. punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
See also the code section C<~ whitespace handling ~>.
632
633=back
634
635=head2 Notes on the output
636
637=over 2
638
639=item
640
641zip file output (default on C<stdout>) with utf8 encoded entries
642(which together form the KorAP-XML format)
643
644=back
645
Akrond949e182020-02-14 12:23:57 +0100646=head1 INSTALLATION
647
648C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
649these bindings are available, the preferred way to install the script is
650to use L<cpanm|App::cpanminus>.
651
652 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
653
654In case everything went well, the C<tei2korapxml> tool will
655be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200656
Akrond949e182020-02-14 12:23:57 +0100657Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
658
659=head1 OPTIONS
660
661=over 2
662
Akron4e603a52020-07-27 14:23:49 +0200663=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100664
Akron4e603a52020-07-27 14:23:49 +0200665The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100666
667=item B<--help|-h>
668
669Print help information.
670
671=item B<--version|-v>
672
673Print version information.
674
Akron4e603a52020-07-27 14:23:49 +0200675=item B<--tokenizer-call|-tc>
676
Call an external tokenizer process that tokenizes
a single line from STDIN and outputs one token per line.
679
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200680=item B<--tokenizer-korap|-tk>
681
682Use the standard KorAP/DeReKo tokenizer.
683
Akron6d7b8e42020-09-29 07:37:41 +0200684=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200685
686Tokenize the data using two embedded tokenizers,
687that will take an I<Aggressive> and a I<conservative>
688approach.
689
=item B<--inline-tokens> <foundry>[#<file>]

Define the foundry and file (without extension)
to store inline token information in.
If L</KORAPXMLTEI_INLINE> is set, this will contain
annotations as well.
Defaults to C<tokens> and C<morpho>.
697
=item B<--inline-structures> <foundry>[#<file>]

Define the foundry and file (without extension)
to store inline structure information in.
Defaults to C<struct> and C<structure>.
703
Akron26a71522021-02-19 10:27:37 +0100704=item B<--base-foundry> <foundry>
705
706Define the base foundry to store newly generated
707token information in.
708Defaults to C<base>.
709
710=item B<--data-file> <file>
711
712Define the file (without extension)
713to store primary data information in.
714Defaults to C<data>.
715
716=item B<--header-file> <file>
717
718Define the file name (without extension)
719to store header information on
720the corpus, document, and text level in.
721Defaults to C<header>.
722
Marc Kupietz985da0c2021-02-15 19:29:50 +0100723=item B<--use-tokenizer-sentence-splits|-s>
724
Replace existing sentence boundary information (or add it, if none exists)
with the sentence splits provided by the KorAP tokenizer. Currently this
is only supported in combination with C<--tokenizer-korap>.
727
Akron91705d72021-02-19 10:59:45 +0100728=item B<--tokens-file> <file>
729
730Define the file (without extension)
731to store generated token information in
732(either from the KorAP tokenizer or an externally called tokenizer).
733Defaults to C<tokens>.
734
Akron3378dfd2020-08-01 15:01:36 +0200735=item B<--log|-l>
736
737Loglevel for I<Log::Any>. Defaults to C<notice>.
738
Akrond949e182020-02-14 12:23:57 +0100739=back
740
Akronb3649472020-09-29 08:24:46 +0200741=head1 ENVIRONMENT VARIABLES
742
743=over 2
744
745=item B<KORAPXMLTEI_DEBUG>
746
747Activate minimal debugging.
748Defaults to C<false>.
749
750=item B<KORAPXMLTEI_INLINE>
751
752Process inline annotations, if present.
753Defaults to C<false>.
754
755=back
756
Akrond949e182020-02-14 12:23:57 +0100757=head1 COPYRIGHT AND LICENSE
758
Marc Kupietze955ecc2021-02-17 17:42:01 +0100759Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100760
761Author: Peter Harders
762
Akronaabd0952020-09-29 07:35:08 +0200763Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100764
765L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
766Corpus Analysis Platform at the
767L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
768member of the
769L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
770
771This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100772L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100773
774=cut
Akronf8088e62021-02-18 16:18:59 +0100775
776# NOTES
777
778## Notes on how 'XML::CompactTree::XS' works
779
780Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
781
782Print out name of 'node2' for the above example:
783
784echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
785
786Exploring the structure of $data ( = reference to below array ):
787
788[ 0: XML_READER_TYPE_DOCUMENT,
789 1: ?
Akron91577922021-02-19 10:32:54 +0100790 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100791 1: 'node'
792 2: ?
793 3: HASH (attributes)
794 4: 1 (line number)
795 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
796 1: 'node1'
797 2: ?
798 3: undefined (no attributes)
799 4: 1 (line number)
800 5: [ 0: [ 0: XML_READER_TYPE_TEXT
801 1: 'some '
802 ]
803 1: [ 0: XML_READER_TYPE_ELEMENT
804 1: 'n'
805 2: ?
806 3: undefined (no attributes)
807 4: 1 (line number)
808 5: undefined (no child-nodes)
809 ]
810 2: [ 0: XML_READER_TYPE_TEXT
811 1: ' text'
812 ]
813 ]
814 ]
815 1: [ 0: XML_READER_TYPE_ELEMENT
816 1: 'node2'
817 2: ?
818 3: undefined (no attributes)
819 4: 1 (line number)
820 5: [ 0: [ 0: XML_READER_TYPE_TEXT
821 1: 'more-text'
822 ]
823 ]
824 ]
825 ]
826 ]
827 ]
828]
829
830$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
831
832ref($data->[2]) == ARRAY (with 1 element for 'node')
833ref($data->[2]->[0]) == ARRAY (with 6 elements)
834
835$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
836$data->[2]->[0]->[1] == 'node'
837ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
838$data->[2]->[0]->[4] == 1 (line number)
839ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
840 # child-nodes of actual node (see $_IDX)
841
842ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
843$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
844$data->[2]->[0]->[5]->[0]->[1] == 'node1'
845$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
846$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
847ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
848
849ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
850$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
851$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
852
853ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
854$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
855$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
856$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
857$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
858$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
859
860ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
861$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
862$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
863
864
865retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
866Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
867${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
868
869
870## Notes on whitespace handling
871
872Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
873(see function 'retr_info()').
874
875Definition of significant and insignificant whitespace
876(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
877
878Significant whitespace is part of the document content and should be preserved.
879Insignificant whitespace is used when editing XML documents for readability.
880These whitespaces are typically not intended for inclusion in the delivery of the document.
881
882### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
883
884The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
885 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
886
887When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
888 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
889 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
890
891echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
892
893
894Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
895
896Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
897 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
898
899The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
900 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
901
902The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
903 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
904
905When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
906 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
907 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
908 the last read 'non-tag'-node has to be corrected (see [1]).
909
910For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
911 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
912
913[1]
914Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
915 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
916 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
917
918[2]
919Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is in both cases handled as a
920 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
921
922The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
923 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
924
925Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
926 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
927
928
929## Notes on whitespace fixing
930
931The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
932 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
933
934It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
935 example further down and notes on 'Input restrictions' in the manpage).
936
937Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
938
939Examples (how primary text with linebreaks would be converted by below code):
940
941 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
942 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
943
944Blanks are inserted before the 1st character:
945
946 NOTE: not stringent ('...' stands for text):
947
948 beg1............................end1 => no blank before 'beg1'
949 beg2....<pb/>...................end2 => no blank before 'beg2'
950 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
951 beg4....<test>ok</test>.........end4 => blank before 'beg4'
952
953 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
954 ^
955 |_blank between 'end3' and 'beg4'
956
957
958## Notes on segfault prevention
959
Akron91577922021-02-19 10:32:54 +0100960binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100961(see notes on 'PerlIO layers' in 'man XML::LibXML').
962Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
963see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
964see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.