blob: 784ed1a0af7a126c36cc782c07e29eddd1fc28a4 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
# The KorAP tokenizer is an optional dependency: try to load it, but
# carry on silently if loading fails. The module is only used when the
# script is called with --tokenizer-korap.
eval {
  require KorAP::XML::TEI::Tokenizer::KorAP;
  1;
};

our $VERSION = '1.00';

# Banner printed by --help and --version
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set the environment variable KORAPXMLTEI_DEBUG to a true value
# for minimal more debug output (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Hardersd892a582020-02-12 15:45:22 +010042
# Parse options from the command line.
# Each option is bound to a file-scoped lexical that is declared inline,
# together with its default value where one exists.
GetOptions(
  # name of root directory inside zip file
  'root|r=s' => \(my $_root_dir = '.'),

  # input file (yet only TEI I5 Format accepted)
  'input|i=s' => \(my $input_fname = ''),

  # Temporary argument for testing purposes
  'tokenizer-call|tc=s' => \(my $tokenizer_call),

  # use KorAP-tokenizer
  'tokenizer-korap|tk' => \(my $tokenizer_korap),

  # use intern tokenization (default = no)
  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT),

  # use KorAP tokenizer to split s (default=no)
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),

  # <foundry>#<file> pairs for inline tokens and inline structures
  'inline-tokens=s'     => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),

  # foundry and file names (without extension) used for the output
  'base-foundry=s' => \(my $_tok_dir = 'base'),
  'data-file=s'    => \(my $_data_file = 'data'),
  'header-file=s'  => \(my $_header_file = 'header'),
  'tokens-file=s'  => \(my $_tok_file_ext = 'tokens'),

  # log level handed to Log::Any
  'log|l=s' => \(my $log_level = 'notice'),

  # print the selected POD sections
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # print the version banner
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
74
# Establish logger
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;

# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# Sentence splitting is delegated to the KorAP tokenizer,
# so it cannot be requested without it
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
};

# External tokenizer (optional): either an arbitrary command
# or the KorAP tokenizer. --tokenizer-call takes precedence.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is loaded on a best-effort basis at startup,
  # so report a readable error if it is not available.
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal(
      'KorAP tokenizer requested, but ' .
      'KorAP::XML::TEI::Tokenizer::KorAP could not be loaded'
    );
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};


#
# ~~~ constants ~~~
#


## intern tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;

# Processing of ${_TOKENS_TAG}'s - on/off (default: 1)
my $_TOKENS_PROC = 1;

# Name of the directory and the file containing all inline structure
# information, except for $_TOKENS_TAG information.
# The option value has the form <foundry>#<file>; a missing or empty
# file part falls back to 'structure'.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures;
$_structure_dir //= '';
$_structure_file = 'structure' unless length($_structure_file // '');

# Name of the directory and the file containing all inline token
# information, i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set.
# A missing or empty file part falls back to 'morpho'.
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens;
$_tokens_dir //= '';
$_tokens_file = 'morpho' unless length($_tokens_file // '');

# name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;

# Initialize Token- and Structure-Collector
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
Akron09e0b2c2020-07-28 15:57:01 +0200139
Peter Harders6f526a32020-06-29 21:44:41 +0200140
#
# ~~~ variables ~~~
#


my $input_fh; # input file handle (default: stdin)

my $dir;      # text directory (below $_root_dir)

my ( $text_id,
     $text_id_esc ); # '$text_id_esc' = escaped version of $text_id

# these are only used inside recursive function 'retr_info'
my ( $_IDX,    # index of the array of child elements inside an element of
               # $tree_data (depends on DEBUG, see initialization below)
     $e,       # element from $tree_data
     ## variables for handling ~ whitespace related issue ~
     ## (it is sometimes necessary to correct the from-values for some tags)
     $add_one, # 1 after a text node, 0 after a whitespace node or at text start
     $fval,    # from-value of the annotation that is being closed
     %ws );    # hash for indices of whitespace-nodes (needed to correct from-values)
               # idea: when closing an element, check if its from-index minus 1
               #       refers to a whitespace-node
               #       (means: 'from-index - 1' is a key in %ws).
               #       if this is _not_ the case, then the from-value is one
               #       too high => correct it by subtracting 1

my $c; # index variable used in loops


#
# ~~~ main ~~~
#

# ~ initializations ~

# With DEBUG, line numbers are included in the elements of $tree_data
# (XCT_LINE_NUMBERS), which moves the child array from index 4 to index 5
$_IDX = DEBUG ? 5 : 4;

$fval = 0;

# ~ read input and write output (text by text) ~

my $tl = 0; # text line (needed for whitespace handling)

$input_fh = *STDIN; # input file handle (default: stdin)

# Maybe not necessary
$data->reset;

$dir = '';

# Read from the file given by --input instead of STDIN
if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened: $!");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

my $sfx;                 # remainder of the line after the opening text-body tag
my $pos;                 # position of the closing text-body tag within a line
my $input_enc = 'UTF-8'; # input encoding (may be overridden by the XML declaration)

# length of the closing text-body tag (e.g. '</text>')
my $l = length('</' . $_TEXT_BODY) + 1;
Peter Harders6f526a32020-06-29 21:44:41 +0200202
# ~ loop (reading input document) ~
#
# The input is read line by line. Three kinds of lines trigger an action:
#   - the XML declaration, which sets the input encoding,
#   - the opening tag of a text body: all lines up to the closing tag are
#     collected, parsed and written to the zip archive,
#   - the opening tag of a header, which is parsed and written to the
#     zip archive by KorAP::XML::TEI::Header.
# Any other line is ignored.

MAIN: while ( <$input_fh> ) {

  # remove HTML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding.
  # The opening quote is captured, so that the value ends at the
  # matching closing quote.
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/ ) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ) {

    # ~ start of text body ~

    # Save the captures before another match can overwrite them
    my $pfx = $1;
    $sfx = $2;

    # The opening tag has to be the only content of its line
    if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # The closing tag has to be the only content of its line:
        # check what remains when the tag is cut out
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was seen for this body => nowhere to write to.
        # NOTE(review): $data is reset before each text and is only filled
        # by retr_info() below, so the excerpt logged here is presumably
        # always empty - confirm whether $buf_in was intended.
        if ($dir eq '') {
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($data->data, 0, 200));
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$buf_in</text>",
          huge => 1
        );

        # See notes on whitespace handling
        my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

        # XCT_LINE_NUMBERS is only needed for debugging
        # (see XML::CompactTree::XS)
        $param |= XCT_LINE_NUMBERS if DEBUG;
        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

        $structures->reset;

        $tokens->reset if $_TOKENS_PROC;

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # ~ recursion ~
        retr_info(1, \$tree_data->[2] ); # parse input data

        $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml") if DEBUG;

        # ~ write data.xml ~
        $data->to_zip(
          $zipper->new_stream("$dir/${_data_file}.xml"),
          $text_id_esc
        );

        # ~ tokenization ~
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
            $text_id_esc
          );
        };

        if ($_GEN_TOK_INT) {

          # Tokenize and output (conservative and aggressive tokenizer)
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->reset;
          $cons_tok->reset;
        };

        # Let the external tokenizer add its sentence boundaries
        # to the structures, before they are written
        if ($use_tokenizer_sentence_splits) {
          $ext_tok->sentencize_from_previous_input($structures);
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        if ($_TOKENS_PROC && !$tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          );
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best, to keep the stripping of whitespace and to just
      #       remove the if-clause and to insert a blank by default (with possibly
      #       an option on how newlines in primary text should be handled
      #       (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more
      ###       than one line ($tl > 1)
      ###       do testing with 2 different corpora (one with only one-line texts,
      ###       the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        $tl++; # counter for text lines

        # insert blank before 1st character (for 2nd line and consecutive lines)
        s/^(.)/ $1/ if $tl > 1;
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    };

  } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # ~ start of header ~

    # Save the captures before another match can overwrite them
    my $pfx     = $1;
    my $content = "$2\n";

    # Nothing but whitespace may precede the opening header tag
    if ($pfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header (reads the remaining header lines from $input_fh)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while

$zipper->close;

$ext_tok->close if $ext_tok;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100421
Peter Hardersd892a582020-02-12 15:45:22 +0100422
# Recursively called function to handle XML tree data.
#
# Parameters:
#   $rl       - recursion level
#               (1 = topmost level inside retr_info() = should always be
#               level of tag $_TEXT_BODY)
#   $children - reference to an array reference of nodes, as created by
#               XML::CompactTree::XS
#
# The nodes are accessed as follows: [0] is the node type, [1] the tag
# name (element nodes) or the text (text and whitespace nodes), [3] the
# attribute array and [$_IDX] the array of child nodes.
#
# Element nodes become annotations in $structures (and in $tokens for
# $_TOKENS_TAG), text and whitespace nodes are appended to $data.
# Dies on any other node type and on inconsistent offsets.
#
# See notes on how 'XML::CompactTree::XS' works and
# see 'NODE TYPES' in manpage of XML::LibXML::Reader
sub retr_info {
  my ($rl, $children) = @_;

  # Iteration through all array elements
  for my $node (@{$$children}) {

    my $type = $node->[0];

    # Element node
    if ($type == XML_READER_TYPE_ELEMENT) {

      #~~~~
      # from here: tag-node (opening)
      #~~~~

      my $name = $node->[1];

      # Skip sentences: the element itself is dropped, because sentence
      # boundaries are taken from the tokenizer, but its children are
      # still processed
      if ($use_tokenizer_sentence_splits && $name eq "s") {
        retr_info($rl + 1, \$node->[$_IDX]) if defined $node->[$_IDX];
        next;
      }

      my $anno = $structures->add_new_annotation($name);

      # Add element also to token list
      $tokens->add_annotation($anno) if $_TOKENS_PROC && $name eq $_TOKENS_TAG;

      # Handle attributes (if attributes exist).
      # With 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
      # [ name1, value1, name2, value2, ....] of attribute names and
      # corresponding values.
      if (defined(my $attrs = $node->[3])) {
        for (my $i = 0; $i < @$attrs; $i += 2) {

          # '$i' references the 'key' and '$i+1' the 'value'
          $anno->add_attribute(@{$attrs}[$i, $i + 1]);
        };
      };

      # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);

      #~~~~
      # until here: tag-node (opening)
      #~~~~


      # Call function recursively with the array of child-nodes.
      # Do no recursion, if $node->[$_IDX] is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      retr_info($rl + 1, \$node->[$_IDX]) if defined $node->[$_IDX];


      #~~~~~
      # from here: tag-node (closing)
      #~~~~~

      # NOTE: the offsets are _between_ the characters
      # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $to = $data->position;

      # Handle structures and tokens

      my $from = $anno->from;

      # ~ whitespace related issue ~
      # Is the position right before the from-value a whitespace-node?
      my $ws_before = $from > 0 && exists $ws{$from - 1};

      if ($from > 0 && !$ws_before) {

        # ~ previous node was a text-node ~
        $anno->set_from($from - 1);
      }

      # in case this fails, check input
      if (($from - 1) > $to) {
        die $log->fatal("text_id='$text_id', " .
                          "processing of structures: " .
                          "from-value ($from) is 2 or more greater " .
                          "than to-value ($to) => please check. Aborting");
      };

      # TODO: find example for which this case applies
      #  maybe this is not necessary anymore, because the above
      #  correction of the from-value suffices
      #
      # TODO: check, if it's better to remove this line and change above
      #  check to 'if ($from - 1) >= $to;
      #  do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($to) if $from == $to + 1;
      $anno->set_to($to);
      $anno->set_level($rl);

      # Clean up whitespace
      delete $ws{$from - 1} if $ws_before;

      #~~~~
      # until here: tag-node (closing)
      #~~~~
    }

    # Text node
    elsif ($type == XML_READER_TYPE_TEXT) {

      $add_one = 1;
      $data->append($node->[1]);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node->[1]);
    }

    # not yet handled type
    else {

      die $log->fatal('Not yet handled type ($e->[0]=' . $type . ') ... => Aborting');
    };
  };
}
563
Peter Harders6f526a32020-06-29 21:44:41 +0200564
Akrond949e182020-02-14 12:23:57 +0100565__END__
566
567=pod
568
569=encoding utf8
570
571=head1 NAME
572
573tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
574
575=head1 SYNOPSIS
576
577 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
578
579=head1 DESCRIPTION
580
Akronee434b12020-07-08 12:53:01 +0200581C<tei2korapxml> is a script to convert TEI P5 and
582L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
583based documents to the
584L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
585If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100586read from C<STDIN>. If no specific output is defined, data is written
587to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200588
Akrond949e182020-02-14 12:23:57 +0100589This program is usually called from inside another script.
590
Akronee434b12020-07-08 12:53:01 +0200591=head1 FORMATS
592
593=head2 Input restrictions
594
595=over 2
596
597=item
598
Akronee434b12020-07-08 12:53:01 +0200599TEI P5 formatted input with certain restrictions:
600
601=over 4
602
603=item
604
605B<mandatory>: text-header with integrated textsigle, text-body
606
607=item
608
609B<optional>: corp-header with integrated corpsigle,
610doc-header with integrated docsigle
611
612=back
613
614=item
615
Tokens inside the primary text must not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~>).
623
624=back
625
626=head2 Notes on the output
627
628=over 2
629
630=item
631
632zip file output (default on C<stdout>) with utf8 encoded entries
633(which together form the KorAP-XML format)
634
635=back
636
Akrond949e182020-02-14 12:23:57 +0100637=head1 INSTALLATION
638
639C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
640these bindings are available, the preferred way to install the script is
641to use L<cpanm|App::cpanminus>.
642
643 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
644
645In case everything went well, the C<tei2korapxml> tool will
646be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200647
Akrond949e182020-02-14 12:23:57 +0100648Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
649
650=head1 OPTIONS
651
652=over 2
653
Akron4e603a52020-07-27 14:23:49 +0200654=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100655
Akron4e603a52020-07-27 14:23:49 +0200656The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100657
658=item B<--help|-h>
659
660Print help information.
661
662=item B<--version|-v>
663
664Print version information.
665
Akron4e603a52020-07-27 14:23:49 +0200666=item B<--tokenizer-call|-tc>
667
668Call an external tokenizer process, that will tokenize
669a single line from STDIN and outputs one token per line.
670
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200671=item B<--tokenizer-korap|-tk>
672
673Use the standard KorAP/DeReKo tokenizer.
674
Akron6d7b8e42020-09-29 07:37:41 +0200675=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200676
Tokenize the data using two embedded tokenizers,
that will take an I<aggressive> and a I<conservative>
approach.
680
Akron1a5271a2021-02-18 13:18:15 +0100681=item B<--inline-tokens> <foundry>#[<file>]
682
683Define the foundry and file (without extension)
684to store inline token information in.
685If L</KORAPXMLTEI_INLINE> is set, this will contain
686annotations as well.
687Defaults to C<tokens> and C<morpho>.
688
Akrondd0be8f2021-02-18 19:29:41 +0100689=item B<--inline-structures> <foundry>#[<file>]
690
691Define the foundry and file (without extension)
692to store inline structure information in.
Defaults to C<struct> and C<structure>.
694
Akron26a71522021-02-19 10:27:37 +0100695=item B<--base-foundry> <foundry>
696
697Define the base foundry to store newly generated
698token information in.
699Defaults to C<base>.
700
701=item B<--data-file> <file>
702
703Define the file (without extension)
704to store primary data information in.
705Defaults to C<data>.
706
707=item B<--header-file> <file>
708
709Define the file name (without extension)
710to store header information on
711the corpus, document, and text level in.
712Defaults to C<header>.
713
Marc Kupietz985da0c2021-02-15 19:29:50 +0100714=item B<--use-tokenizer-sentence-splits|-s>
715
Replace existing sentence boundary information with, or add new
sentence boundary information from, the KorAP tokenizer
(currently the only tokenizer for which this is supported).
718
Akron91705d72021-02-19 10:59:45 +0100719=item B<--tokens-file> <file>
720
721Define the file (without extension)
722to store generated token information in
723(either from the KorAP tokenizer or an externally called tokenizer).
724Defaults to C<tokens>.
725
Akron3378dfd2020-08-01 15:01:36 +0200726=item B<--log|-l>
727
728Loglevel for I<Log::Any>. Defaults to C<notice>.
729
Akrond949e182020-02-14 12:23:57 +0100730=back
731
Akronb3649472020-09-29 08:24:46 +0200732=head1 ENVIRONMENT VARIABLES
733
734=over 2
735
736=item B<KORAPXMLTEI_DEBUG>
737
738Activate minimal debugging.
739Defaults to C<false>.
740
741=item B<KORAPXMLTEI_INLINE>
742
743Process inline annotations, if present.
744Defaults to C<false>.
745
746=back
747
Akrond949e182020-02-14 12:23:57 +0100748=head1 COPYRIGHT AND LICENSE
749
Marc Kupietze955ecc2021-02-17 17:42:01 +0100750Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100751
752Author: Peter Harders
753
Akronaabd0952020-09-29 07:35:08 +0200754Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100755
756L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
757Corpus Analysis Platform at the
758L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
759member of the
760L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
761
762This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100763L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100764
765=cut
Akronf8088e62021-02-18 16:18:59 +0100766
767# NOTES
768
769## Notes on how 'XML::CompactTree::XS' works
770
771Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
772
773Print out name of 'node2' for the above example:
774
775echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
776
777Exploring the structure of $data ( = reference to below array ):
778
779[ 0: XML_READER_TYPE_DOCUMENT,
780 1: ?
Akron91577922021-02-19 10:32:54 +0100781 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100782 1: 'node'
783 2: ?
784 3: HASH (attributes)
785 4: 1 (line number)
786 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
787 1: 'node1'
788 2: ?
789 3: undefined (no attributes)
790 4: 1 (line number)
791 5: [ 0: [ 0: XML_READER_TYPE_TEXT
792 1: 'some '
793 ]
794 1: [ 0: XML_READER_TYPE_ELEMENT
795 1: 'n'
796 2: ?
797 3: undefined (no attributes)
798 4: 1 (line number)
799 5: undefined (no child-nodes)
800 ]
801 2: [ 0: XML_READER_TYPE_TEXT
802 1: ' text'
803 ]
804 ]
805 ]
806 1: [ 0: XML_READER_TYPE_ELEMENT
807 1: 'node2'
808 2: ?
809 3: undefined (no attributes)
810 4: 1 (line number)
811 5: [ 0: [ 0: XML_READER_TYPE_TEXT
812 1: 'more-text'
813 ]
814 ]
815 ]
816 ]
817 ]
818 ]
819]
820
821$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
822
823ref($data->[2]) == ARRAY (with 1 element for 'node')
824ref($data->[2]->[0]) == ARRAY (with 6 elements)
825
826$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
827$data->[2]->[0]->[1] == 'node'
828ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
829$data->[2]->[0]->[4] == 1 (line number)
830ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
831 # child-nodes of actual node (see $_IDX)
832
833ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
834$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
835$data->[2]->[0]->[5]->[0]->[1] == 'node1'
836$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
837$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
838ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
839
840ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
841$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
842$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
843
844ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
845$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
846$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
847$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
848$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
849$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
850
851ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
852$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
853$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
854
855
856retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
857Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
858${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
859
860
861## Notes on whitespace handling
862
863Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
864(see function 'retr_info()').
865
866Definition of significant and insignificant whitespace
867(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
868
869Significant whitespace is part of the document content and should be preserved.
870Insignificant whitespace is used when editing XML documents for readability.
871These whitespaces are typically not intended for inclusion in the delivery of the document.
872
873### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
874
875The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
876 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
877
878When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
879 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
880 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
881
882echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
883
884
885Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
886
887Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
888 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
889
890The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
891 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
892
893The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
894 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
895
896When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
897 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
898 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
899 the last read 'non-tag'-node has to be corrected (see [1]).
900
901For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
902 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
903
904[1]
905Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
906 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
907 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
908
909[2]
910In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', the ' ' is handled as a
911 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
912
913The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
914 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
915
916Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
917 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
918
919
920## Notes on whitespace fixing
921
922The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
923 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
924
925It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
926 example further down and notes on 'Input restrictions' in the manpage).
927
928Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
929
930Examples (how primary text with linebreaks would be converted by below code):
931
932 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
933 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
934
935Blanks are inserted before the 1st character:
936
937 NOTE: not stringent ('...' stands for text):
938
939 beg1............................end1 => no blank before 'beg1'
940 beg2....<pb/>...................end2 => no blank before 'beg2'
941 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
942 beg4....<test>ok</test>.........end4 => blank before 'beg4'
943
944 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
945 ^
946 |_blank between 'end3' and 'beg4'
947
948
949## Notes on segfault prevention
950
Akron91577922021-02-19 10:32:54 +0100951binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100952(see notes on 'PerlIO layers' in 'man XML::LibXML'),
953removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
954See perluniintro: "You can switch encodings on an already opened stream by using binmode()."
955See perlfunc: "If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data."