blob: b307ceb20f6ecf6f03e9150452f79770b82b7c37 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
our $VERSION = '1.00';

# Banner shown together with the --help and --version output
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Minimal extra debug output; switched on through the environment
# variable KORAPXMLTEI_DEBUG (deliberately not a command line option)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Callback for --help: hand the main POD sections to pod2usage
my $print_help = sub {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  )
};

# Callback for --version: hand only the version banner to pod2usage
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  )
};

# Parse options from the command line.
# Every option stores its value in a file-scoped lexical declared in place.
GetOptions(

  # Name of the root directory inside the zip file
  'root|r=s' => \(my $_root_dir = '.'),

  # Input file (so far only the TEI I5 format is accepted)
  'input|i=s' => \(my $input_fname = ''),

  # Temporary argument for testing purposes
  'tokenizer-call|tc=s' => \(my $tokenizer_call),

  # Use the KorAP tokenizer
  'tokenizer-korap|tk' => \(my $tokenizer_korap),

  # Use the internal tokenization (default = no)
  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT),

  # Use the KorAP tokenizer to split sentences (default = no)
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),

  # Foundry and file names, each with its default
  'inline-tokens=s'     => \(my $inline_tokens     = 'tokens#morpho'),
  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
  'base-foundry=s'      => \(my $_tok_dir          = 'base'),
  'data-file=s'         => \(my $_data_file        = 'data'),
  'header-file=s'       => \(my $_header_file      = 'header'),
  'tokens-file=s'       => \(my $_tok_file_ext     = 'tokens'),

  # Log level handed to the Log::Any adapter
  'log|l=s' => \(my $log_level = 'notice'),

  'help|h'    => $print_help,
  'version|v' => $print_version
);

# Log messages go to STDERR, encoded as UTF-8
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;
79
#
# ~~~ parameter (mandatory) ~~~
#
# Tag (without attributes) that contains the primary text
my $_TEXT_BODY = 'text';

# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# Sentence splitting is only offered together with the KorAP tokenizer
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
}

# Optional external tokenizer.
# An explicit --tokenizer-call takes precedence over --tokenizer-korap.
# NOTE(review): combining -tc, -tk and -s selects the generic external
# tokenizer although sentence splits were requested - confirm this is intended.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is required inside a silent eval at the top
  # of this file, so it may be missing here. Fail with a clear message
  # instead of an obscure "Can't locate object method" error.
  unless ('KorAP::XML::TEI::Tokenizer::KorAP'->can('new')) {
    die $log->fatal(
      'KorAP tokenizer is not available ' .
      '(KorAP::XML::TEI::Tokenizer::KorAP could not be loaded)'
    );
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};


#
# ~~~ constants ~~~
#

# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;

# TODO: optional (different annotation tools can produce more zip-files
# for feeding into KorAP-XML-Krill)
# on/off: processing of elements named $_TOKENS_TAG (default: 1)
my $_TOKENS_PROC = 1;


# Name of the directory and the file containing all inline structure
# information, except for $_TOKENS_TAG information.
# The option value has the form '<foundry>#<file>'; the appended '#structure'
# supplies the file name when the option value contains no '#'.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
$_structure_file .= '.xml';


# Name of the directory and the file containing all inline token
# information, i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set.
# The appended '#morpho' supplies the file name when the option value
# contains no '#'.
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
$_tokens_file .= '.xml';

# Name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = "w";

# Handling inline annotations (inside $_TOKENS_TAG),
# switched on through the environment variable KORAPXMLTEI_INLINE
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
Akron09e0b2c2020-07-28 15:57:01 +0200134
Peter Harders6f526a32020-06-29 21:44:41 +0200135
136#
137# ~~~ variables ~~~
138#
139
# Collectors for token and structure annotations
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Collector for the primary data
my $data = KorAP::XML::TEI::Data->new;

# Zipper writing below the configured root directory
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);

# Input file handle (default: stdin)
my $input_fh = *STDIN;

# Text directory (below $_root_dir); empty as long as no text header was read
my $dir = '';

# Id of the current text and its escaped version
my $text_id;
my $text_id_esc;

# The following variables are only used inside the recursive function 'retr_info'

# Index of the array of child elements inside an element of $tree_data.
# With DEBUG, line numbers are included in the elements of $tree_data,
# which moves the child array from index 4 to index 5.
my $_IDX = DEBUG ? 5 : 4;

# Element from $tree_data
my $e;

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $fval = 0;

# Hash for indices of whitespace-nodes (needed to recorrect from-values).
# Idea: when closing an element, check if its from-index minus 1 refers to a
# whitespace-node (means: 'from-index - 1' is a key in %ws).
# If this is _not_ the case, then the from-value is one too high
# => correct it by subtracting 1
my %ws;

# Index variable used in loops
my $c;


#
# ~~~ main ~~~
#

# ~ read input and write output (text by text) ~

# Text line counter (needed for whitespace handling)
my $tl = 0;

# Maybe not necessary
$data->reset;

# Read from the given file instead of stdin.
# $input_fh already holds the STDIN glob, which open() reuses as the handle.
if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

# Remainder of the line behind the opening text-body tag
my $sfx;

# Position of the closing text-body tag inside the current line
my $pos;

# Input encoding; replaced by the value of the XML declaration, if present
my $input_enc = 'UTF-8';

# Length of the closing text-body tag including the final '>'
my $l = length('</' . $_TEXT_BODY) + 1;
Peter Harders6f526a32020-06-29 21:44:41 +0200207
Akron347be812020-09-29 07:52:52 +0200208# ~ loop (reading input document) ~
Peter Harders6f526a32020-06-29 21:44:41 +0200209
# ~ loop (reading input document) ~
#
# The input is read line by line. A line with an opening header tag starts
# header parsing; a line with the opening text-body tag starts the collection
# of the text body, which is converted and written to the zip stream as soon
# as the closing text-body tag is found.
MAIN: while ( <$input_fh> ) {

  # Remove (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding from the XML declaration.
  # The value is matched as a run of non-quote characters: inside a
  # character class '\1' is not a backreference, so '[^\1]' would not
  # express "anything but the opening quote".
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^'"]+)\1/ ) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ) {

    # ~ start of text body ~

    # Keep $2 before the next match is evaluated
    $sfx = $2;

    if ($1 !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from the input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # Write data.xml, structure.xml and possibly morpho.xml
        # and/or tokenization files

        # Nothing but whitespace is allowed around the closing tag
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was seen for this text body.
        # The primary data collector is still empty at this point (it is
        # only filled by retr_info below), so report the beginning of the
        # collected text body instead.
        if ($dir eq '') {
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($buf_in, 0, 200));
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$buf_in</text>",
          huge => 1
        );

        # See notes on whitespace handling
        my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

        # XCT_LINE_NUMBERS is only needed for debugging
        # (see XML::CompactTree::XS)
        $param |= XCT_LINE_NUMBERS if DEBUG;
        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

        $structures->reset;

        $tokens->reset if $_TOKENS_PROC;

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # ~ recursion ~
        # Parse input data; fills $data, $structures and $tokens
        retr_info(1, \$tree_data->[2] );

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
        };

        # ~ write data.xml ~
        $data->to_zip(
          $zipper->new_stream("$dir/${_data_file}.xml"),
          $text_id_esc
        );

        # ~ tokenization ~
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
            $text_id_esc
          );
        };

        if ($_GEN_TOK_INT) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->reset;
          $cons_tok->reset;
        };

        # Let the external tokenizer add sentence boundaries to the structures
        if ($use_tokenizer_sentence_splits) {
          $ext_tok->sentencize_from_previous_input($structures);
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        if ($_TOKENS_PROC && !$tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          );
        };

        # Reinitialize for the next text
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best to keep the stripping of whitespace, to remove
      #   the if-clause and to insert a blank by default (possibly with an
      #   option on how newlines in primary text should be handled
      #   (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ###   do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)

      # Line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Counter for text lines
        $tl++;

        # Insert blank before 1st character (for 2nd line and consecutive lines)
        s/^(.)/ $1/ if $tl > 1;
      }
      ###

      # Add line to buffer
      $buf_in .= $_;
    };

  } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # ~ start of header ~
    my $content = "$2\n";

    if ($1 !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header (continues reading from the input file handle)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # Log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        # Reset (needed for ~ whitespace handling ~)
        $tl = 0;
      }
    }
  }
} #end: while

$zipper->close;

$ext_tok->close if $ext_tok;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100426
Peter Hardersd892a582020-02-12 15:45:22 +0100427
# Recursively called function to handle XML tree data.
#
# Walks a list of nodes as produced by XML::CompactTree::XS and
#   - appends the content of text and whitespace nodes to $data,
#   - adds an annotation to $structures for every element (and to $tokens
#     as well, if the element is named $_TOKENS_TAG and $_TOKENS_PROC is set),
#     including its attributes, from/to offsets and nesting level.
#
# Parameters:
#   1. recursion level (1 = topmost level inside retr_info() = should always
#      be the level of tag $_TEXT_BODY)
#   2. reference to an array reference holding the nodes to process
#
# Shared state: reads $_IDX, $use_tokenizer_sentence_splits, $_TOKENS_PROC,
# $_TOKENS_TAG and $text_id; reads and modifies $add_one and %ws.
# The loop variable, the attribute index and the from-value are local to each
# call, so nested invocations cannot interfere with each other.
#
# Dies on a node type that is not handled and on an inconsistent from-value.
sub retr_info {
  my $rl = shift;

  # Iteration through all array elements
  # ($_[0] is a reference to an array reference)
  # See notes on how 'XML::CompactTree::XS' works and
  # see 'NODE TYPES' in manpage of XML::LibXML::Reader
  foreach my $node (@{${$_[0]}}) {

    # Element node
    if ($node->[0] == XML_READER_TYPE_ELEMENT) {

      #~~~~
      # from here: tag-node (opening)
      #~~~~

      # $node->[1] represents the tag name

      # Skip sentences (but still process their children), because
      # sentence boundaries are taken from the tokenizer in this mode
      if ($use_tokenizer_sentence_splits && $node->[1] eq "s") {
        if (defined $node->[$_IDX]) {
          retr_info($rl+1, \$node->[$_IDX]);
        }
        next;
      }

      my $anno = $structures->add_new_annotation($node->[1]);

      # Add element also to token list
      if ($_TOKENS_PROC && $node->[1] eq $_TOKENS_TAG) {
        $tokens->add_annotation($anno);
      };

      # Handle attributes (if attributes exist)
      if (defined $node->[3]) {

        # With 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
        # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
        my $attrs = $node->[3];

        # '$i' references the 'key' and '$i+1' the 'value'
        for (my $i = 0; $i < @{$attrs}; $i += 2) {
          $anno->add_attribute(
            @{$attrs}[$i, $i + 1]
          );
        };
      };

      # This is where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);

      #~~~~
      # until here: tag-node (opening)
      #~~~~


      # Call function recursively.
      # Do no recursion, if $node->[$_IDX] is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      if (defined $node->[$_IDX]) {

        # Recursion with array of child-nodes
        retr_info($rl+1, \$node->[$_IDX]);
      }


      #~~~~~
      # from here: tag-node (closing)
      #~~~~~

      # NOTE: use $pos, because the offsets are _between_ the characters
      # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $pos = $data->position;

      # Handle structures and tokens

      # From-value as set when the element was opened
      my $from = $anno->from;

      # ~ whitespace related issue ~
      if ($from > 0 && not exists $ws{$from - 1}) {

        # ~ previous node was a text-node ~
        $anno->set_from($from - 1);
      }

      # In case this fails, check input
      if (($from - 1) > $pos) {
        die $log->fatal("text_id='$text_id', " .
                          "processing of structures: " .
                          "from-value ($from) is 2 or more greater " .
                          "than to-value ($pos) => please check. Aborting");
      };

      # TODO: find example for which this case applies
      #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
      #
      # TODO: check, if it's better to remove this line and change above check to 'if ($from - 1) >= $pos;
      #   do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($pos) if $from == $pos + 1;
      $anno->set_to($pos);
      $anno->set_level($rl);

      # Clean up whitespace
      delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};

      #~~~~
      # until here: tag-node (closing)
      #~~~~
    }

    # Text node
    elsif ($node->[0] == XML_READER_TYPE_TEXT) {

      $add_one = 1;
      $data->append($node->[1]);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($node->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # State that this from-index belongs to a whitespace-node
      # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node->[1]);
    }

    # Not yet handled type
    else {

      die $log->fatal('Not yet handled type ($e->[0]=' . $node->[0] . ') ... => Aborting');
    };
  };
};
568
Peter Harders6f526a32020-06-29 21:44:41 +0200569
Akrond949e182020-02-14 12:23:57 +0100570__END__
571
572=pod
573
574=encoding utf8
575
576=head1 NAME
577
578tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
579
580=head1 SYNOPSIS
581
582 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
583
584=head1 DESCRIPTION
585
Akronee434b12020-07-08 12:53:01 +0200586C<tei2korapxml> is a script to convert TEI P5 and
587L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
588based documents to the
589L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
590If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100591read from C<STDIN>. If no specific output is defined, data is written
592to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200593
Akrond949e182020-02-14 12:23:57 +0100594This program is usually called from inside another script.
595
Akronee434b12020-07-08 12:53:01 +0200596=head1 FORMATS
597
598=head2 Input restrictions
599
600=over 2
601
602=item
603
Akronee434b12020-07-08 12:53:01 +0200604TEI P5 formatted input with certain restrictions:
605
606=over 4
607
608=item
609
610B<mandatory>: text-header with integrated textsigle, text-body
611
612=item
613
614B<optional>: corp-header with integrated corpsigle,
615doc-header with integrated docsigle
616
617=back
618
619=item
620
Tokens inside the primary text must not be
separated by newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~>).
628
629=back
630
631=head2 Notes on the output
632
633=over 2
634
635=item
636
637zip file output (default on C<stdout>) with utf8 encoded entries
638(which together form the KorAP-XML format)
639
640=back
641
Akrond949e182020-02-14 12:23:57 +0100642=head1 INSTALLATION
643
644C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
645these bindings are available, the preferred way to install the script is
646to use L<cpanm|App::cpanminus>.
647
648 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
649
650In case everything went well, the C<tei2korapxml> tool will
651be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200652
Akrond949e182020-02-14 12:23:57 +0100653Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
654
655=head1 OPTIONS
656
657=over 2
658
Akron4e603a52020-07-27 14:23:49 +0200659=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100660
Akron4e603a52020-07-27 14:23:49 +0200661The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100662
663=item B<--help|-h>
664
665Print help information.
666
667=item B<--version|-v>
668
669Print version information.
670
Akron4e603a52020-07-27 14:23:49 +0200671=item B<--tokenizer-call|-tc>
672
Call an external tokenizer process that tokenizes
a single line from STDIN and outputs one token per line.
675
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200676=item B<--tokenizer-korap|-tk>
677
678Use the standard KorAP/DeReKo tokenizer.
679
Akron6d7b8e42020-09-29 07:37:41 +0200680=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200681
Tokenize the data using two embedded tokenizers
that take an I<aggressive> and a I<conservative>
approach.
685
Akron1a5271a2021-02-18 13:18:15 +0100686=item B<--inline-tokens> <foundry>#[<file>]
687
688Define the foundry and file (without extension)
689to store inline token information in.
690If L</KORAPXMLTEI_INLINE> is set, this will contain
691annotations as well.
692Defaults to C<tokens> and C<morpho>.
693
Akrondd0be8f2021-02-18 19:29:41 +0100694=item B<--inline-structures> <foundry>#[<file>]
695
696Define the foundry and file (without extension)
697to store inline structure information in.
Defaults to C<struct> and C<structure>.
699
Akron26a71522021-02-19 10:27:37 +0100700=item B<--base-foundry> <foundry>
701
702Define the base foundry to store newly generated
703token information in.
704Defaults to C<base>.
705
706=item B<--data-file> <file>
707
708Define the file (without extension)
709to store primary data information in.
710Defaults to C<data>.
711
712=item B<--header-file> <file>
713
714Define the file name (without extension)
715to store header information on
716the corpus, document, and text level in.
717Defaults to C<header>.
718
Marc Kupietz985da0c2021-02-15 19:29:50 +0100719=item B<--use-tokenizer-sentence-splits|-s>
720
Replace existing sentence boundary information with, or add new,
sentence boundary information provided by the tokenizer
(currently only supported by the KorAP tokenizer).
723
Akron91705d72021-02-19 10:59:45 +0100724=item B<--tokens-file> <file>
725
726Define the file (without extension)
727to store generated token information in
728(either from the KorAP tokenizer or an externally called tokenizer).
729Defaults to C<tokens>.
730
Akron3378dfd2020-08-01 15:01:36 +0200731=item B<--log|-l>
732
733Loglevel for I<Log::Any>. Defaults to C<notice>.
734
Akrond949e182020-02-14 12:23:57 +0100735=back
736
Akronb3649472020-09-29 08:24:46 +0200737=head1 ENVIRONMENT VARIABLES
738
739=over 2
740
741=item B<KORAPXMLTEI_DEBUG>
742
743Activate minimal debugging.
744Defaults to C<false>.
745
746=item B<KORAPXMLTEI_INLINE>
747
748Process inline annotations, if present.
749Defaults to C<false>.
750
751=back
752
Akrond949e182020-02-14 12:23:57 +0100753=head1 COPYRIGHT AND LICENSE
754
Marc Kupietze955ecc2021-02-17 17:42:01 +0100755Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100756
757Author: Peter Harders
758
Akronaabd0952020-09-29 07:35:08 +0200759Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100760
761L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
762Corpus Analysis Platform at the
763L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
764member of the
765L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
766
767This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100768L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100769
770=cut
Akronf8088e62021-02-18 16:18:59 +0100771
772# NOTES
773
774## Notes on how 'XML::CompactTree::XS' works
775
776Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
777
778Print out name of 'node2' for the above example:
779
780echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
781
782Exploring the structure of $data ( = reference to below array ):
783
784[ 0: XML_READER_TYPE_DOCUMENT,
785 1: ?
Akron91577922021-02-19 10:32:54 +0100786 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100787 1: 'node'
788 2: ?
789 3: HASH (attributes)
790 4: 1 (line number)
791 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
792 1: 'node1'
793 2: ?
794 3: undefined (no attributes)
795 4: 1 (line number)
796 5: [ 0: [ 0: XML_READER_TYPE_TEXT
797 1: 'some '
798 ]
799 1: [ 0: XML_READER_TYPE_ELEMENT
800 1: 'n'
801 2: ?
802 3: undefined (no attributes)
803 4: 1 (line number)
804 5: undefined (no child-nodes)
805 ]
806 2: [ 0: XML_READER_TYPE_TEXT
807 1: ' text'
808 ]
809 ]
810 ]
811 1: [ 0: XML_READER_TYPE_ELEMENT
812 1: 'node2'
813 2: ?
814 3: undefined (no attributes)
815 4: 1 (line number)
816 5: [ 0: [ 0: XML_READER_TYPE_TEXT
817 1: 'more-text'
818 ]
819 ]
820 ]
821 ]
822 ]
823 ]
824]
825
826$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
827
828ref($data->[2]) == ARRAY (with 1 element for 'node')
829ref($data->[2]->[0]) == ARRAY (with 6 elements)
830
831$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
832$data->[2]->[0]->[1] == 'node'
833ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
834$data->[2]->[0]->[4] == 1 (line number)
835ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
836 # child-nodes of actual node (see $_IDX)
837
838ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
839$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
840$data->[2]->[0]->[5]->[0]->[1] == 'node1'
841$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
842$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
843ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
844
845ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
846$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
847$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
848
849ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
850$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
851$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
852$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
853$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
854$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
855
856ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
857$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
858$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
859
860
861retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
862Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
863${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
864
865
866## Notes on whitespace handling
867
868Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
869(see function 'retr_info()').
870
871Definition of significant and insignificant whitespace
872(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
873
874Significant whitespace is part of the document content and should be preserved.
875Insignificant whitespace is used when editing XML documents for readability.
876These whitespaces are typically not intended for inclusion in the delivery of the document.
877
878### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
879
880The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
881 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
882
883When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
884 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
885 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
886
887echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
888
889
890Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
891
892Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
893 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
894
895The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
896 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
897
898The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
899 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
900
901When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
902 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
903 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
904 the last read 'non-tag'-node has to be corrected (see [1]).
905
906For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
907 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
908
909[1]
910Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
911 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
912 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
913
914[2]
915In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled as a
916 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
917
918The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
919 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
920
921Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>', the from-
922 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
923
924
925## Notes on whitespace fixing
926
927The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
928 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
929
930It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
931 example further down and notes on 'Input restrictions' in the manpage).
932
933Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
934
935Examples (how primary text with linebreaks would be converted by below code):
936
937 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
938 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
939
940Blanks are inserted before the 1st character:
941
942 NOTE: not stringent ('...' stands for text):
943
944 beg1............................end1 => no blank before 'beg1'
945 beg2....<pb/>...................end2 => no blank before 'beg2'
946 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
947 beg4....<test>ok</test>.........end4 => blank before 'beg4'
948
949 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
950 ^
951 |_blank between 'end3' and 'beg4'
952
953
954## Notes on segfault prevention
955
Akron91577922021-02-19 10:32:54 +0100956binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100957(see notes on 'PerlIO layers' in 'man XML::LibXML'),
958removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
959see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
960see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.