blob: 70289f4aa875b70495bff1eb6503521746a99132 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
# Script version; it is also part of the --help and --version output
our $VERSION     = '1.00';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Minimal additional debug output is switched on through the environment
# (KORAPXMLTEI_DEBUG=1); there is no need for a command line parameter
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Option values and their defaults
my $root_dir          = '.';
my $input_fname       = '';
my $tokenizer_call;
my $tokenizer_korap;
my $tokenizer_intern;
my $use_tokenizer_sentence_splits;
my $inline_tokens     = 'tokens#morpho';
my $inline_structures = 'struct#structure';
my $skip_inline_tokens = 0;
my $base_dir          = 'base';
my $data_file         = 'data';
my $header_file       = 'header';
my $tokens_file       = 'tokens';
my $log_level         = 'notice';

# Parse options from the command line
GetOptions(
  'root|r=s'                        => \$root_dir,
  'input|i=s'                       => \$input_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'base-foundry=s'                  => \$base_dir,
  'data-file=s'                     => \$data_file,
  'header-file=s'                   => \$header_file,
  'tokens-file=s'                   => \$tokens_file,
  'log|l=s'                         => \$log_level,

  # Print the selected POD sections, preceded by the version message
  'help|h' => sub {
    pod2usage(
      -msg      => $VERSION_MSG,
      -output   => '-',
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS'
    );
  },

  # Print the version message followed by the short usage
  'version|v' => sub {
    pod2usage(
      -msg     => $VERSION_MSG,
      -output  => '-',
      -verbose => 0
    );
  }
);
75
# Establish logger (log messages are written UTF-8 encoded to STDERR)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;

# Tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# Name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';

# Sentence splits can only be taken from the KorAP tokenizer
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
}

# External tokenization (optional): either a user supplied command
# or the KorAP tokenizer
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}
elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally (inside an eval
  # at the top of this file), so its absence has to be reported here
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal(
      'KorAP tokenizer requested (-tk), but ' .
      'KorAP::XML::TEI::Tokenizer::KorAP could not be loaded'
    );
  }

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;

# Split a '<foundry>#[<file>]' specification into foundry and file name.
# A missing or empty file part ('foundry' as well as 'foundry#') falls
# back to the given default, so no entry named '.xml' can be created.
my $foundry_and_file = sub {
  my ($spec, $default_file) = @_;
  my ($foundry, $file) = split '#', $spec;
  $foundry //= '';
  $file = $default_file unless defined $file && length $file;
  return ($foundry, $file);
};

# Name of the directory and the file containing all inline structure
# information, except for $_TOKENS_TAG information
my ($_structure_dir, $_structure_file) =
  $foundry_and_file->($inline_structures, 'structure');

# Name of the directory and the file containing all inline token
# information, i.e. the tokens of the $_TOKENS_TAG
my ($_tokens_dir, $_tokens_file) =
  $foundry_and_file->($inline_tokens, 'morpho');

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;

# Initialize Token- and Structure-Collector
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
Akron09e0b2c2020-07-28 15:57:01 +0200138
Peter Harders6f526a32020-06-29 21:44:41 +0200139
140#
141# ~~~ variables ~~~
142#
143
# Text directory (below $root_dir); stays empty until a text header was read
my $dir = '';

# Id of the current text and its escaped version
my ($text_id, $text_id_esc);

# Index of the array of child elements inside an element of $tree_data
# (only used inside the recursive function 'retr_info').
# The value depends on DEBUG, because line numbers are included in the
# elements of $tree_data for debugging.
my $child_idx = DEBUG ? 5 : 4;

# Element from $tree_data (only used inside 'retr_info')
my $e;

# Keeping track of the current positions in the text
my $pos;

# Default encoding of the text
my $input_enc = 'UTF-8';

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $from = 0;

# Text line (needed for whitespace handling)
my $text_line = 0;

# Hash for indices of whitespace-nodes
# (needed to correct from-values again)
# IDEA:
#   when closing an element, check if its from-index minus 1 refers to a
#   whitespace-node (means: 'from-index - 1' is a key in %ws).
#   If this is _not_ the case, then the from-value is one
#   too high => correct it by subtracting 1
my %ws;


#
# ~~~ main ~~~
#

# ~ read input and write output (text by text) ~

# Input file handle (default: stdin)
my $input_fh = *STDIN;

if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
Peter Harders6f526a32020-06-29 21:44:41 +0200203
Akrond53913c2021-02-24 09:50:13 +0100204# Reading input document
# Read the input document line by line.
# Header elements are parsed and written to the zip archive as soon as they
# occur; the lines of a text body are buffered and the text is parsed and
# written once the closing text-body tag is reached.
MAIN: while (<$input_fh>) {

  # Remove (multi-line) XML comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding based on the XML declaration.
  # A backreference has no meaning inside a bracketed character class,
  # therefore the value is matched as "anything but a quote".
  if (index($_, '<?xml') == 0
        && m/\sencoding=(?<quote>['"])(?<enc>[^'"]+)\k<quote>/) {
    $input_enc = $+{enc};
    next;
  }

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Save the captures before any further match can overwrite them
    my ($prefix, $suffix) = ($1, $2);

    # The opening tag has to stand alone on its line
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
        "line with opening text-body tag '${_TEXT_BODY}' " .
        "contains additional information ... => Aborting (line=$_)");
    }

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      my $end_pos = index($_, '</' . $_TEXT_BODY);
      if ($end_pos >= 0) {

        # write data.xml, structure.xml and possibly morpho.xml
        # and/or tokenization files

        # The closing tag has to stand alone on its line
        if ((substr($_, 0, $end_pos) . substr($_, length("</$_TEXT_BODY>") + $end_pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
            "line with closing text-body tag '${_TEXT_BODY}'".
            " contains additional information ... => Aborting (line=$_)");
        }

        # No text header was seen for this body
        if ($dir eq '') {

          # NOTE(review): $data is only filled by retr_info() further below,
          # so this excerpt is presumably always empty here - maybe
          # $text_buffer was intended; confirm before changing.
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
            'data=' . substr($data->data, 0, 200)
          );
          next MAIN;
        }

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$text_buffer</text>",
          huge   => 1
        );

        # See notes on whitespace handling
        my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

        # XCT_LINE_NUMBERS is only needed for debugging
        # (see XML::CompactTree::XS)
        $param |= XCT_LINE_NUMBERS if DEBUG;
        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

        $structures->reset;

        $tokens->reset unless $skip_inline_tokens;

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # ~ recursion ~
        retr_info(1, $tree_data->[2]); # parse input data

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        }

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          }
        }

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->reset;
          $cons_tok->reset;
        }

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          );
        }

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          );
        }

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      }


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # Line contains at least one tag with at least one character of content
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # Insert blank before 1st character
        # (for 2nd line and consecutive lines)
        s/^(.)/ $1/ if $text_line > 1;
      }

      # Add line to buffer
      $text_buffer .= $_;
    }
  }

  # ~ start of header ~
  elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # Save the captures before any further match can overwrite them
    my ($prefix, $content) = ($1, "$2\n");

    # Nothing but whitespace may precede the opening header tag
    if ($prefix !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
        'line with opening header tag is not in expected format ... ' .
        "=> Aborting (line=$_)");
    }

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # Log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      }
    }
  }
}

# Finish the zip archive and release all resources
$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100444
Peter Hardersd892a582020-02-12 15:45:22 +0100445
# Recursively called function to handle XML tree data.
#
# Walks a list of nodes as returned by XML::CompactTree::XS, appends the
# content of all text and whitespace nodes to the primary data ($data) and
# creates one annotation in $structures per element node (elements named
# $_TOKENS_TAG are additionally added to $tokens, unless inline tokens
# are skipped).
#
# Parameters:
#   1. recursion level
#      (1 = topmost level inside retr_info() = should always be the level
#      of tag $_TEXT_BODY)
#   2. array reference of nodes
#      (see notes on how 'XML::CompactTree::XS' works and
#      'NODE TYPES' in the manpage of XML::LibXML::Reader)
#
# Dies on node types that are not handled yet and on inconsistent offsets.
sub retr_info {
  my ($depth, $nodes) = @_;

  # Iteration through all array elements; the loop variable is lexical,
  # so every recursion level works on its own element
  foreach my $e (@{$nodes}) {

    # Element node
    if ($e->[0] == XML_READER_TYPE_ELEMENT) {

      #~~~~
      # from here: tag-node (opening)
      #~~~~

      # $e->[1] represents the tag name

      # Skip sentences (they are provided by the tokenizer instead),
      # but still descend into their children
      if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
        if (defined $e->[$child_idx]) {
          retr_info($depth + 1, $e->[$child_idx]);
        }
        next;
      }

      my $anno = $structures->add_new_annotation($e->[1]);

      # Add element also to token list
      if (!$skip_inline_tokens && $e->[1] eq $_TOKENS_TAG) {
        $tokens->add_annotation($anno);
      }

      # Handle attributes (if attributes exist)
      if (defined $e->[3]) {

        # With 'XCT_ATTRIBUTE_ARRAY', $e->[3] is an array reference of the
        # form [ name1, value1, name2, value2, ....] of attribute names and
        # corresponding values.
        my $attrs = $e->[3];
        for (my $i = 0; $i < @{$attrs}; $i += 2) {

          # $i references the 'key' and $i + 1 the 'value'
          $anno->add_attribute(@{$attrs}[$i, $i + 1]);
        }
      }

      # This is where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);

      #~~~~
      # until here: tag-node (opening)
      #~~~~


      # Call function recursively.
      # Do no recursion, if $e->[$child_idx] is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      if (defined $e->[$child_idx]) {

        # Recursion with array of child-nodes
        retr_info($depth + 1, $e->[$child_idx]);
      }


      #~~~~~
      # from here: tag-node (closing)
      #~~~~~

      # NOTE: use $pos, because the offsets are _between_ the characters
      # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $pos = $data->position;

      # Handle structures and tokens

      my $from = $anno->from;

      # ~ whitespace related issue ~
      if ($from > 0 && not exists $ws{$from - 1}) {

        # ~ previous node was a text-node ~
        $anno->set_from($from - 1);
      }

      # In case this fails, check input
      if (($from - 1) > $pos) {
        die $log->fatal(
          "text_id='$text_id', " .
          'processing of structures: ' .
          "from-value ($from) is 2 or more greater " .
          "than to-value ($pos) => please check. Aborting"
        );
      }

      # TODO:
      #   find example for which this case applies
      #   maybe this is not necessary anymore, because the
      #   above correction of the from-value suffices
      #
      # TODO:
      #   check, if it's better to remove this line and
      #   change above check to 'if ($from - 1) >= $pos;
      #   do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($pos) if $from == $pos + 1;
      $anno->set_to($pos);
      $anno->set_level($depth);

      # Clean up whitespace
      delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};

      #~~~~
      # until here: tag-node (closing)
      #~~~~
    }

    # Text node
    elsif ($e->[0] == XML_READER_TYPE_TEXT) {

      $add_one = 1;
      $data->append($e->[1]);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # State that this from-index belongs to a whitespace-node
      # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($e->[1]);
    }

    # Not yet handled type
    else {
      die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
    }
  }
}
593
Peter Harders6f526a32020-06-29 21:44:41 +0200594
Akrond949e182020-02-14 12:23:57 +0100595__END__
596
597=pod
598
599=encoding utf8
600
601=head1 NAME
602
603tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
604
605=head1 SYNOPSIS
606
607 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
608
609=head1 DESCRIPTION
610
Akronee434b12020-07-08 12:53:01 +0200611C<tei2korapxml> is a script to convert TEI P5 and
612L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
613based documents to the
614L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
615If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100616read from C<STDIN>. If no specific output is defined, data is written
617to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200618
Akrond949e182020-02-14 12:23:57 +0100619This program is usually called from inside another script.
620
Akronee434b12020-07-08 12:53:01 +0200621=head1 FORMATS
622
623=head2 Input restrictions
624
625=over 2
626
627=item
628
Akronee434b12020-07-08 12:53:01 +0200629TEI P5 formatted input with certain restrictions:
630
631=over 4
632
633=item
634
635B<mandatory>: text-header with integrated textsigle, text-body
636
637=item
638
639B<optional>: corp-header with integrated corpsigle,
640doc-header with integrated docsigle
641
642=back
643
644=item
645
Tokens inside the primary text must not be separated by newlines,
because newlines are removed (see L<KorAP::XML::TEI::Data>), and a
conversion of newlines into blanks between 2 tokens could lead to
additional blanks where there should be none (e.g.: punctuation
characters like C<,> or C<.> should not be separated from their
predecessor token).
(See also code section C<~ whitespace handling ~>.)
653
654=back
655
656=head2 Notes on the output
657
658=over 2
659
660=item
661
662zip file output (default on C<stdout>) with utf8 encoded entries
663(which together form the KorAP-XML format)
664
665=back
666
Akrond949e182020-02-14 12:23:57 +0100667=head1 INSTALLATION
668
669C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
670these bindings are available, the preferred way to install the script is
671to use L<cpanm|App::cpanminus>.
672
673 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
674
675In case everything went well, the C<tei2korapxml> tool will
676be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200677
Akrond949e182020-02-14 12:23:57 +0100678Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
679
680=head1 OPTIONS
681
682=over 2
683
Akron4e603a52020-07-27 14:23:49 +0200684=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100685
Akron4e603a52020-07-27 14:23:49 +0200686The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100687
688=item B<--help|-h>
689
690Print help information.
691
692=item B<--version|-v>
693
694Print version information.
695
Akron4e603a52020-07-27 14:23:49 +0200696=item B<--tokenizer-call|-tc>
697
Call an external tokenizer process that tokenizes
a single line from STDIN and outputs one token per line.
700
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200701=item B<--tokenizer-korap|-tk>
702
703Use the standard KorAP/DeReKo tokenizer.
704
Akron6d7b8e42020-09-29 07:37:41 +0200705=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200706
Tokenize the data using two embedded tokenizers
that take an I<aggressive> and a I<conservative>
approach.
710
Akron75d63142021-02-23 18:40:56 +0100711=item B<--skip-inline-tokens>
712
713Boolean flag indicating that inline tokens should not
714be processed. Defaults to false (meaning inline tokens will be processed).
715
Akron1a5271a2021-02-18 13:18:15 +0100716=item B<--inline-tokens> <foundry>#[<file>]
717
718Define the foundry and file (without extension)
719to store inline token information in.
720If L</KORAPXMLTEI_INLINE> is set, this will contain
721annotations as well.
722Defaults to C<tokens> and C<morpho>.
723
Akrondd0be8f2021-02-18 19:29:41 +0100724=item B<--inline-structures> <foundry>#[<file>]
725
726Define the foundry and file (without extension)
727to store inline structure information in.
728Defaults to C<struct> and C<structures>.
729
Akron26a71522021-02-19 10:27:37 +0100730=item B<--base-foundry> <foundry>
731
732Define the base foundry to store newly generated
733token information in.
734Defaults to C<base>.
735
736=item B<--data-file> <file>
737
738Define the file (without extension)
739to store primary data information in.
740Defaults to C<data>.
741
742=item B<--header-file> <file>
743
744Define the file name (without extension)
745to store header information on
746the corpus, document, and text level in.
747Defaults to C<header>.
748
Marc Kupietz985da0c2021-02-15 19:29:50 +0100749=item B<--use-tokenizer-sentence-splits|-s>
750
751Replace existing sentence boundary information with, or add, the sentence
752boundaries provided by the tokenizer (currently only supported for the KorAP tokenizer).
753
Akron91705d72021-02-19 10:59:45 +0100754=item B<--tokens-file> <file>
755
756Define the file (without extension)
757to store generated token information in
758(either from the KorAP tokenizer or an externally called tokenizer).
759Defaults to C<tokens>.
760
Akron3378dfd2020-08-01 15:01:36 +0200761=item B<--log|-l>
762
763Loglevel for I<Log::Any>. Defaults to C<notice>.
764
Akrond949e182020-02-14 12:23:57 +0100765=back
766
Akronb3649472020-09-29 08:24:46 +0200767=head1 ENVIRONMENT VARIABLES
768
769=over 2
770
771=item B<KORAPXMLTEI_DEBUG>
772
773Activate minimal debugging.
774Defaults to C<false>.
775
776=item B<KORAPXMLTEI_INLINE>
777
778Process inline annotations, if present.
779Defaults to C<false>.
780
781=back
782
Akrond949e182020-02-14 12:23:57 +0100783=head1 COPYRIGHT AND LICENSE
784
Marc Kupietze955ecc2021-02-17 17:42:01 +0100785Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100786
787Author: Peter Harders
788
Akronaabd0952020-09-29 07:35:08 +0200789Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100790
791L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
792Corpus Analysis Platform at the
793L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
794member of the
795L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
796
797This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100798L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100799
800=cut
Akronf8088e62021-02-18 16:18:59 +0100801
802# NOTES
803
804## Notes on how 'XML::CompactTree::XS' works
805
806Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
807
808Print out name of 'node2' for the above example:
809
810echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
811
812Exploring the structure of $data ( = reference to below array ):
813
814[ 0: XML_READER_TYPE_DOCUMENT,
815 1: ?
Akron91577922021-02-19 10:32:54 +0100816 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100817 1: 'node'
818 2: ?
819 3: HASH (attributes)
820 4: 1 (line number)
821 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
822 1: 'node1'
823 2: ?
824 3: undefined (no attributes)
825 4: 1 (line number)
826 5: [ 0: [ 0: XML_READER_TYPE_TEXT
827 1: 'some '
828 ]
829 1: [ 0: XML_READER_TYPE_ELEMENT
830 1: 'n'
831 2: ?
832 3: undefined (no attributes)
833 4: 1 (line number)
834 5: undefined (no child-nodes)
835 ]
836 2: [ 0: XML_READER_TYPE_TEXT
837 1: ' text'
838 ]
839 ]
840 ]
841 1: [ 0: XML_READER_TYPE_ELEMENT
842 1: 'node2'
843 2: ?
844 3: undefined (no attributes)
845 4: 1 (line number)
846 5: [ 0: [ 0: XML_READER_TYPE_TEXT
847 1: 'more-text'
848 ]
849 ]
850 ]
851 ]
852 ]
853 ]
854]
855
856$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
857
858ref($data->[2]) == ARRAY (with 1 element for 'node')
859ref($data->[2]->[0]) == ARRAY (with 6 elements)
860
861$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
862$data->[2]->[0]->[1] == 'node'
863ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
864$data->[2]->[0]->[4] == 1 (line number)
865ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akrond53913c2021-02-24 09:50:13 +0100866 # child-nodes of actual node (see $child_idx)
Akronf8088e62021-02-18 16:18:59 +0100867
868ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
869$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
870$data->[2]->[0]->[5]->[0]->[1] == 'node1'
871$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
872$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
873ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
874
875ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
876$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
877$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
878
879ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
880$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
881$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
882$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
883$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
884$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
885
886ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
887$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
888$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
889
890
891retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
892Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
893${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
894
895
896## Notes on whitespace handling
897
898Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
899(see function 'retr_info()').
900
901Definition of significant and insignificant whitespace
902(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
903
904Significant whitespace is part of the document content and should be preserved.
905Insignificant whitespace is used when editing XML documents for readability.
906These whitespaces are typically not intended for inclusion in the delivery of the document.
907
908### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
909
910The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
911 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
912
913When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
914 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
915 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
916
917echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
918
919
920Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
921
922Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
923 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
924
925The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
926 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
927
928The assumption here is that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
929 provides a way to check whether this really _was_ the case for the last 2 'non-tag'-nodes when closing a tag:
930
931When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
932 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
933 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
934 the last read 'non-tag'-node has to be corrected (see [1]).
935
936For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
937 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
938
939[1]
940Now, what happens when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
941 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akrond53913c2021-02-24 09:50:13 +0100942 (see above code fragment '... not exists $ws{ $from - 1 } ...').
Akronf8088e62021-02-18 16:18:59 +0100943
944[2]
945Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled in both cases as a
946 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
947
948The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
949 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
950
951Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
952 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
953
954
955## Notes on whitespace fixing
956
957The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
958 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
959
960It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
961 example further down and notes on 'Input restrictions' in the manpage).
962
963Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
964
965Examples (how primary text with linebreaks would be converted by below code):
966
967 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
968 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
969
970Blanks are inserted before the 1st character:
971
972 NOTE: not stringent ('...' stands for text):
973
974 beg1............................end1 => no blank before 'beg1'
975 beg2....<pb/>...................end2 => no blank before 'beg2'
976 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
977 beg4....<test>ok</test>.........end4 => blank before 'beg4'
978
979 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
980 ^
981 |_blank between 'end3' and 'beg4'
982
983
984## Notes on segfault prevention
985
Akron91577922021-02-19 10:32:54 +0100986binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100987(see notes on 'PerlIO layers' in 'man XML::LibXML'),
988removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
989see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
990see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.