blob: c18f9412f52c43f3aefb1c0461b45bd5ac8b13be [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
# Release version of this script
our $VERSION = '1.00';

# Banner that is passed as message to pod2usage() by --help and --version
our $VERSION_MSG = join('', "\n", 'tei2korapxml - v', $VERSION, "\n");

# Compile-time debug switch, read from the environment variable
# KORAPXMLTEI_DEBUG (0, if the variable is not defined)
use constant DEBUG => ($ENV{KORAPXMLTEI_DEBUG} // 0);
Peter Hardersd892a582020-02-12 15:45:22 +010042
# Parse options from the command line.
#
# Every option is bound to a lexical variable that is declared right here
# and stays visible for the rest of the script; the value assigned in the
# declaration is the default of the option.
#
# GetOptions() returns false, if the command line could not be processed
# (e.g. unknown option or missing value) and reports the details on STDERR.
# In that case the usage is printed and the script exits with status 2,
# instead of starting a conversion with a partially applied configuration.
GetOptions(
  'root|r=s'                        => \(my $root_dir = '.'),
  'input|i=s'                       => \(my $input_fname = ''),
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s'                 => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'             => \(my $inline_structures = 'struct#structure'),
  'skip-inline-tokens'              => \(my $skip_inline_tokens = 0),
  'base-foundry=s'                  => \(my $base_dir = 'base'),
  'data-file=s'                     => \(my $data_file = 'data'),
  'header-file=s'                   => \(my $header_file = 'header'),
  'tokens-file=s'                   => \(my $tokens_file = 'tokens'),
  'log|l=s'                         => \(my $log_level = 'notice'),

  # Print the selected POD sections (preceded by the version banner)
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version banner (followed by the synopsis)
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(
  -verbose => 0,
  -exitval => 2
);
75
# Establish logger (log messages are written UTF-8 encoded to STDERR)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;

# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';

# Sentence splits can only be taken over from the KorAP tokenizer,
# so requesting them without that tokenizer is a usage error
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
};

# External tokenizer - stays undefined, if neither an external
# tokenizer call nor the KorAP tokenizer was requested.
# An explicitly given tokenizer call takes precedence.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}
elsif ($tokenizer_korap) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};
Peter Harders6f526a32020-06-29 21:44:41 +0200105##
106
Akron0c41ab32020-09-29 07:33:33 +0200107
Akron4e3c7e32021-02-18 15:19:53 +0100108#
109# ~~~ constants ~~~
110#
111
112
## intern tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##

# Name of the directory and the file containing all inline structure
# information except for $_TOKENS_TAG information.
# The option value has the form '<foundry>#[<file>]'. A missing or an
# empty file part (as in 'struct' or 'struct#') falls back to 'structure',
# so that never a file without a name is written.
my ($_structure_dir, $_structure_file) = split /#/, $inline_structures;
$_structure_dir //= '';
$_structure_file = 'structure'
  unless defined $_structure_file && length $_structure_file;

# Name of the directory and the file containing all inline token
# information, i.e. tokens of the $_TOKENS_TAG
# (only written, if inline tokens are not skipped).
# Same format as above, the file part falls back to 'morpho'.
my ($_tokens_dir, $_tokens_file) = split /#/, $inline_tokens;
$_tokens_dir //= '';
$_tokens_file = 'morpho'
  unless defined $_tokens_file && length $_tokens_file;

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;

# Initialize Token- and Structure-Collector
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
Akron09e0b2c2020-07-28 15:57:01 +0200138
Peter Harders6f526a32020-06-29 21:44:41 +0200139
140#
141# ~~~ variables ~~~
142#
143
# --- state of the text currently processed ---

# text directory (below $root_dir);
# empty as long as no text level header was read
my $dir = '';

# Escaped version of text id
my $text_id_esc;

# Default encoding of the text
# (replaced by the encoding given in the XML declaration, if present)
my $input_enc = 'UTF-8';

# text line (needed for whitespace handling)
my $text_line = 0;

# --- helpers for traversing the parsed text body ---

# element from $tree_data
my $e;

# Keeping track of the current positions in the text
my $pos;

# variables for handling ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $from = 0;
my $add_one;

# hash for indices of whitespace-nodes
# (needed to correct from-values)
# IDEA:
#   when closing an element, check if its from-index minus 1 refers to
#   a whitespace-node (means: 'from-index - 1' is a key in %ws).
#   If this is _not_ the case, then the from-value is one too high
#   => correct it by subtracting 1
my %ws;
Peter Harders6f526a32020-06-29 21:44:41 +0200175
Peter Harders6f526a32020-06-29 21:44:41 +0200176
177#
178# ~~~ main ~~~
179#
180
Peter Harders6f526a32020-06-29 21:44:41 +0200181# ~ read input and write output (text by text) ~
Peter Hardersd892a582020-02-12 15:45:22 +0100182
# Input file handle - data is read from stdin,
# unless an input file name was passed
my $input_fh = *STDIN;

if (length $input_fname) {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
};

# Prevents segfaulting (see notes on segfault prevention)
# The lines are decoded explicitly using decode() later on.
binmode $input_fh;
Peter Harders6f526a32020-06-29 21:44:41 +0200194
Peter Harders6f526a32020-06-29 21:44:41 +0200195
# Reading input document
#
# The input is processed line by line:
# - an XML declaration sets the input encoding
# - a header line is handed over to the header parser; a text level
#   header defines directory and id of the text that follows
# - the lines of a text body are collected, parsed as XML on the closing
#   tag and written (together with all annotation layers) to the zip
MAIN: while ( <$input_fh> ) {

  # remove HTML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding
  # The value ends at the quote character it was opened with. As a
  # backreference is not available inside of a bracketed character class,
  # a non-greedy match followed by the backreference is used.
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of Text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Keep the captures, as they are reset by the following matches
    my ($prefix, $suffix) = ($1, $2);

    # The opening tag has to be the only content of its line
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # Offset of the closing tag in the current line (-1 if there is none)
      my $close_pos = index($_, '</' . $_TEXT_BODY);

      # End of text body
      if ($close_pos >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # The closing tag has to be the only content of its line
        if ((substr($_, 0, $close_pos) . substr($_, length("</$_TEXT_BODY>") + $close_pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text level header preceded this text body
        if ($dir eq '') {
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($data->data, 0, 200)
            );
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$text_buffer</text>",
          huge => 1
        );

        # See notes on whitespace handling
        my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

        # XCT_LINE_NUMBERS is only needed for debugging
        # (see XML::CompactTree::XS)
        $param |= XCT_LINE_NUMBERS if DEBUG;
        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # ~ recursion ~
        descend(1, $tree_data->[2]); # parse input data

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          )->reset;
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          )->reset;
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        s/^(.)/ $1/ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };

  } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # ~ start of header ~
    my $content = "$2\n";

    # Nothing but whitespace is accepted in front of the header tag
    if ($1 !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };

    # Parse header
    # (the remaining lines of the header are read from $input_fh)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};
Peter Hardersd892a582020-02-12 15:45:22 +0100420
# All input is processed: close the zipper, the external tokenizer
# (if one was set up) and the input, then leave with exit status 0
$zipper->close;

if ($ext_tok) {
  $ext_tok->close;
};

close $input_fh;

exit 0;
Peter Hardersd892a582020-02-12 15:45:22 +0100428
Peter Hardersd892a582020-02-12 15:45:22 +0100429
# Recursively called function to handle XML tree data.
#
# Text and whitespace nodes are appended to the primary data ($data),
# for every element node an annotation with its character offsets, its
# attributes and its depth is collected in $structures (and additionally
# in $tokens, if the element is a $_TOKENS_TAG).
#
# Parameters:
#   1. recursion level
#      (1 = topmost level inside descend() = should always be level
#      of tag $_TEXT_BODY)
#   2. array reference of the nodes to process
#      (see notes on how 'XML::CompactTree::XS' works and
#      see 'NODE TYPES' in manpage of XML::LibXML::Reader)
sub descend {
  my ($depth, $nodes) = @_;

  NODE: foreach my $node (@{$nodes}) {

    my $type = $node->[0];

    # Text node
    if ($type == XML_READER_TYPE_TEXT) {
      $add_one = 1;
      $data->append($node->[1]);
      next NODE;
    };

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    if ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node->[1]);
      next NODE;
    };

    # not yet handled type
    if ($type != XML_READER_TYPE_ELEMENT) {
      die $log->fatal('Not yet handled type ($e->[0]=' . $type . ') ... => Aborting');
    };


    #~~~~
    # from here: tag-node (opening)
    #~~~~

    # $node->[1] represents the tag name
    my $tag = $node->[1];

    # Get the child index depending on the debug state
    # (XCT_LINE_NUMBERS is only requested, if DEBUG is set).
    # This is likely to be optimized away by the compiler.
    my $children = $node->[DEBUG ? 5 : 4];

    # Skip sentences: no annotation is created for the element itself,
    # but its children are processed on the next level
    if ($use_tokenizer_sentence_splits && $tag eq "s") {
      descend($depth+1, $children) if defined $children;
      next NODE;
    };

    my $anno = $structures->add_new_annotation($tag);

    # Add element also to token list
    if (!$skip_inline_tokens && $tag eq $_TOKENS_TAG) {
      $tokens->add_annotation($anno);
    };

    # Handle attributes (if attributes exist)
    # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
    # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
    # note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
    my $attrs = $node->[3];
    if (defined $attrs) {

      # '$i' references the 'key' and '$i + 1' the 'value'
      for (my $i = 0; $i < @{$attrs}; $i += 2) {
        $anno->add_attribute(@{$attrs}[$i, $i + 1]);
      };
    };

    # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
    $anno->set_from($data->position + $add_one);

    #~~~~
    # until here: tag-node (opening)
    #~~~~


    # Call function recursively
    # do no recursion, if $children is not defined
    # (because we have no array of child-nodes, e.g.: <back/>)
    descend($depth+1, $children) if defined $children;


    #~~~~~
    # from here: tag-node (closing)
    #~~~~~

    # NOTE: use $pos, because the offsets are _between_ the characters
    # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
    my $pos = $data->position;

    # Handle structures and tokens
    my $from = $anno->from;

    # ~ whitespace related issue ~
    # The from-value is one too high, if the position in front of it
    # was not recorded as a whitespace-node
    if ($from > 0 && !exists $ws{$from - 1}) {

      # ~ previous node was a text-node ~
      $anno->set_from($from - 1);
    };

    # in case this fails, check input
    if (($from - 1) > $pos) {
      die $log->fatal(
        "text_id='$text_id_esc', " .
          'processing of structures: ' .
          "from-value ($from) is 2 or more greater " .
          "than to-value ($pos) => please check. Aborting"
        );
    };

    # TODO:
    #   find example for which this case applies
    #   maybe this is not necessary anymore, because the
    #   above recorrection of the from-value suffices
    #
    # TODO:
    #   check, if it's better to remove this line and
    #   change above check to 'if ($from - 1) >= $pos;
    #   do testing with bigger corpus excerpt (wikipedia?)
    $anno->set_from($pos) if $from == $pos + 1;
    $anno->set_to($pos);
    $anno->set_level($depth);

    # Clean up whitespace
    delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};

    #~~~~
    # until here: tag-node (closing)
    #~~~~
  };
};
575
Peter Harders6f526a32020-06-29 21:44:41 +0200576
Akrond949e182020-02-14 12:23:57 +0100577__END__
578
579=pod
580
581=encoding utf8
582
583=head1 NAME
584
585tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
586
587=head1 SYNOPSIS
588
589 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
590
591=head1 DESCRIPTION
592
Akronee434b12020-07-08 12:53:01 +0200593C<tei2korapxml> is a script to convert TEI P5 and
594L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
595based documents to the
596L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
597If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100598read from C<STDIN>. If no specific output is defined, data is written
599to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200600
Akrond949e182020-02-14 12:23:57 +0100601This program is usually called from inside another script.
602
Akronee434b12020-07-08 12:53:01 +0200603=head1 FORMATS
604
605=head2 Input restrictions
606
607=over 2
608
609=item
610
Akronee434b12020-07-08 12:53:01 +0200611TEI P5 formatted input with certain restrictions:
612
613=over 4
614
615=item
616
617B<mandatory>: text-header with integrated textsigle, text-body
618
619=item
620
621B<optional>: corp-header with integrated corpsigle,
622doc-header with integrated docsigle
623
624=back
625
626=item
627
Tokens inside the primary text must not be
separated by newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~>).
635
636=back
637
638=head2 Notes on the output
639
640=over 2
641
642=item
643
644zip file output (default on C<stdout>) with utf8 encoded entries
645(which together form the KorAP-XML format)
646
647=back
648
Akrond949e182020-02-14 12:23:57 +0100649=head1 INSTALLATION
650
C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
652these bindings are available, the preferred way to install the script is
653to use L<cpanm|App::cpanminus>.
654
655 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
656
657In case everything went well, the C<tei2korapxml> tool will
658be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200659
Akrond949e182020-02-14 12:23:57 +0100660Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
661
662=head1 OPTIONS
663
664=over 2
665
Akron4e603a52020-07-27 14:23:49 +0200666=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100667
Akron4e603a52020-07-27 14:23:49 +0200668The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100669
670=item B<--help|-h>
671
672Print help information.
673
674=item B<--version|-v>
675
676Print version information.
677
Akron4e603a52020-07-27 14:23:49 +0200678=item B<--tokenizer-call|-tc>
679
680Call an external tokenizer process, that will tokenize
681a single line from STDIN and outputs one token per line.
682
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200683=item B<--tokenizer-korap|-tk>
684
685Use the standard KorAP/DeReKo tokenizer.
686
Akron6d7b8e42020-09-29 07:37:41 +0200687=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200688
Tokenize the data using two embedded tokenizers,
that will take an I<aggressive> and a I<conservative>
approach.
692
Akron75d63142021-02-23 18:40:56 +0100693=item B<--skip-inline-tokens>
694
695Boolean flag indicating that inline tokens should not
696be processed. Defaults to false (meaning inline tokens will be processed).
697
Akron1a5271a2021-02-18 13:18:15 +0100698=item B<--inline-tokens> <foundry>#[<file>]
699
700Define the foundry and file (without extension)
701to store inline token information in.
702If L</KORAPXMLTEI_INLINE> is set, this will contain
703annotations as well.
704Defaults to C<tokens> and C<morpho>.
705
Akrondd0be8f2021-02-18 19:29:41 +0100706=item B<--inline-structures> <foundry>#[<file>]
707
Define the foundry and file (without extension)
to store inline structure information in.
Defaults to C<struct> and C<structure>.
711
Akron26a71522021-02-19 10:27:37 +0100712=item B<--base-foundry> <foundry>
713
714Define the base foundry to store newly generated
715token information in.
716Defaults to C<base>.
717
718=item B<--data-file> <file>
719
720Define the file (without extension)
721to store primary data information in.
722Defaults to C<data>.
723
724=item B<--header-file> <file>
725
726Define the file name (without extension)
727to store header information on
728the corpus, document, and text level in.
729Defaults to C<header>.
730
Marc Kupietz985da0c2021-02-15 19:29:50 +0100731=item B<--use-tokenizer-sentence-splits|-s>
732
733Replace existing sentence boundary information with, or add new, sentence boundary information
734provided by the tokenizer (currently only supported for the KorAP tokenizer).
735
Akron91705d72021-02-19 10:59:45 +0100736=item B<--tokens-file> <file>
737
738Define the file (without extension)
739to store generated token information in
740(either from the KorAP tokenizer or an externally called tokenizer).
741Defaults to C<tokens>.
742
Akron3378dfd2020-08-01 15:01:36 +0200743=item B<--log|-l>
744
745Loglevel for I<Log::Any>. Defaults to C<notice>.
746
Akrond949e182020-02-14 12:23:57 +0100747=back
748
Akronb3649472020-09-29 08:24:46 +0200749=head1 ENVIRONMENT VARIABLES
750
751=over 2
752
753=item B<KORAPXMLTEI_DEBUG>
754
755Activate minimal debugging.
756Defaults to C<false>.
757
758=item B<KORAPXMLTEI_INLINE>
759
760Process inline annotations, if present.
761Defaults to C<false>.
762
763=back
764
Akrond949e182020-02-14 12:23:57 +0100765=head1 COPYRIGHT AND LICENSE
766
Marc Kupietze955ecc2021-02-17 17:42:01 +0100767Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100768
769Author: Peter Harders
770
Akronaabd0952020-09-29 07:35:08 +0200771Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100772
773L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
774Corpus Analysis Platform at the
775L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
776member of the
777L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
778
779This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100780L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100781
782=cut
Akronf8088e62021-02-18 16:18:59 +0100783
784# NOTES
785
786## Notes on how 'XML::CompactTree::XS' works
787
788Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
789
790Print out name of 'node2' for the above example:
791
792echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
793
794Exploring the structure of $data ( = reference to below array ):
795
796[ 0: XML_READER_TYPE_DOCUMENT,
797 1: ?
Akron5aca0d22021-02-24 12:09:53 +0100798 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100799 1: 'node'
800 2: ?
801 3: HASH (attributes)
802 4: 1 (line number)
803 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
804 1: 'node1'
805 2: ?
806 3: undefined (no attributes)
807 4: 1 (line number)
808 5: [ 0: [ 0: XML_READER_TYPE_TEXT
809 1: 'some '
810 ]
811 1: [ 0: XML_READER_TYPE_ELEMENT
812 1: 'n'
813 2: ?
814 3: undefined (no attributes)
815 4: 1 (line number)
816 5: undefined (no child-nodes)
817 ]
818 2: [ 0: XML_READER_TYPE_TEXT
819 1: ' text'
820 ]
821 ]
822 ]
823 1: [ 0: XML_READER_TYPE_ELEMENT
824 1: 'node2'
825 2: ?
826 3: undefined (no attributes)
827 4: 1 (line number)
828 5: [ 0: [ 0: XML_READER_TYPE_TEXT
829 1: 'more-text'
830 ]
831 ]
832 ]
833 ]
834 ]
835 ]
836]
837
838$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
839
840ref($data->[2]) == ARRAY (with 1 element for 'node')
841ref($data->[2]->[0]) == ARRAY (with 6 elements)
842
843$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
844$data->[2]->[0]->[1] == 'node'
845ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
846$data->[2]->[0]->[4] == 1 (line number)
847ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron5aca0d22021-02-24 12:09:53 +0100848 # child-nodes of actual node (see $children)
Akronf8088e62021-02-18 16:18:59 +0100849
850ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
851$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
852$data->[2]->[0]->[5]->[0]->[1] == 'node1'
853$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
854$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
855ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
856
857ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
858$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
859$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
860
861ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
862$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
863$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
864$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
865$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
866$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
867
868ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
869$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
870$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
871
872
Akron5aca0d22021-02-24 12:09:53 +0100873descend() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Akronf8088e62021-02-18 16:18:59 +0100874Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
875${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
876
877
878## Notes on whitespace handling
879
880Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
Akron5aca0d22021-02-24 12:09:53 +0100881(see function 'descend()').
Akronf8088e62021-02-18 16:18:59 +0100882
883Definition of significant and insignificant whitespace
884(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
885
886Significant whitespace is part of the document content and should be preserved.
887Insignificant whitespace is used when editing XML documents for readability.
888These whitespaces are typically not intended for inclusion in the delivery of the document.
889
890### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
891
892The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
893 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
894
895When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
896 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
897 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
898
899echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
900
901
902Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
903
904Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
905 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
906
907The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
908 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
909
910The assumption here is that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
911 provides a way to check whether this really _was_ the case for the last 2 'non-tag'-nodes when closing a tag:
912
913When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
914 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
915 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
916 the last read 'non-tag'-node has to be corrected (see [1]).
917
918For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
919 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
920
921[1]
922Now, what happens when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
923 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akrond53913c2021-02-24 09:50:13 +0100924 (see above code fragment '... not exists $ws{ $from - 1 } ...').
Akronf8088e62021-02-18 16:18:59 +0100925
926[2]
927Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is in both cases handled as a
928 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
929
930The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
931 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
932
933Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>', the from-
934 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
935
936
937## Notes on whitespace fixing
938
939The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
940 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
941
942It soon turned out that the best advice is to simply avoid linebreaks and to put all primary text tokens into one line (see
943 example further down and notes on 'Input restrictions' in the manpage).
944
945Nevertheless, an old and very rough first approach remained, which is not rigorous, but also doesn't affect one-line text.
946
947Examples (how primary text with linebreaks would be converted by below code):
948
949 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
950 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
951
952Blanks are inserted before the 1st character:
953
954 NOTE: not stringent ('...' stands for text):
955
956 beg1............................end1 => no blank before 'beg1'
957 beg2....<pb/>...................end2 => no blank before 'beg2'
958 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
959 beg4....<test>ok</test>.........end4 => blank before 'beg4'
960
961 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
962 ^
963 |_blank between 'end3' and 'beg4'
964
965
966## Notes on segfault prevention
967
Akron91577922021-02-19 10:32:54 +0100968binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100969(see notes on 'PerlIO layers' in 'man XML::LibXML').
970Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
971See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
972see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.