blob: fb58d8e2794fb13ceed04d6699c7a5d77b7ed069 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
# Version of the script (shown by --help and --version)
our $VERSION = '1.01';

our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

use constant {
  # Set to 1 for minimal more debug output (no need to be parametrized)
  DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0,

  # Flags passed to XML::CompactTree::XS when reading the text body.
  # XCT_LINE_NUMBERS is only needed for debugging
  # (see XML::CompactTree::XS); it is activated by the same
  # environment variable as DEBUG, which is why descend() picks the
  # child index depending on DEBUG.
  XCT_PARAM => (
    XCT_DOCUMENT_ROOT
      | XCT_IGNORE_COMMENTS
      | XCT_ATTRIBUTE_ARRAY
      | ($ENV{KORAPXMLTEI_DEBUG} ? XCT_LINE_NUMBERS : 0)
  )
};
Peter Hardersd892a582020-02-12 15:45:22 +010053
# Parse options from the command line.
# Every option stores its value in a lexical declared in place, so the
# defaults are visible right next to the option names.
GetOptions(

  # Root directory passed to the zipper
  'root|r=s' => \(my $root_dir = '.'),

  # Input file (empty string means: read from STDIN)
  'input|i=s' => \(my $input_fname = ''),

  # Tokenizer selection
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),

  # Handling of inline annotations ("<foundry>#<file>")
  'inline-tokens=s'     => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
  'skip-inline-tokens'  => \(my $skip_inline_tokens = 0),

  # Names of generated foundries and files (without extension)
  'base-foundry=s' => \(my $base_dir = 'base'),
  'data-file=s'    => \(my $data_file = 'data'),
  'header-file=s'  => \(my $header_file = 'header'),
  'tokens-file=s'  => \(my $tokens_file = 'tokens'),

  # Log level for Log::Any
  'log|l=s' => \(my $log_level = 'notice'),

  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
86
Akrond3e1d282021-02-24 14:51:27 +010087
# Establish logger: log messages go to STDERR (UTF-8 encoded) using the
# log level given on the command line
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;
92
Akrond3e1d282021-02-24 14:51:27 +010093
# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# tag that starts a header section
# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';
103
Akrond3e1d282021-02-24 14:51:27 +0100104
# Define tokenizers
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
};

# External tokenization
# (an explicitly given tokenizer call takes precedence over -tk)
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally (inside an eval
  # whose failure is ignored), so make sure it is really available
  # instead of failing with an obscure "Can't locate object method" error.
  unless (UNIVERSAL::can('KorAP::XML::TEI::Tokenizer::KorAP', 'new')) {
    die $log->fatal(
      'KorAP tokenizer was requested (-tk), ' .
      'but KorAP::XML::TEI::Tokenizer::KorAP could not be loaded'
    );
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};


# Internal tokenization
# (only used for output, if --tokenizer-internal is set)
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akrond3e1d282021-02-24 14:51:27 +0100127
Peter Harders41c35622020-07-12 01:16:22 +0200128
# Name of the directory and the file containing all inline structure informations
# except for $_TOKENS_TAG information.
# The option value has the form "<foundry>#[<file>]"; a missing or empty
# file part (e.g. "struct" or "struct#") falls back to the default file name.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures;
$_structure_dir //= '';
$_structure_file = 'structure' unless length($_structure_file // '');

# Name of the directory and the file containing all inline token informations
# i.e. tokens of the $_TOKENS_TAG, unless --skip-inline-tokens is set.
# Same "<foundry>#[<file>]" format as above.
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens;
$_tokens_dir //= '';
$_tokens_file = 'morpho' unless length($_tokens_file // '');

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
139
# Initialize Token- and Structure-Collector
my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);


# text directory (below $root_dir)
# (an empty string means, that no text header has been seen yet)
my $dir = '';

# Escaped version of text id
my $text_id_esc;

# element from $tree_data
my $e;

# Default encoding of the text
# (overwritten by the encoding given in the XML declaration)
my $input_enc = 'UTF-8';

# variables for handling ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;

# text line (needed for whitespace handling)
my $text_line = 0;

# hash for indices of whitespace-nodes
# (needed to recorrect from-values)
# IDEA:
# when closing an element, check if its from-index minus 1 refers to a
# whitespace-node (means: 'from-index - 1' is a key in %ws).
# if this is _not_ the case, then the from-value is one
# too high => correct it by subtracting 1
my %ws;
Peter Harders6f526a32020-06-29 21:44:41 +0200178
Peter Harders6f526a32020-06-29 21:44:41 +0200179
# Input file handle (default: stdin)
my $input_fh = *STDIN;

# Read from the given file instead of stdin, if --input is set
if ($input_fname ne '') {
  unless (open($input_fh, '<', $input_fname)) {
    die $log->fatal("File '$input_fname' could not be opened.");
  };
};

# Read raw bytes (lines are decoded explicitly while reading).
# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
Peter Harders6f526a32020-06-29 21:44:41 +0200191
Peter Harders6f526a32020-06-29 21:44:41 +0200192
# Reading input document line by line.
# Two kinds of sections are recognized:
#  - header sections (starting with $_HEADER_TAG), which are parsed and
#    written to the zip archive; a text header sets $dir and $text_id_esc
#  - text bodies (starting with $_TEXT_BODY), which are buffered until the
#    closing tag, parsed to a tree and written as data, tokens and structures
MAIN: while (<$input_fh>) {

  # remove XML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding
  # (the value is everything between the opening quote and the next
  # occurrence of the same quote character)
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Remember $2, as the following match may reset the capture variables
    my $suffix = $2;

    if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
        "line with opening text-body tag '${_TEXT_BODY}' " .
        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if ((my $pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # The closing tag has to be the only content of the line
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
            "line with closing text-body tag '${_TEXT_BODY}'".
            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was seen for this text body
        # NOTE(review): $data is only filled by descend(), which has not run
        # yet at this point - so 'data=' is presumably always empty here.
        if ($dir eq '') {
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
            'data=' . substr($data->data, 0, 200)
          );
          next MAIN;
        };

        # Wrap the buffered body in a single root element for parsing
        my $reader = XML::LibXML::Reader->new(
          string => "<text>$text_buffer</text>",
          huge => 1
        );

        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, XCT_PARAM);

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # Recursively parse all children
        descend(1, $tree_data->[2]);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          # Add sentence boundaries of the tokenizer to the structures,
          # before the structures are written below
          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          )->reset;
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          )->reset;
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };
  }

  # Start of header section
  elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    my $content = "$2\n";

    # Nothing but whitespace is allowed in front of the header tag
    if ($1 !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
        'line with opening header tag is not in expected format ... ' .
        "=> Aborting (line=$_)");
    };

    # Parse header
    # (further lines of the header are read from $input_fh)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};
Peter Hardersd892a582020-02-12 15:45:22 +0100411
# All input is processed: finish the zip archive,
# shut down the external tokenizer (if any) and release the input
$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100419
Peter Hardersd892a582020-02-12 15:45:22 +0100420
# Recursively called function to handle XML tree data.
#
# Walks the node list of a tree created by XML::CompactTree::XS, appends
# all text and whitespace nodes to the primary data ($data) and records
# an annotation with from/to offsets and nesting level for every element
# in $structures (and in $tokens for $_TOKENS_TAG elements).
#
# Arguments:
#   1. recursion level
#      (1 = topmost level inside descend() = should always be
#      level of tag $_TEXT_BODY)
#   2. array reference of the nodes to process
#
# Dies on node types other than element, text and significant whitespace
# and on inconsistent from/to offsets.
sub descend {

  # recursion level
  my $depth = shift;

  # Iteration through all array elements
  # ($_[0] is a reference to an array reference)
  # See notes on how 'XML::CompactTree::XS' works and
  # see 'NODE TYPES' in manpage of XML::LibXML::Reader
  # (the loop variable is lexical, so recursive calls can't interfere)
  foreach my $e (@{$_[0]}) {

    # $e->[1] represents the tag name of an element node
    # or the primary data of a text or ws node
    my $node_info = $e->[1];

    # Element node
    if ($e->[0] == XML_READER_TYPE_ELEMENT) {

      # Deal with opening tag

      # Get the child index depending on the debug state
      # (with debugging, line numbers are part of the node).
      # This is likely to be optimized away by the compiler.
      my $children = $e->[DEBUG ? 5 : 4];

      # Skip sentences
      # (the element itself is not recorded, but its children are processed)
      if ($use_tokenizer_sentence_splits && $node_info eq 's') {
        descend($depth + 1, $children) if defined $children;
        next;
      };

      my $anno = $structures->add_new_annotation($node_info);

      # Add element also to token list
      if (!$skip_inline_tokens && $node_info eq $_TOKENS_TAG) {
        $tokens->add_annotation($anno);
      };

      # Handle attributes (if attributes exist)
      if (defined $e->[3]) {

        # with 'XCT_ATTRIBUTE_ARRAY', $e->[3] is an array reference of the form
        # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
        # NOTE:
        #   arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
        my $attributes = $e->[3];
        for (my $i = 0; $i < @$attributes; $i += 2) {
          $anno->add_attribute(
            @{$attributes}[$i, $i + 1]
          );
        };
      };

      # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);


      # Call function recursively
      # do no recursion, if $children is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      descend($depth + 1, $children) if defined $children;


      # Deal with closing tag

      # NOTE:
      #   use $pos, because the offsets are _between_ the characters
      #   (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $pos = $data->position;

      # Handle structures and tokens

      # NOTE: all following checks use this value of $from,
      # not the value corrected by set_from() below
      my $from = $anno->from;

      # ~ whitespace related issue ~
      if ($from > 0 && not exists $ws{$from - 1}) {

        # Previous node was a text-node
        $anno->set_from($from - 1);
      };

      # in case this fails, check input
      if (($from - 1) > $pos) {
        die $log->fatal(
          "text_id='$text_id_esc', " .
          'processing of structures: ' .
          "from-value ($from) is 2 or more greater " .
          "than to-value ($pos) => please check. Aborting"
        );
      };

      # TODO:
      #   find example for which this case applies
      #   maybe this is not necessary anymore, because the
      #   above recorrection of the from-value suffices
      #
      # TODO:
      #   check, if it's better to remove this line and
      #   change above check to 'if ($from - 1) >= $pos;
      #   do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($pos) if $from == $pos + 1;
      $anno->set_to($pos);
      $anno->set_level($depth);

      # Clean up whitespace
      delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};
    }

    # Text node
    elsif ($e->[0] == XML_READER_TYPE_TEXT) {

      $add_one = 1;
      $data->append($node_info);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node_info);
    }

    # not yet handled type
    else {

      die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
    };
  };
};
555
Peter Harders6f526a32020-06-29 21:44:41 +0200556
Akrond949e182020-02-14 12:23:57 +0100557__END__
558
559=pod
560
561=encoding utf8
562
563=head1 NAME
564
565tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
566
567=head1 SYNOPSIS
568
569 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
570
571=head1 DESCRIPTION
572
Akronee434b12020-07-08 12:53:01 +0200573C<tei2korapxml> is a script to convert TEI P5 and
574L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
575based documents to the
576L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
577If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100578read from C<STDIN>. If no specific output is defined, data is written
579to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200580
Akrond949e182020-02-14 12:23:57 +0100581This program is usually called from inside another script.
582
Akronee434b12020-07-08 12:53:01 +0200583=head1 FORMATS
584
585=head2 Input restrictions
586
587=over 2
588
589=item
590
Akronee434b12020-07-08 12:53:01 +0200591TEI P5 formatted input with certain restrictions:
592
593=over 4
594
595=item
596
597B<mandatory>: text-header with integrated textsigle, text-body
598
599=item
600
601B<optional>: corp-header with integrated corpsigle,
602doc-header with integrated docsigle
603
604=back
605
606=item
607
Akron0c41ab32020-09-29 07:33:33 +0200608All tokens inside the primary text may not be
newline separated, because newlines are removed
Akron0c41ab32020-09-29 07:33:33 +0200610(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akronee434b12020-07-08 12:53:01 +0200611into blanks between 2 tokens could lead to additional blanks,
612where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
614(see also code section C<~ whitespace handling ~>).
615
616=back
617
618=head2 Notes on the output
619
620=over 2
621
622=item
623
624zip file output (default on C<stdout>) with utf8 encoded entries
625(which together form the KorAP-XML format)
626
627=back
628
Akrond949e182020-02-14 12:23:57 +0100629=head1 INSTALLATION
630
631C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
632these bindings are available, the preferred way to install the script is
633to use L<cpanm|App::cpanminus>.
634
635 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
636
637In case everything went well, the C<tei2korapxml> tool will
638be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200639
Akrond949e182020-02-14 12:23:57 +0100640Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
641
642=head1 OPTIONS
643
644=over 2
645
Akron4e603a52020-07-27 14:23:49 +0200646=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100647
Akron4e603a52020-07-27 14:23:49 +0200648The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100649
650=item B<--help|-h>
651
652Print help information.
653
654=item B<--version|-v>
655
656Print version information.
657
Akron4e603a52020-07-27 14:23:49 +0200658=item B<--tokenizer-call|-tc>
659
660Call an external tokenizer process, that will tokenize
661a single line from STDIN and outputs one token per line.
662
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200663=item B<--tokenizer-korap|-tk>
664
665Use the standard KorAP/DeReKo tokenizer.
666
Akron6d7b8e42020-09-29 07:37:41 +0200667=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200668
669Tokenize the data using two embedded tokenizers,
that will take an I<aggressive> and a I<conservative>
671approach.
672
Akron75d63142021-02-23 18:40:56 +0100673=item B<--skip-inline-tokens>
674
675Boolean flag indicating that inline tokens should not
676be processed. Defaults to false (meaning inline tokens will be processed).
677
Akron1a5271a2021-02-18 13:18:15 +0100678=item B<--inline-tokens> <foundry>#[<file>]
679
680Define the foundry and file (without extension)
681to store inline token information in.
682If L</KORAPXMLTEI_INLINE> is set, this will contain
683annotations as well.
684Defaults to C<tokens> and C<morpho>.
685
Akrondd0be8f2021-02-18 19:29:41 +0100686=item B<--inline-structures> <foundry>#[<file>]
687
688Define the foundry and file (without extension)
689to store inline structure information in.
690Defaults to C<struct> and C<structures>.
691
Akron26a71522021-02-19 10:27:37 +0100692=item B<--base-foundry> <foundry>
693
694Define the base foundry to store newly generated
695token information in.
696Defaults to C<base>.
697
698=item B<--data-file> <file>
699
700Define the file (without extension)
701to store primary data information in.
702Defaults to C<data>.
703
704=item B<--header-file> <file>
705
706Define the file name (without extension)
707to store header information on
708the corpus, document, and text level in.
709Defaults to C<header>.
710
Marc Kupietz985da0c2021-02-15 19:29:50 +0100711=item B<--use-tokenizer-sentence-splits|-s>
712
713Replace existing sentence boundary information with (or add) the sentence boundaries
714provided by the tokenizer (currently only supported for the KorAP tokenizer).
715
Akron91705d72021-02-19 10:59:45 +0100716=item B<--tokens-file> <file>
717
718Define the file (without extension)
719to store generated token information in
720(either from the KorAP tokenizer or an externally called tokenizer).
721Defaults to C<tokens>.
722
Akron3378dfd2020-08-01 15:01:36 +0200723=item B<--log|-l>
724
725Loglevel for I<Log::Any>. Defaults to C<notice>.
726
Akrond949e182020-02-14 12:23:57 +0100727=back
728
Akronb3649472020-09-29 08:24:46 +0200729=head1 ENVIRONMENT VARIABLES
730
731=over 2
732
733=item B<KORAPXMLTEI_DEBUG>
734
735Activate minimal debugging.
736Defaults to C<false>.
737
738=item B<KORAPXMLTEI_INLINE>
739
740Process inline annotations, if present.
741Defaults to C<false>.
742
743=back
744
Akrond949e182020-02-14 12:23:57 +0100745=head1 COPYRIGHT AND LICENSE
746
Marc Kupietze955ecc2021-02-17 17:42:01 +0100747Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100748
749Author: Peter Harders
750
Akronaabd0952020-09-29 07:35:08 +0200751Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100752
753L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
754Corpus Analysis Platform at the
755L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
756member of the
757L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
758
759This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100760L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100761
762=cut
Akronf8088e62021-02-18 16:18:59 +0100763
764# NOTES
765
766## Notes on how 'XML::CompactTree::XS' works
767
768Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
769
770Print out name of 'node2' for the above example:
771
772echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
773
774Exploring the structure of $data ( = reference to below array ):
775
776[ 0: XML_READER_TYPE_DOCUMENT,
777 1: ?
Akron5aca0d22021-02-24 12:09:53 +0100778 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100779 1: 'node'
780 2: ?
781 3: HASH (attributes)
782 4: 1 (line number)
783 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
784 1: 'node1'
785 2: ?
786 3: undefined (no attributes)
787 4: 1 (line number)
788 5: [ 0: [ 0: XML_READER_TYPE_TEXT
789 1: 'some '
790 ]
791 1: [ 0: XML_READER_TYPE_ELEMENT
792 1: 'n'
793 2: ?
794 3: undefined (no attributes)
795 4: 1 (line number)
796 5: undefined (no child-nodes)
797 ]
798 2: [ 0: XML_READER_TYPE_TEXT
799 1: ' text'
800 ]
801 ]
802 ]
803 1: [ 0: XML_READER_TYPE_ELEMENT
804 1: 'node2'
805 2: ?
806 3: undefined (no attributes)
807 4: 1 (line number)
808 5: [ 0: [ 0: XML_READER_TYPE_TEXT
809 1: 'more-text'
810 ]
811 ]
812 ]
813 ]
814 ]
815 ]
816]
817
818$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
819
820ref($data->[2]) == ARRAY (with 1 element for 'node')
821ref($data->[2]->[0]) == ARRAY (with 6 elements)
822
823$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
824$data->[2]->[0]->[1] == 'node'
825ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
826$data->[2]->[0]->[4] == 1 (line number)
827ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron5aca0d22021-02-24 12:09:53 +0100828 # child-nodes of actual node (see $children)
Akronf8088e62021-02-18 16:18:59 +0100829
830ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
831$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
832$data->[2]->[0]->[5]->[0]->[1] == 'node1'
833$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
834$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
835ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
836
837ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
838$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
839$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
840
841ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
842$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
843$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
844$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
845$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
846$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
847
848ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
849$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
850$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
851
852
Akron5aca0d22021-02-24 12:09:53 +0100853descend() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Akronf8088e62021-02-18 16:18:59 +0100854Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
855${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
856
857
858## Notes on whitespace handling
859
860Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
Akron5aca0d22021-02-24 12:09:53 +0100861(see function 'descend()').
Akronf8088e62021-02-18 16:18:59 +0100862
863Definition of significant and insignificant whitespace
864(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
865
866Significant whitespace is part of the document content and should be preserved.
867Insignificant whitespace is used when editing XML documents for readability.
868These whitespaces are typically not intended for inclusion in the delivery of the document.
869
870### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
871
872The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
873 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
874
875When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
876 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
877 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
878
879echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
880
881
882Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
883
884Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
885 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
886
887The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
888 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
889
890The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
891 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
892
893When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
894 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
895 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
896 the last read 'non-tag'-node has to be corrected (see [1]).
897
898For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
899 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
900
901[1]
902Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
903 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akrond53913c2021-02-24 09:50:13 +0100904 (see above code fragment '... not exists $ws{ $from - 1 } ...').
Akronf8088e62021-02-18 16:18:59 +0100905
906[2]
907Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled in both cases as a
908 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
909
910The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
911 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
912
913Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
914 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
915
916
917## Notes on whitespace fixing
918
919The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
920 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
921
922It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
923 example further down and notes on 'Input restrictions' in the manpage).
924
925Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
926
927Examples (how primary text with linebreaks would be converted by below code):
928
929 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
930 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
931
932Blanks are inserted before the 1st character:
933
934 NOTE: not stringent ('...' stands for text):
935
936 beg1............................end1 => no blank before 'beg1'
937 beg2....<pb/>...................end2 => no blank before 'beg2'
938 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
939 beg4....<test>ok</test>.........end4 => blank before 'beg4'
940
941 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
942 ^
943 |_blank between 'end3' and 'beg4'
944
945
946## Notes on segfault prevention
947
Akron91577922021-02-19 10:32:54 +0100948binmode on the input handle prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100949(see notes on 'PerlIO layers' in 'man XML::LibXML').
950Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
951See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
952See in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.