blob: b0318f8afc6c61bb3b41649503ad6499c2a9ae14 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
# Version of the script (reported by --help and --version)
our $VERSION = '1.01';

our $VERSION_MSG = "\n" . "tei2korapxml - v$VERSION" . "\n";

# Minimal additional debug output is activated by the environment
# variable KORAPXMLTEI_DEBUG (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Flags passed to XML::CompactTree::XS when reading the text body.
# XCT_LINE_NUMBERS is only needed for debugging
# (see XML::CompactTree::XS)
use constant XCT_PARAM => (
  XCT_DOCUMENT_ROOT
    | XCT_IGNORE_COMMENTS
    | XCT_ATTRIBUTE_ARRAY
    | ($ENV{KORAPXMLTEI_DEBUG} ? XCT_LINE_NUMBERS : 0)
);
Peter Hardersd892a582020-02-12 15:45:22 +010053
# Parse options from the command line.
# Every option is stored in a lexical variable declared in place, so
# the variables stay visible for the rest of the script.
# GetOptions returns false if the command line could not be processed
# (e.g. unknown option or missing option value). In that case a short
# usage message is printed and the script exits with status 2 instead
# of silently continuing with default values.
GetOptions(
  'root|r=s' => \(my $root_dir = '.'),
  'input|i=s' => \(my $input_fname = ''),
  'tokenizer-call|tc=s' => \(my $tokenizer_call),
  'tokenizer-korap|tk' => \(my $tokenizer_korap),
  'tokenizer-internal|ti' => \(my $tokenizer_intern),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
  'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
  'base-foundry=s' => \(my $base_dir = 'base'),
  'data-file=s' => \(my $data_file = 'data'),
  'header-file=s' => \(my $header_file = 'header'),
  'tokens-file=s' => \(my $tokens_file = 'tokens'),
  'log|l=s' => \(my $log_level = 'notice'),

  # Print the full help text
  'help|h' => sub {
    pod2usage(
      -verbose => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg => $VERSION_MSG,
      -output => '-'
    )
  },

  # Print the version message only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG,
      -output => '-'
    );
  }
) or pod2usage(
  -verbose => 0,
  -exitval => 2
);
86
Akrond3e1d282021-02-24 14:51:27 +010087
# Establish logger: messages are written UTF-8 encoded to STDERR,
# using the log level given on the command line
binmode STDERR, ':encoding(UTF-8)';
Log::Any::Adapter->set('Stderr', log_level => $log_level);

if (DEBUG) {
  $log->notice('Debugging is activated');
};


# Name of the tag (without attributes) that contains the primary text
my $_TEXT_BODY = 'text';

# Name of the tag that opens a header section
# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# Name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';
103
Akrond3e1d282021-02-24 14:51:27 +0100104
# Define tokenizers

# Sentence splits can only be provided by the KorAP tokenizer
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
      '(use -tk to activate it)'
  );
};

# External tokenization
# (an explicit tokenizer call takes precedence over the KorAP tokenizer)
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally at startup,
  # so report a missing module explicitly instead of failing
  # with an obscure error on the constructor call
  unless ('KorAP::XML::TEI::Tokenizer::KorAP'->can('new')) {
    die $log->fatal(
      'KorAP tokenizer (KorAP::XML::TEI::Tokenizer::KorAP) ' .
        'could not be loaded'
    );
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};


# Internal tokenization
# (both tokenizers are only applied if --tokenizer-internal is set)
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akrond3e1d282021-02-24 14:51:27 +0100127
Peter Harders41c35622020-07-12 01:16:22 +0200128
# Name of the directory and the file containing all inline structure informations
# except for $_TOKENS_TAG information.
# The option value has the form '<foundry>#<file>'; the appended
# '#structure' supplies the default file name if no '#' was given.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';

# An empty file part (e.g. 'struct#') falls back to the default as well,
# otherwise the output file would be named '.xml'
$_structure_file = 'structure' unless length($_structure_file // '');

# Name of the directory and the file containing all inline token informations
# i.e. tokens of the $_TOKENS_TAG (written unless --skip-inline-tokens is set)
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';

# Same fallback for an empty file part (e.g. 'tokens#')
$_tokens_file = 'morpho' unless length($_tokens_file // '');

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
139
# Collector for inline tokens and collector for inline structures
my $tokens = KorAP::XML::TEI::Annotations::Collector->new();
my $structures = KorAP::XML::TEI::Annotations::Collector->new();

# Collector for the primary data
my $data = KorAP::XML::TEI::Data->new();

# Zipper writing all output streams (below $root_dir)
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);


# Text directory (below $root_dir);
# an empty string means that no text header was seen yet
my $dir = '';

# Escaped version of the text id,
# element from $tree_data and
# current position in the text
my ($text_id_esc, $e, $pos);

# Encoding of the input; used until the XML declaration
# of the input states something else
my $input_enc = 'UTF-8';

# Variables for handling ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $from = 0;

# Counter for text lines (needed for whitespace handling)
my $text_line = 0;

# Hash for indices of whitespace-nodes
# (needed to recorrect from-values)
# IDEA:
#   when closing an element, check if its from-index minus 1 refers
#   to a whitespace-node (means: 'from-index - 1' is a key in %ws).
#   If this is _not_ the case, then the from-value is one
#   too high => correct it by subtracting 1
my %ws;
Peter Harders6f526a32020-06-29 21:44:41 +0200182
Peter Harders6f526a32020-06-29 21:44:41 +0200183
# Input is read from STDIN, unless an input file was given
my $input_fh = *STDIN;

if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
};

# Read raw bytes (lines are decoded explicitly while reading).
# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
Peter Harders6f526a32020-06-29 21:44:41 +0200195
Peter Harders6f526a32020-06-29 21:44:41 +0200196
# Reading input document line by line.
# Header sections are parsed by KorAP::XML::TEI::Header and written to
# the zip archive immediately. The lines of a text body are collected in
# a buffer, which is parsed as a whole and written to the zip archive
# as soon as the closing text-body tag is found.
MAIN: while (<$input_fh>) {

  # remove XML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding based on the XML declaration.
  # NOTE:
  #   Backreferences are not supported inside of bracketed character
  #   classes, therefore both quote characters are excluded from the value.
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^'"]+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Remember the second capture, because the capture variables
    # are reset by a successful match in the following check
    my $suffix = $2;

    # The opening tag has to be the only content of the line
    if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # Write data.xml, structure.xml and possibly morpho.xml
        # and/or tokenization files

        # The closing tag has to be the only content of the line
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was seen for this text body
        if ($dir eq '') {

          # Show the beginning of the collected text body, as the
          # primary data collector is not filled at this point
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($text_buffer, 0, 200)
          );
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$text_buffer</text>",
          huge => 1
        );

        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, XCT_PARAM);

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # Recursively parse all children
        descend(1, $tree_data->[2]);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          # Add sentence boundaries of the tokenizer to the structures
          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          )->reset;
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          )->reset;
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };
  }

  # Start of header section
  elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    my $content = "$2\n";

    # The opening header tag must not be preceded by other content
    if ($1 !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};
Peter Hardersd892a582020-02-12 15:45:22 +0100415
# Finish the zip output
$zipper->close;

# Shut down the external tokenizer, if one was set up
if ($ext_tok) {
  $ext_tok->close;
};

close $input_fh;

exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100423
Peter Hardersd892a582020-02-12 15:45:22 +0100424
# Recursively called function to handle XML tree data.
#
# Iterates over a list of nodes created by XML::CompactTree::XS and
#   - appends text and whitespace nodes to the primary data ($data),
#   - adds every element as an annotation to $structures, including
#     its from- and to-offsets and its level,
#   - adds elements named $_TOKENS_TAG to $tokens as well
#     (unless inline tokens are skipped).
#
# Arguments:
#   1. recursion level
#      (1 = topmost level inside descend() = should always be
#      the level of tag $_TEXT_BODY)
#   2. array reference of the nodes to process
#      (see notes on how 'XML::CompactTree::XS' works and
#      'NODE TYPES' in the manpage of XML::LibXML::Reader)
sub descend {
  my ($depth, $nodes) = @_;

  foreach my $node (@{$nodes}) {

    my $type = $node->[0];

    # $node->[1] represents the tag name of an element node
    # or the primary data of a text or ws node
    my $node_info = $node->[1];

    # Everything but element nodes
    if ($type != XML_READER_TYPE_ELEMENT) {

      # Text node
      if ($type == XML_READER_TYPE_TEXT) {
        $add_one = 1;
        $data->append($node_info);
      }

      # Whitespace node
      # (See notes on whitespace handling -
      # regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
      elsif ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

        # State, that this from-index belongs to a whitespace-node
        # (the counter value itself is not evaluated anywhere)
        $ws{$data->position}++;

        $add_one = 0;
        $data->append($node_info);
      }

      # Not yet handled type
      else {
        die $log->fatal('Not yet handled type ($e->[0]=' . $type . ') ... => Aborting');
      };

      next;
    };


    # Element node - deal with opening tag

    # The index of the child list depends on the debug state
    # (line numbers are only part of the node while debugging)
    my $children = $node->[DEBUG ? 5 : 4];

    # Sentences are not annotated, if the sentence splits of the
    # tokenizer are used - only their children are processed
    if ($use_tokenizer_sentence_splits && $node_info eq 's') {
      descend($depth + 1, $children) if defined $children;
      next;
    };

    my $anno = $structures->add_new_annotation($node_info);

    # Add element also to token list
    $tokens->add_annotation($anno)
      unless $skip_inline_tokens || $node_info ne $_TOKENS_TAG;

    # Handle attributes (if attributes exist).
    # With 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the
    # form [ name1, value1, name2, value2, ....]
    if (defined(my $attrs = $node->[3])) {
      for (my $i = 0; $i < @{$attrs}; $i += 2) {
        $anno->add_attribute($attrs->[$i], $attrs->[$i + 1]);
      };
    };

    # This is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
    $anno->set_from($data->position + $add_one);

    # No recursion without an array of child-nodes (e.g.: <back/>)
    descend($depth + 1, $children) if defined $children;


    # Element node - deal with closing tag

    # NOTE:
    #   use $pos, because the offsets are _between_ the characters
    #   (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
    my $pos = $data->position;

    $from = $anno->from;

    # ~ whitespace related issue ~
    # No whitespace-node is registered directly before the from-index
    # (previous node was a text-node) => from-value is one too high
    $anno->set_from($from - 1) if $from > 0 && !exists $ws{$from - 1};

    # In case this fails, check input
    if (($from - 1) > $pos) {
      die $log->fatal(
        "text_id='$text_id_esc', " .
          'processing of structures: ' .
          "from-value ($from) is 2 or more greater " .
          "than to-value ($pos) => please check. Aborting"
      );
    };

    # TODO:
    #   find example for which this case applies
    #   maybe this is not necessary anymore, because the
    #   above recorrection of the from-value suffices
    #
    # TODO:
    #   check, if it's better to remove this line and
    #   change above check to 'if ($from - 1) >= $pos;
    #   do testing with bigger corpus excerpt (wikipedia?)
    $anno->set_from($pos) if $from == $pos + 1;
    $anno->set_to($pos);
    $anno->set_level($depth);

    # Clean up whitespace
    # (deleting a key that does not exist has no effect)
    delete $ws{$from - 1} if $from > 0;
  };
};
559
Peter Harders6f526a32020-06-29 21:44:41 +0200560
Akrond949e182020-02-14 12:23:57 +0100561__END__
562
563=pod
564
565=encoding utf8
566
567=head1 NAME
568
569tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
570
571=head1 SYNOPSIS
572
573 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
574
575=head1 DESCRIPTION
576
Akronee434b12020-07-08 12:53:01 +0200577C<tei2korapxml> is a script to convert TEI P5 and
578L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
579based documents to the
580L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
581If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100582read from C<STDIN>. If no specific output is defined, data is written
583to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200584
Akrond949e182020-02-14 12:23:57 +0100585This program is usually called from inside another script.
586
Akronee434b12020-07-08 12:53:01 +0200587=head1 FORMATS
588
589=head2 Input restrictions
590
591=over 2
592
593=item
594
Akronee434b12020-07-08 12:53:01 +0200595TEI P5 formatted input with certain restrictions:
596
597=over 4
598
599=item
600
601B<mandatory>: text-header with integrated textsigle, text-body
602
603=item
604
605B<optional>: corp-header with integrated corpsigle,
606doc-header with integrated docsigle
607
608=back
609
610=item
611
Tokens inside the primary text must not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~>).
619
620=back
621
622=head2 Notes on the output
623
624=over 2
625
626=item
627
628zip file output (default on C<stdout>) with utf8 encoded entries
629(which together form the KorAP-XML format)
630
631=back
632
Akrond949e182020-02-14 12:23:57 +0100633=head1 INSTALLATION
634
C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
636these bindings are available, the preferred way to install the script is
637to use L<cpanm|App::cpanminus>.
638
639 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
640
641In case everything went well, the C<tei2korapxml> tool will
642be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200643
Akrond949e182020-02-14 12:23:57 +0100644Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
645
646=head1 OPTIONS
647
648=over 2
649
Akron4e603a52020-07-27 14:23:49 +0200650=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100651
Akron4e603a52020-07-27 14:23:49 +0200652The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100653
654=item B<--help|-h>
655
656Print help information.
657
658=item B<--version|-v>
659
660Print version information.
661
Akron4e603a52020-07-27 14:23:49 +0200662=item B<--tokenizer-call|-tc>
663
Call an external tokenizer process that tokenizes
665a single line from STDIN and outputs one token per line.
666
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200667=item B<--tokenizer-korap|-tk>
668
669Use the standard KorAP/DeReKo tokenizer.
670
Akron6d7b8e42020-09-29 07:37:41 +0200671=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200672
673Tokenize the data using two embedded tokenizers,
that will take an I<aggressive> and a I<conservative>
675approach.
676
Akron75d63142021-02-23 18:40:56 +0100677=item B<--skip-inline-tokens>
678
679Boolean flag indicating that inline tokens should not
680be processed. Defaults to false (meaning inline tokens will be processed).
681
Akron1a5271a2021-02-18 13:18:15 +0100682=item B<--inline-tokens> <foundry>#[<file>]
683
684Define the foundry and file (without extension)
685to store inline token information in.
686If L</KORAPXMLTEI_INLINE> is set, this will contain
687annotations as well.
688Defaults to C<tokens> and C<morpho>.
689
Akrondd0be8f2021-02-18 19:29:41 +0100690=item B<--inline-structures> <foundry>#[<file>]
691
692Define the foundry and file (without extension)
693to store inline structure information in.
694Defaults to C<struct> and C<structures>.
695
Akron26a71522021-02-19 10:27:37 +0100696=item B<--base-foundry> <foundry>
697
698Define the base foundry to store newly generated
699token information in.
700Defaults to C<base>.
701
702=item B<--data-file> <file>
703
704Define the file (without extension)
705to store primary data information in.
706Defaults to C<data>.
707
708=item B<--header-file> <file>
709
710Define the file name (without extension)
711to store header information on
712the corpus, document, and text level in.
713Defaults to C<header>.
714
Marc Kupietz985da0c2021-02-15 19:29:50 +0100715=item B<--use-tokenizer-sentence-splits|-s>
716
717Replace existing sentence boundary information with, or add new, sentence boundary
718information provided by the tokenizer (currently only supported for the KorAP tokenizer).
719
Akron91705d72021-02-19 10:59:45 +0100720=item B<--tokens-file> <file>
721
722Define the file (without extension)
723to store generated token information in
724(either from the KorAP tokenizer or an externally called tokenizer).
725Defaults to C<tokens>.
726
Akron3378dfd2020-08-01 15:01:36 +0200727=item B<--log|-l>
728
729Loglevel for I<Log::Any>. Defaults to C<notice>.
730
Akrond949e182020-02-14 12:23:57 +0100731=back
732
Akronb3649472020-09-29 08:24:46 +0200733=head1 ENVIRONMENT VARIABLES
734
735=over 2
736
737=item B<KORAPXMLTEI_DEBUG>
738
739Activate minimal debugging.
740Defaults to C<false>.
741
742=item B<KORAPXMLTEI_INLINE>
743
744Process inline annotations, if present.
745Defaults to C<false>.
746
747=back
748
Akrond949e182020-02-14 12:23:57 +0100749=head1 COPYRIGHT AND LICENSE
750
Marc Kupietze955ecc2021-02-17 17:42:01 +0100751Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100752
753Author: Peter Harders
754
Akronaabd0952020-09-29 07:35:08 +0200755Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100756
757L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
758Corpus Analysis Platform at the
759L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
760member of the
761L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
762
763This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100764L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100765
766=cut
Akronf8088e62021-02-18 16:18:59 +0100767
768# NOTES
769
770## Notes on how 'XML::CompactTree::XS' works
771
772Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
773
774Print out name of 'node2' for the above example:
775
776echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
777
778Exploring the structure of $data ( = reference to below array ):
779
780[ 0: XML_READER_TYPE_DOCUMENT,
781 1: ?
Akron5aca0d22021-02-24 12:09:53 +0100782 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100783 1: 'node'
784 2: ?
785 3: HASH (attributes)
786 4: 1 (line number)
787 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
788 1: 'node1'
789 2: ?
790 3: undefined (no attributes)
791 4: 1 (line number)
792 5: [ 0: [ 0: XML_READER_TYPE_TEXT
793 1: 'some '
794 ]
795 1: [ 0: XML_READER_TYPE_ELEMENT
796 1: 'n'
797 2: ?
798 3: undefined (no attributes)
799 4: 1 (line number)
800 5: undefined (no child-nodes)
801 ]
802 2: [ 0: XML_READER_TYPE_TEXT
803 1: ' text'
804 ]
805 ]
806 ]
807 1: [ 0: XML_READER_TYPE_ELEMENT
808 1: 'node2'
809 2: ?
810 3: undefined (no attributes)
811 4: 1 (line number)
812 5: [ 0: [ 0: XML_READER_TYPE_TEXT
813 1: 'more-text'
814 ]
815 ]
816 ]
817 ]
818 ]
819 ]
820]
821
822$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
823
824ref($data->[2]) == ARRAY (with 1 element for 'node')
825ref($data->[2]->[0]) == ARRAY (with 6 elements)
826
827$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
828$data->[2]->[0]->[1] == 'node'
829ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
830$data->[2]->[0]->[4] == 1 (line number)
831ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron5aca0d22021-02-24 12:09:53 +0100832 # child-nodes of actual node (see $children)
Akronf8088e62021-02-18 16:18:59 +0100833
834ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
835$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
836$data->[2]->[0]->[5]->[0]->[1] == 'node1'
837$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
838$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
839ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
840
841ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
842$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
843$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
844
845ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
846$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
847$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
848$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
849$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
850$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
851
852ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
853$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
854$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
855
856
Akron5aca0d22021-02-24 12:09:53 +0100857descend() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Akronf8088e62021-02-18 16:18:59 +0100858Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
859${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
860
861
862## Notes on whitespace handling
863
864Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
Akron5aca0d22021-02-24 12:09:53 +0100865(see function 'descend()').
Akronf8088e62021-02-18 16:18:59 +0100866
867Definition of significant and insignificant whitespace
868(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
869
870Significant whitespace is part of the document content and should be preserved.
871Insignificant whitespace is used when editing XML documents for readability.
872These whitespaces are typically not intended for inclusion in the delivery of the document.
873
874### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
875
876The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
877 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
878
879When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
880 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
881 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
882
883echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
884
885
886Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
887
888Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
889 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
890
891The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
892 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
893
894The assumption here is that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
895 enables a way to check whether this really _was_ the case for the last 2 'non-tag'-nodes when closing a tag:
896
897When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
898 So when closing a tag, it can be checked whether the previous 'non-tag'-node (text or whitespace), which is the one before
899 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
900 the last read 'non-tag'-node has to be corrected (see [1]).
901
902For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
903 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
904
905[1]
906Now, what happens when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
907 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akrond53913c2021-02-24 09:50:13 +0100908 (see above code fragment '... not exists $ws{ $from - 1 } ...').
Akronf8088e62021-02-18 16:18:59 +0100909
910[2]
911When comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled in both cases as a
912 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
913
914The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
915 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
916
917Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>', the from-
918 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
919
920
921## Notes on whitespace fixing
922
923The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
924 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
925
926It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
927 example further down and notes on 'Input restrictions' in the manpage).
928
929Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
930
931Examples (how primary text with linebreaks would be converted by below code):
932
933 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
934 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
935
936Blanks are inserted before the 1st character:
937
938 NOTE: not stringent ('...' stands for text):
939
940 beg1............................end1 => no blank before 'beg1'
941 beg2....<pb/>...................end2 => no blank before 'beg2'
942 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
943 beg4....<test>ok</test>.........end4 => blank before 'beg4'
944
945 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
946 ^
947 |_blank between 'end3' and 'beg4'
948
949
950## Notes on segfault prevention
951
Akron91577922021-02-19 10:32:54 +0100952binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100953(see notes on 'PerlIO layers' in 'man XML::LibXML'),
954removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
955see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
956see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.