blob: af1bc059111cbdaee2c511a2d351d2eb75462fa7 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
our $VERSION = '1.01';

our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Debug switch, taken from the environment variable KORAPXMLTEI_DEBUG
# (a true value enables some additional debug output; deliberately
# not exposed as a command line option)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Flags handed to XML::CompactTree::XS when the text body is read.
# Line numbers are only requested while debugging.
use constant XCT_PARAM => (
  XCT_DOCUMENT_ROOT
    | XCT_IGNORE_COMMENTS
    | XCT_ATTRIBUTE_ARRAY
    | (DEBUG ? XCT_LINE_NUMBERS : 0)
);
Peter Hardersd892a582020-02-12 15:45:22 +010053
# Command line options with their defaults
my $root_dir             = '.';
my $input_fname          = '';
my $tokenizer_call;
my $tokenizer_korap;
my $tokenizer_intern;
my $use_tokenizer_sentence_splits;
my $inline_tokens        = 'tokens#morpho';
my $inline_structures    = 'struct#structure';
my $skip_inline_tokens   = 0;
my $skip_inline_tags_str = '';
my $base_dir             = 'base';
my $data_file            = 'data';
my $header_file          = 'header';
my $tokens_file          = 'tokens';
my $log_level            = 'notice';

# Parse the command line.
# --help and --version print to STDOUT via pod2usage.
GetOptions(
  'root|r=s'                        => \$root_dir,
  'input|i=s'                       => \$input_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'skip-inline-tags=s'              => \$skip_inline_tags_str,
  'base-foundry=s'                  => \$base_dir,
  'data-file=s'                     => \$data_file,
  'header-file=s'                   => \$header_file,
  'tokens-file=s'                   => \$tokens_file,
  'log|l=s'                         => \$log_level,
  'help|h'                          => sub {
    pod2usage(
      -msg      => $VERSION_MSG,
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -output   => '-'
    );
  },
  'version|v'                       => sub {
    pod2usage(
      -msg     => $VERSION_MSG,
      -verbose => 0,
      -output  => '-'
    );
  }
);
87
Akrond3e1d282021-02-24 14:51:27 +010088
# Set up logging: messages go to STDERR (UTF-8 encoded)
# at the level requested on the command line
binmode STDERR, ':encoding(UTF-8)';
Log::Any::Adapter->set('Stderr', log_level => $log_level);

if (DEBUG) {
  $log->notice('Debugging is activated');
}
93
Akrond3e1d282021-02-24 14:51:27 +010094
# Element names the converter looks for:
#   $_TEXT_BODY  - tag (without attributes) which contains the primary text
#   $_HEADER_TAG - header tag (TODO: IDS-specific (and redundant))
#   $_TOKENS_TAG - tag containing all information stored in $_tokens_file
my ($_TEXT_BODY, $_HEADER_TAG, $_TOKENS_TAG) = qw(text idsHeader w);
104
Akrond3e1d282021-02-24 14:51:27 +0100105
# Define tokenizers

# Sentence splitting is only available with the KorAP tokenizer
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
};

# Remember to skip certain inline tags (their content is still processed).
# The option value is a comma-separated list; whitespace around the
# entries and empty entries are ignored.
my %skip_inline_tags = ();
foreach my $tag (split /,/, $skip_inline_tags_str) {
  $tag =~ s/^\s+|\s+$//g;
  $skip_inline_tags{$tag} = 1 if length $tag;
};

# External tokenization
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer is an optional dependency that is loaded with a
  # guarded require - fail with a clear message, if it is not available
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal(
      'KorAP tokenizer requested (-tk), but ' .
      'KorAP::XML::TEI::Tokenizer::KorAP could not be loaded'
    );
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);

  # Sentence boundaries are taken from the tokenizer,
  # so inline <s> elements are skipped
  if ($use_tokenizer_sentence_splits) {
    $skip_inline_tags{s} = 1;
  };
};


# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akrond3e1d282021-02-24 14:51:27 +0100139
Peter Harders41c35622020-07-12 01:16:22 +0200140
# Name of the directory and the file containing all inline structure information
# except for $_TOKENS_TAG information.
# The option has the form '<foundry>#[<file>]': a missing or empty
# file part falls back to 'structure'.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
$_structure_file = 'structure' if $_structure_file eq '';

# Name of the directory and the file containing all inline token information,
# i.e. tokens of the $_TOKENS_TAG (written unless $skip_inline_tokens is set).
# A missing or empty file part falls back to 'morpho'.
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
$_tokens_file = 'morpho' if $_tokens_file eq '';

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;

# Initialize Token- and Structure-Collector
my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
Akron09e0b2c2020-07-28 15:57:01 +0200161
Peter Harders6f526a32020-06-29 21:44:41 +0200162
# Encoding of the input; replaced as soon as an XML declaration
# with an encoding attribute is read
my $input_enc = 'UTF-8';

# Directory of the current text (below $root_dir);
# empty as long as no text header was read
my $dir = '';

# Escaped version of the id of the current text
my $text_id_esc;

# Element from $tree_data (loop variable of descend())
my $e;

# Counter of the lines of the current text (needed for whitespace handling)
my $text_line = 0;

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values of some tags)
my $add_one;

# Indices of whitespace nodes (needed to correct from-values).
# IDEA:
#   When closing an element, check if its from-index minus 1 refers to
#   a whitespace node (i.e. 'from-index - 1' is a key in %ws).
#   If this is _not_ the case, the from-value is one too high
#   => correct it by subtracting 1
my %ws;
Peter Harders6f526a32020-06-29 21:44:41 +0200190
Peter Harders6f526a32020-06-29 21:44:41 +0200191
# Read from STDIN, unless an input file was passed
my $input_fh = *STDIN;

if (length $input_fname) {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
Peter Harders6f526a32020-06-29 21:44:41 +0200203
Peter Harders6f526a32020-06-29 21:44:41 +0200204
# Read the input document line by line.
# Headers are written to the zip archive as soon as they are parsed; a text
# body is buffered up to its closing tag and then converted into the data,
# structure, token and tokenization files of the current text.
MAIN: while (<$input_fh>) {

  # Remove (possibly multi-line) XML comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding from the XML declaration.
  # NOTE: a backreference is not available inside a bracketed character
  # class, so the value is matched as a run of non-quote characters.
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^'"]+)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Copy the captures, as they do not survive the next successful match
    my ($prefix, $suffix) = ($1, $2);

    # The opening tag has to stand on a line of its own
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if ((my $pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # Write data.xml, structure.xml and possibly morpho.xml
        # and/or tokenization files

        # The closing tag has to stand on a line of its own
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was seen for this body
        # NOTE(review): descend() has not run at this point, so the data
        # collector presumably holds no data of this text yet - confirm
        if ($dir eq '') {
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($data->data, 0, 200)
          );
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$text_buffer</text>",
          huge => 1
        );

        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, XCT_PARAM);

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # Recursively parse all children
        descend(1, $tree_data->[2]);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          )->reset;
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          )->reset;
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };

    # The inner loop is only left regularly at the end of the input,
    # i.e. the text body was never closed and its content is dropped
    $log->warn(
      "Unexpected end of input inside of text body " .
        "=> skipping text (dir='$dir')"
    );
  }

  # Start of header section
  elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # Copy the captures, as they do not survive the next successful match
    my ($prefix, $content) = ($1, "$2\n");

    # Nothing but whitespace may precede the header tag
    if ($prefix !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};
Peter Hardersd892a582020-02-12 15:45:22 +0100423
# Finish the zip archive, close the external tokenizer (if one was set up)
# and the input handle, then leave successfully
$zipper->close;

if ($ext_tok) {
  $ext_tok->close;
}

close $input_fh;

exit 0;
Peter Hardersd892a582020-02-12 15:45:22 +0100431
Peter Hardersd892a582020-02-12 15:45:22 +0100432
# Recursively walk the XML tree data of a text body.
#
# Arguments:
#   1. recursion level (1 = topmost level, i.e. the children of the
#      $_TEXT_BODY element)
#   2. array reference of child nodes, as created by XML::CompactTree::XS
#      (see also 'NODE TYPES' in the manpage of XML::LibXML::Reader)
#
# Text and whitespace nodes are appended to the data collector, element
# nodes are registered as annotations in the structure collector (and,
# for $_TOKENS_TAG, in the token collector as well).
# Dies on node types that are not handled.
sub descend {
  my ($depth, $nodes) = @_;

  foreach my $node (@$nodes) {

    my $type = $node->[0];

    # Tag name of an element node
    # or primary data of a text or whitespace node
    my $node_info = $node->[1];

    # Text node
    if ($type == XML_READER_TYPE_TEXT) {
      $add_one = 1;
      $data->append($node_info);
      next;
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    if ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # Remember that this index belongs to a whitespace node
      # (the count itself is not used)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node_info);
      next;
    }

    # Everything else has to be an element node
    if ($type != XML_READER_TYPE_ELEMENT) {
      die $log->fatal('Not yet handled type ($e->[0]=' . $node->[0] . ') ... => Aborting');
    }

    # --- Opening tag ---

    # The index of the child nodes depends on the debug state
    # (with line numbers requested, the children move one slot up)
    my $children = $node->[DEBUG ? 5 : 4];

    # Skipped tags are not registered, but their content is processed
    if ($skip_inline_tags{$node_info}) {
      descend($depth + 1, $children) if defined $children;
      next;
    }

    my $anno = $structures->add_new_annotation($node_info);

    # Token elements are also added to the token list
    $tokens->add_annotation($anno)
      if !$skip_inline_tokens && $node_info eq $_TOKENS_TAG;

    # With 'XCT_ATTRIBUTE_ARRAY' the attributes are a flat array reference
    # of the form [name1, value1, name2, value2, ...]
    if (defined(my $attrs = $node->[3])) {
      for (my $i = 0; $i < @$attrs; $i += 2) {
        $anno->add_attribute(@{$attrs}[$i, $i + 1]);
      }
    }

    # This is where a normal tag or tokens-tag ($_TOKENS_TAG) starts
    $anno->set_from($data->position + $add_one);

    # Descend, unless there is no array of child nodes (e.g. <back/>)
    descend($depth + 1, $children) if defined $children;

    # --- Closing tag ---

    # NOTE:
    #   offsets are _between_ the characters
    #   (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
    my $pos = $data->position;
    my $from = $anno->from;

    # ~ whitespace related issue ~
    # Is the position in front of the from-value a whitespace node?
    my $ws_before = $from > 0 && exists $ws{$from - 1};

    # If not, the previous node was a text node
    # and the from-value is one too high
    $anno->set_from($from - 1) if $from > 0 && !$ws_before;

    # In case this fails, check input
    if (($from - 1) > $pos) {
      die $log->fatal(
        "text_id='$text_id_esc', " .
          'processing of structures: ' .
          "from-value ($from) is 2 or more greater " .
          "than to-value ($pos) => please check. Aborting"
      );
    }

    # TODO:
    #   find an example for which this case applies -
    #   maybe this is not necessary anymore, because the
    #   above correction of the from-value suffices
    #
    # TODO:
    #   check, if it's better to remove this line and
    #   change the above check to 'if ($from - 1) >= $pos;
    #   do testing with bigger corpus excerpt (wikipedia?)
    $anno->set_from($pos) if $from == $pos + 1;
    $anno->set_to($pos);
    $anno->set_level($depth);

    # Clean up whitespace
    delete $ws{$from - 1} if $ws_before;
  }
};
567
Peter Harders6f526a32020-06-29 21:44:41 +0200568
Akrond949e182020-02-14 12:23:57 +0100569__END__
570
571=pod
572
573=encoding utf8
574
575=head1 NAME
576
577tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
578
579=head1 SYNOPSIS
580
581 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
582
583=head1 DESCRIPTION
584
Akronee434b12020-07-08 12:53:01 +0200585C<tei2korapxml> is a script to convert TEI P5 and
586L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
587based documents to the
588L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
589If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100590read from C<STDIN>. If no specific output is defined, data is written
591to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200592
Akrond949e182020-02-14 12:23:57 +0100593This program is usually called from inside another script.
594
Akronee434b12020-07-08 12:53:01 +0200595=head1 FORMATS
596
597=head2 Input restrictions
598
599=over 2
600
601=item
602
Akronee434b12020-07-08 12:53:01 +0200603TEI P5 formatted input with certain restrictions:
604
605=over 4
606
607=item
608
609B<mandatory>: text-header with integrated textsigle, text-body
610
611=item
612
613B<optional>: corp-header with integrated corpsigle,
614doc-header with integrated docsigle
615
616=back
617
618=item
619
Tokens inside the primary text must not be separated by
newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between two tokens could lead to additional blanks
where there should be none (e.g. punctuation characters like C<,> or
C<.> should not be separated from their predecessor token;
see also the code section C<~ whitespace handling ~>).
627
628=back
629
630=head2 Notes on the output
631
632=over 2
633
634=item
635
636zip file output (default on C<stdout>) with utf8 encoded entries
637(which together form the KorAP-XML format)
638
639=back
640
Akrond949e182020-02-14 12:23:57 +0100641=head1 INSTALLATION
642
C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
644these bindings are available, the preferred way to install the script is
645to use L<cpanm|App::cpanminus>.
646
647 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
648
649In case everything went well, the C<tei2korapxml> tool will
650be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200651
Akrond949e182020-02-14 12:23:57 +0100652Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
653
654=head1 OPTIONS
655
656=over 2
657
Akron4e603a52020-07-27 14:23:49 +0200658=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100659
Akron4e603a52020-07-27 14:23:49 +0200660The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100661
662=item B<--help|-h>
663
664Print help information.
665
666=item B<--version|-v>
667
668Print version information.
669
Akron4e603a52020-07-27 14:23:49 +0200670=item B<--tokenizer-call|-tc>
671
Call an external tokenizer process that will tokenize
a single line from STDIN and output one token per line.
674
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200675=item B<--tokenizer-korap|-tk>
676
677Use the standard KorAP/DeReKo tokenizer.
678
Akron6d7b8e42020-09-29 07:37:41 +0200679=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200680
Tokenize the data using two embedded tokenizers
that take an I<aggressive> and a I<conservative>
approach.
684
Akron75d63142021-02-23 18:40:56 +0100685=item B<--skip-inline-tokens>
686
687Boolean flag indicating that inline tokens should not
688be processed. Defaults to false (meaning inline tokens will be processed).
689
Akron54c3ff12021-02-25 11:33:37 +0100690=item B<--skip-inline-tags>
691
692Expects a comma-separated list of tags to be ignored when the structure
693is parsed. Content of these tags however will be processed.
694
Akron1a5271a2021-02-18 13:18:15 +0100695=item B<--inline-tokens> <foundry>#[<file>]
696
697Define the foundry and file (without extension)
698to store inline token information in.
699If L</KORAPXMLTEI_INLINE> is set, this will contain
700annotations as well.
701Defaults to C<tokens> and C<morpho>.
702
Akrondd0be8f2021-02-18 19:29:41 +0100703=item B<--inline-structures> <foundry>#[<file>]
704
705Define the foundry and file (without extension)
706to store inline structure information in.
707Defaults to C<struct> and C<structures>.
708
Akron26a71522021-02-19 10:27:37 +0100709=item B<--base-foundry> <foundry>
710
711Define the base foundry to store newly generated
712token information in.
713Defaults to C<base>.
714
715=item B<--data-file> <file>
716
717Define the file (without extension)
718to store primary data information in.
719Defaults to C<data>.
720
721=item B<--header-file> <file>
722
723Define the file name (without extension)
724to store header information on
725the corpus, document, and text level in.
726Defaults to C<header>.
727
Marc Kupietz985da0c2021-02-15 19:29:50 +0100728=item B<--use-tokenizer-sentence-splits|-s>
729
730Use the sentence boundary information provided by the tokenizer to replace
731existing sentence boundaries or to add new ones (currently only supported for the KorAP tokenizer).
732
Akron91705d72021-02-19 10:59:45 +0100733=item B<--tokens-file> <file>
734
735Define the file (without extension)
736to store generated token information in
737(either from the KorAP tokenizer or an externally called tokenizer).
738Defaults to C<tokens>.
739
Akron3378dfd2020-08-01 15:01:36 +0200740=item B<--log|-l>
741
742Loglevel for I<Log::Any>. Defaults to C<notice>.
743
Akrond949e182020-02-14 12:23:57 +0100744=back
745
Akronb3649472020-09-29 08:24:46 +0200746=head1 ENVIRONMENT VARIABLES
747
748=over 2
749
750=item B<KORAPXMLTEI_DEBUG>
751
752Activate minimal debugging.
753Defaults to C<false>.
754
755=item B<KORAPXMLTEI_INLINE>
756
757Process inline annotations, if present.
758Defaults to C<false>.
759
760=back
761
Akrond949e182020-02-14 12:23:57 +0100762=head1 COPYRIGHT AND LICENSE
763
Marc Kupietze955ecc2021-02-17 17:42:01 +0100764Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100765
766Author: Peter Harders
767
Akronaabd0952020-09-29 07:35:08 +0200768Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100769
770L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
771Corpus Analysis Platform at the
772L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
773member of the
774L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
775
776This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100777L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100778
779=cut
Akronf8088e62021-02-18 16:18:59 +0100780
781# NOTES
782
783## Notes on how 'XML::CompactTree::XS' works
784
785Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
786
787Print out name of 'node2' for the above example:
788
789echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
790
791Exploring the structure of $data ( = reference to below array ):
792
793[ 0: XML_READER_TYPE_DOCUMENT,
794 1: ?
Akron5aca0d22021-02-24 12:09:53 +0100795 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100796 1: 'node'
797 2: ?
798 3: HASH (attributes)
799 4: 1 (line number)
800 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
801 1: 'node1'
802 2: ?
803 3: undefined (no attributes)
804 4: 1 (line number)
805 5: [ 0: [ 0: XML_READER_TYPE_TEXT
806 1: 'some '
807 ]
808 1: [ 0: XML_READER_TYPE_ELEMENT
809 1: 'n'
810 2: ?
811 3: undefined (no attributes)
812 4: 1 (line number)
813 5: undefined (no child-nodes)
814 ]
815 2: [ 0: XML_READER_TYPE_TEXT
816 1: ' text'
817 ]
818 ]
819 ]
820 1: [ 0: XML_READER_TYPE_ELEMENT
821 1: 'node2'
822 2: ?
823 3: undefined (no attributes)
824 4: 1 (line number)
825 5: [ 0: [ 0: XML_READER_TYPE_TEXT
826 1: 'more-text'
827 ]
828 ]
829 ]
830 ]
831 ]
832 ]
833]
834
835$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
836
837ref($data->[2]) == ARRAY (with 1 element for 'node')
838ref($data->[2]->[0]) == ARRAY (with 6 elements)
839
840$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
841$data->[2]->[0]->[1] == 'node'
842ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
843$data->[2]->[0]->[4] == 1 (line number)
844ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron5aca0d22021-02-24 12:09:53 +0100845 # child-nodes of actual node (see $children)
Akronf8088e62021-02-18 16:18:59 +0100846
847ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
848$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
849$data->[2]->[0]->[5]->[0]->[1] == 'node1'
850$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
851$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
852ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
853
854ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
855$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
856$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
857
858ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
859$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
860$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
861$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
862$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
863$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
864
865ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
866$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
867$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
868
869
Akron5aca0d22021-02-24 12:09:53 +0100870descend() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Akronf8088e62021-02-18 16:18:59 +0100871Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
872${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
873
874
875## Notes on whitespace handling
876
877Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
Akron5aca0d22021-02-24 12:09:53 +0100878(see function 'descend()').
Akronf8088e62021-02-18 16:18:59 +0100879
880Definition of significant and insignificant whitespace
881(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
882
883Significant whitespace is part of the document content and should be preserved.
884Insignificant whitespace is used when editing XML documents for readability.
885These whitespaces are typically not intended for inclusion in the delivery of the document.
886
887### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
888
889The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
890 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
891
892When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
893 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
894 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
895
896echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
897
898
899Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
900
901Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
902 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
903
904The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
905 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
906
907The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
908 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
909
910When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
911 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
912 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
913 the last read 'non-tag'-node has to be corrected (see [1]).
914
915For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
916 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
917
918[1]
919Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
920 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akrond53913c2021-02-24 09:50:13 +0100921 (see above code fragment '... not exists $ws{ $from - 1 } ...').
Akronf8088e62021-02-18 16:18:59 +0100922
923[2]
924Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', the ' ' is handled in both cases as a
925 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
926
927The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
928 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
929
930Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
931 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
932
933
934## Notes on whitespace fixing
935
936The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
937 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
938
939It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
940 example further down and notes on 'Input restrictions' in the manpage).
941
942Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
943
944Examples (how primary text with linebreaks would be converted by below code):
945
946 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
947 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
948
949Blanks are inserted before the 1st character:
950
951 NOTE: not stringent ('...' stands for text):
952
953 beg1............................end1 => no blank before 'beg1'
954 beg2....<pb/>...................end2 => no blank before 'beg2'
955 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
956 beg4....<test>ok</test>.........end4 => blank before 'beg4'
957
958 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
959 ^
960 |_blank between 'end3' and 'beg4'
961
962
963## Notes on segfault prevention
964
Akron91577922021-02-19 10:32:54 +0100965binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100966(see notes on 'PerlIO layers' in 'man XML::LibXML').
967Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
968see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
969see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.