#!/usr/bin/env perl
use strict;
use warnings;

use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);

use File::Basename qw(dirname);

use Encode qw(decode);

use XML::CompactTree::XS;
use XML::LibXML::Reader;

# Prepend the 'lib' directory next to the script's directory to @INC,
# before the KorAP::XML::TEI modules below are loaded
use FindBin;
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Annotations::Collector;
use KorAP::XML::TEI::Data;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;

# The KorAP tokenizer is loaded on a best-effort basis:
# a failing 'require' is swallowed by the eval and not reported here.
eval {
  require KorAP::XML::TEI::Tokenizer::KorAP;
  1;
};

our $VERSION = '1.00';

# Message shown by --help and --version
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set the environment variable KORAPXMLTEI_DEBUG to 1 for minimal
# more debug output (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Parse options from the command line.
# All option variables are declared here and used throughout the script.
# NOTE(review): the return value of GetOptions is not checked, so a failed
# option parse does not stop the script - confirm that this is intended.
GetOptions(
  "root|r=s"  => \(my $_root_dir = '.'),   # name of root directory inside zip file
  "input|i=s" => \(my $input_fname = ''),  # input file (yet only TEI I5 Format accepted)
  'tokenizer-call|tc=s'   => \(my $tokenizer_call),   # Temporary argument for testing purposes
  'tokenizer-korap|tk'    => \(my $tokenizer_korap),  # use KorAP-tokenizer
  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT),     # use intern tokenization (default = no)
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
  'inline-tokens=s'     => \(my $inline_tokens = 'tokens#morpho'),        # <foundry>#<file> for inline tokens
  'inline-structures=s' => \(my $inline_structures = 'struct#structure'), # <foundry>#<file> for inline structures
  'base-foundry=s' => \(my $_tok_dir = 'base'),         # foundry for newly generated tokenizations
  'data-file=s'    => \(my $_data_file = 'data'),       # file name (without extension) for primary data
  'header-file=s'  => \(my $_header_file = 'header'),   # file name (without extension) for headers
  'tokens-file=s'  => \(my $_tok_file_ext = 'tokens'),  # file name (without extension) for external tokenization
  'log|l=s'        => \(my $log_level = 'notice'),      # log level for Log::Any
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);

# Log messages are written to STDERR as UTF-8
binmode(STDERR, ":encoding(UTF-8)");
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;

#
# ~~~ parameter (mandatory) ~~~
#
my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
# optional
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and evtl. add an '.*' between them
# optional
my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analog
# mandatory
my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analog


# Sentence splitting needs the KorAP tokenizer to be requested
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
}

# External tokenizer (if any). An explicit tokenizer call takes precedence
# over the KorAP tokenizer.
# NOTE(review): if both -tc and -tk are given together with -s, the
# sentence splitting is later requested from the -tc tokenizer - confirm
# that this combination is supported.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer is loaded in an eval at the top of the script,
  # so check that loading succeeded before using it
  unless ($INC{'KorAP/XML/TEI/Tokenizer/KorAP.pm'}) {
    die $log->fatal("KorAP tokenizer requested (-tk), but KorAP::XML::TEI::Tokenizer::KorAP could not be loaded");
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};
##


#
# ~~~ constants ~~~
#


## intern tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##

## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKEN_TAG}'s (default: 1)


# Name of the directory and the file containing all inline structure information
# except for $_TOKEN_TAG information.
# The option value has the form '<foundry>#<file>'; a missing or empty
# file part (e.g. 'struct' or 'struct#') falls back to 'structure'.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures;
$_structure_dir //= '';
$_structure_file = 'structure' unless defined $_structure_file && length $_structure_file;
$_structure_file .= '.xml';


# Name of the directory and the file containing all inline token information
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set.
# A missing or empty file part falls back to 'morpho'.
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens;
$_tokens_dir //= '';
$_tokens_file = 'morpho' unless defined $_tokens_file && length $_tokens_file;
$_tokens_file .= '.xml';

my $_TOKENS_TAG = "w"; # name of tag containing all information stored in $_tokens_file

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;


#
# ~~~ variables ~~~
#

# Initialize Token- and Structure-Collector
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;


# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;


# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
my $input_fh; # input file handle (default: stdin)

my $dir; # text directory (below $_root_dir)

my ( $text_id,
     $text_id_esc ); # '$text_id_esc' = escaped version of $text_id

# these are only used inside recursive function 'retr_info'
my ( $_IDX,    # value is set dependent on DEBUG - for extracting array of child elements from element in $tree_data
     $e,       # element from $tree_data
     ## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
     $add_one, # 1 after a text node, 0 after a whitespace node (added to the from-value of the next opening tag)
     $fval,    # from-value of the annotation that is currently closed
     %ws);     # hash for indices of whitespace-nodes (needed to recorrect from-values)
               # idea: when closing element, check if its from-index minus 1 refers to a whitespace-node
               #  (means: 'from-index - 1' is a key in %ws).
               # if this is _not_ the case, then the from-value is one too high => correct it by subtracting 1

my $c; # index variables used in loops


#
# ~~~ main ~~~
#

# ~ initializations ~

# Include line numbers in elements of $tree_data for debugging
# (this shifts the index of the child node array from 4 to 5)
$_IDX = DEBUG ? 5 : 4;

$fval = 0;

# Normalize regex for header parsing:
# allow arbitrary attributes between the tag name and the given attributes
for ($_CORP_HEADER_BEG,
     $_DOC_HEADER_BEG,
     $_TEXT_HEADER_BEG) {
  s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
};


# ~ read input and write output (text by text) ~

my $tl = 0; # text line (needed for whitespace handling)

$input_fh = *STDIN;  # input file handle (default: stdin)

# Maybe not necessary
$data->reset;

$dir = '';

# Read from the given input file instead of stdin
if ( $input_fname ne '' ){
  unless (open($input_fh, '<', $input_fname)) {
    die $log->fatal("File '$input_fname' could not be opened.");
  };
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

my $sfx;                                   # content following the opening text-body tag on the same line
my $pos;                                   # position of the closing text-body tag in the current line
my $input_enc = 'UTF-8';                   # input encoding (may be overwritten by the XML declaration)
my $l = length('</' . $_TEXT_BODY) + 1;    # length of the closing text-body tag (including '>')

# ~ loop (reading input document) ~
#
# The input is read line by line. Header lines are handed over to
# KorAP::XML::TEI::Header, the lines of a text body are collected in a
# buffer and - as soon as the closing text-body tag is found - parsed and
# written to the zip stream (data, tokenizations, structures, tokens).

MAIN: while ( <$input_fh> ){

  $_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)

  # Set input encoding from the XML declaration.
  # (A backreference can't be used inside a bracketed character class,
  # therefore both quote characters are excluded from the value.)
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^'"]+)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){

    # ~ start of text body ~

    # Remember $2, as the following match resets the capture variables
    $sfx = $2;

    if ($1 !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # Everything before and after the closing tag has to be whitespace
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was seen before this text body
        if ($dir eq '') {
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($data->data, 0, 200));
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$buf_in</text>",
          huge => 1
        );

        # See notes on whitespace handling
        my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

        # XCT_LINE_NUMBERS is only needed for debugging
        # (see XML::CompactTree::XS)
        $param |= XCT_LINE_NUMBERS if DEBUG;
        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

        $structures->reset;

        $tokens->reset if $_TOKENS_PROC;

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # ~ recursion ~
        retr_info(1, \$tree_data->[2] ); # parse input data

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
        };

        # ~ write data.xml ~
        $data->to_zip(
          $zipper->new_stream("$dir/${_data_file}.xml"),
          $text_id_esc
        );

        # ~ tokenization ~
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
            $text_id_esc
          );
        };

        if ($_GEN_TOK_INT) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->reset;
          $cons_tok->reset;
        };

        # Sentence boundaries are passed to the structures
        # before these are written
        if ($use_tokenizer_sentence_splits) {
          $ext_tok->sentencize_from_previous_input($structures);
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        if ($_TOKENS_PROC && !$tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          );
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
      # an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
      if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents

        $tl++; # counter for text lines

        s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    };

    # This point is only reached, if the input ended inside of a text body,
    # as the branch for the closing tag always continues with the MAIN loop
    $log->warn("input line number $.: " .
                 "input ended before closing text-body tag '${_TEXT_BODY}' => text is not written");

  } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {

    # ~ start of header ~
    my $content = "$2\n";

    if ($1 !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while

$zipper->close;

$ext_tok->close if $ext_tok;

exit(0);


# Recursively called function to handle XML tree data.
#
# Walks through the node list created by XML::CompactTree::XS, appends
# the content of text and whitespace nodes to $data and creates an
# annotation (with attributes, from/to offsets and level) in $structures
# for each element node. Elements named $_TOKENS_TAG are additionally
# added to $tokens, if $_TOKENS_PROC is set.
#
# Parameters:
#   1. recursion level
#      (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
#   2. reference to an array reference of nodes
#
# Dies, if a node type is not supported or if the from-value of an
# annotation is 2 or more greater than its to-value.
sub retr_info {
  my ($rl, $nodes_ref) = @_;

  # With sentence splits from the tokenizer, all 's' elements
  # share one dummy annotation instead of being added to the structures
  my $dummy_anno;
  if ($use_tokenizer_sentence_splits) {
    $dummy_anno = $structures->new_dummy_annotation;
  }

  # Iteration through all array elements
  # See notes on how 'XML::CompactTree::XS' works and
  # see 'NODE TYPES' in manpage of XML::LibXML::Reader
  foreach my $node (@{${$nodes_ref}}) {

    # Element node
    if ($node->[0] == XML_READER_TYPE_ELEMENT) {

      #~~~~
      # from here: tag-node (opening)
      #~~~~

      my $anno;

      # $node->[1] represents the tag name
      if ($use_tokenizer_sentence_splits && $node->[1] eq "s") {
        $anno = $dummy_anno;
      } else {
        $anno = $structures->add_new_annotation($node->[1]);
      }


      # Add element also to token list
      if ($_TOKENS_PROC && $node->[1] eq $_TOKENS_TAG) {
        $tokens->add_annotation($anno);
      };

      # Handle attributes (if attributes exist)
      if (defined $node->[3]) {

        # with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
        # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
        # note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
        my $attrs = $node->[3];
        for (my $i = 0; $i < @{$attrs}; $i += 2) {

          # '$i' references the 'key' and '$i+1' the 'value'
          $anno->add_attribute(
            @{$attrs}[$i, $i + 1]
          );
        };
      };

      # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
      $anno->set_from($data->position + $add_one);


      #~~~~
      # until here: tag-node (opening)
      #~~~~


      # Call function recursively
      # do no recursion, if $node->[$_IDX] is not defined
      # (because we have no array of child-nodes, e.g.: <back/>)
      if (defined $node->[$_IDX]) {

        # Recursion with array of child-nodes
        retr_info($rl+1, \$node->[$_IDX]);
      }


      #~~~~~
      # from here: tag-node (closing)
      #~~~~~

      # NOTE: use the position, because the offsets are _between_ the characters
      # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
      my $to = $data->position;

      # Handle structures and tokens

      $fval = $anno->from;

      # ~ whitespace related issue ~
      if ($fval > 0 && not exists $ws{$fval - 1}) {

        # ~ previous node was a text-node ~
        $anno->set_from($fval - 1);
      }

      # in case this fails, check input
      if (($fval - 1) > $to) {
        die $log->fatal("text_id='$text_id', " .
                          "processing of structures: " .
                          "from-value ($fval) is 2 or more greater " .
                          "than to-value ($to) => please check. Aborting");
      };

      # TODO: find example for which this case applies
      #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
      #
      # TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $to;
      #  do testing with bigger corpus excerpt (wikipedia?)
      $anno->set_from($to) if $fval == $to + 1;
      $anno->set_to($to);
      $anno->set_level($rl);

      # Clean up whitespace
      delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};


      #~~~~
      # until here: tag-node (closing)
      #~~~~
    }

    # Text node
    elsif ($node->[0] == XML_READER_TYPE_TEXT){

      $add_one = 1;
      $data->append($node->[1]);
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    elsif ($node->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # state, that this from-index belongs to a whitespace-node
      #  ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node->[1]);
    }

    # not yet handled type
    else {

      die $log->fatal('Not yet handled type ($e->[0]=' . $node->[0] . ') ... => Aborting');
    };
  };

  return;
};


__END__

=pod

=encoding utf8
589
590=head1 NAME
591
592tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
593
594=head1 SYNOPSIS
595
596 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
597
598=head1 DESCRIPTION
599
Akronee434b12020-07-08 12:53:01 +0200600C<tei2korapxml> is a script to convert TEI P5 and
601L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
602based documents to the
603L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
604If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100605read from C<STDIN>. If no specific output is defined, data is written
606to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200607
Akrond949e182020-02-14 12:23:57 +0100608This program is usually called from inside another script.
609
Akronee434b12020-07-08 12:53:01 +0200610=head1 FORMATS
611
612=head2 Input restrictions
613
614=over 2
615
616=item
617
Akronee434b12020-07-08 12:53:01 +0200618TEI P5 formatted input with certain restrictions:
619
620=over 4
621
622=item
623
624B<mandatory>: text-header with integrated textsigle, text-body
625
626=item
627
628B<optional>: corp-header with integrated corpsigle,
629doc-header with integrated docsigle
630
631=back
632
633=item
634
All tokens inside the primary text must not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~>).
642
643=back
644
645=head2 Notes on the output
646
647=over 2
648
649=item
650
651zip file output (default on C<stdout>) with utf8 encoded entries
652(which together form the KorAP-XML format)
653
654=back
655
Akrond949e182020-02-14 12:23:57 +0100656=head1 INSTALLATION
657
C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
these bindings are available, the preferred way to install the script is
to use L<cpanm|App::cpanminus>.
661
662 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
663
664In case everything went well, the C<tei2korapxml> tool will
665be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200666
Akrond949e182020-02-14 12:23:57 +0100667Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
668
669=head1 OPTIONS
670
671=over 2
672
Akron4e603a52020-07-27 14:23:49 +0200673=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100674
Akron4e603a52020-07-27 14:23:49 +0200675The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100676
677=item B<--help|-h>
678
679Print help information.
680
681=item B<--version|-v>
682
683Print version information.
684
Akron4e603a52020-07-27 14:23:49 +0200685=item B<--tokenizer-call|-tc>
686
Call an external tokenizer process that tokenizes
a single line from STDIN and outputs one token per line.
689
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200690=item B<--tokenizer-korap|-tk>
691
692Use the standard KorAP/DeReKo tokenizer.
693
Akron6d7b8e42020-09-29 07:37:41 +0200694=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200695
Tokenize the data using two embedded tokenizers,
that will take an I<aggressive> and a I<conservative>
approach.
699
Akron1a5271a2021-02-18 13:18:15 +0100700=item B<--inline-tokens> <foundry>#[<file>]
701
702Define the foundry and file (without extension)
703to store inline token information in.
704If L</KORAPXMLTEI_INLINE> is set, this will contain
705annotations as well.
706Defaults to C<tokens> and C<morpho>.
707
Akrondd0be8f2021-02-18 19:29:41 +0100708=item B<--inline-structures> <foundry>#[<file>]
709
Define the foundry and file (without extension)
to store inline structure information in.
Defaults to C<struct> and C<structure>.
713
Akron26a71522021-02-19 10:27:37 +0100714=item B<--base-foundry> <foundry>
715
716Define the base foundry to store newly generated
717token information in.
718Defaults to C<base>.
719
720=item B<--data-file> <file>
721
722Define the file (without extension)
723to store primary data information in.
724Defaults to C<data>.
725
726=item B<--header-file> <file>
727
728Define the file name (without extension)
729to store header information on
730the corpus, document, and text level in.
731Defaults to C<header>.
732
Marc Kupietz985da0c2021-02-15 19:29:50 +0100733=item B<--use-tokenizer-sentence-splits|-s>
734
Replace existing sentence boundary information with, or add new
sentence boundary information from, the tokenizer's sentence splits
(currently only supported for the KorAP tokenizer).
737
Akron91705d72021-02-19 10:59:45 +0100738=item B<--tokens-file> <file>
739
740Define the file (without extension)
741to store generated token information in
742(either from the KorAP tokenizer or an externally called tokenizer).
743Defaults to C<tokens>.
744
Akron3378dfd2020-08-01 15:01:36 +0200745=item B<--log|-l>
746
747Loglevel for I<Log::Any>. Defaults to C<notice>.
748
Akrond949e182020-02-14 12:23:57 +0100749=back
750
Akronb3649472020-09-29 08:24:46 +0200751=head1 ENVIRONMENT VARIABLES
752
753=over 2
754
755=item B<KORAPXMLTEI_DEBUG>
756
757Activate minimal debugging.
758Defaults to C<false>.
759
760=item B<KORAPXMLTEI_INLINE>
761
762Process inline annotations, if present.
763Defaults to C<false>.
764
765=back
766
Akrond949e182020-02-14 12:23:57 +0100767=head1 COPYRIGHT AND LICENSE
768
Marc Kupietze955ecc2021-02-17 17:42:01 +0100769Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100770
771Author: Peter Harders
772
Akronaabd0952020-09-29 07:35:08 +0200773Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100774
775L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
776Corpus Analysis Platform at the
777L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
778member of the
779L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
780
781This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100782L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100783
784=cut
Akronf8088e62021-02-18 16:18:59 +0100785
786# NOTES
787
788## Notes on how 'XML::CompactTree::XS' works
789
790Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
791
792Print out name of 'node2' for the above example:
793
794echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
795
796Exploring the structure of $data ( = reference to below array ):
797
798[ 0: XML_READER_TYPE_DOCUMENT,
799 1: ?
Akron91577922021-02-19 10:32:54 +0100800 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100801 1: 'node'
802 2: ?
803 3: HASH (attributes)
804 4: 1 (line number)
805 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
806 1: 'node1'
807 2: ?
808 3: undefined (no attributes)
809 4: 1 (line number)
810 5: [ 0: [ 0: XML_READER_TYPE_TEXT
811 1: 'some '
812 ]
813 1: [ 0: XML_READER_TYPE_ELEMENT
814 1: 'n'
815 2: ?
816 3: undefined (no attributes)
817 4: 1 (line number)
818 5: undefined (no child-nodes)
819 ]
820 2: [ 0: XML_READER_TYPE_TEXT
821 1: ' text'
822 ]
823 ]
824 ]
825 1: [ 0: XML_READER_TYPE_ELEMENT
826 1: 'node2'
827 2: ?
 3: undefined (no attributes)
829 4: 1 (line number)
830 5: [ 0: [ 0: XML_READER_TYPE_TEXT
831 1: 'more-text'
832 ]
833 ]
834 ]
835 ]
836 ]
837 ]
838]
839
840$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
841
842ref($data->[2]) == ARRAY (with 1 element for 'node')
843ref($data->[2]->[0]) == ARRAY (with 6 elements)
844
845$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
846$data->[2]->[0]->[1] == 'node'
847ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
848$data->[2]->[0]->[4] == 1 (line number)
849ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
850 # child-nodes of actual node (see $_IDX)
851
852ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
853$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
854$data->[2]->[0]->[5]->[0]->[1] == 'node1'
855$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
856$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
857ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
858
859ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
860$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
861$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
862
863ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
864$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
865$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
866$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
867$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
868$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
869
870ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
871$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
872$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
873
874
875retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
876Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
877${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
878
879
880## Notes on whitespace handling
881
882Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
883(see function 'retr_info()').
884
885Definition of significant and insignificant whitespace
886(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
887
888Significant whitespace is part of the document content and should be preserved.
889Insignificant whitespace is used when editing XML documents for readability.
890These whitespaces are typically not intended for inclusion in the delivery of the document.
891
892### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
893
894The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
895 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
896
When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
900
901echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
902
903
904Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
905
906Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
907 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
908
The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
911
The assumption here is that in most cases there _is_ a whitespace-node between 2 text-nodes. The below code fragment
 provides a way to check, when closing a tag, whether this really _was_ the case for the last 2 'non-tag'-nodes:
914
When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
 So when closing a tag, it can be checked whether the previous 'non-tag'-node (text or whitespace), which is the one before
 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
 the last read 'non-tag'-node has to be corrected (see [1]).
919
920For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
921 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
922
923[1]
Now, what happens when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
927
928[2]
In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', the ' ' is handled as a
 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
931
932The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
933 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
934
Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
937
938
939## Notes on whitespace fixing
940
The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
 into the text, with the complication that whitespace before those linebreaks may (or may not) have been unintentionally stripped.
943
It soon turned out that the best advice is to simply avoid linebreaks and to put all primary text tokens into one line (see
 example further down and notes on 'Input restrictions' in the manpage).
946
Nevertheless, an early and rather crude approach remained in the code; it is not rigorous, but it also doesn't affect one-line text.
948
949Examples (how primary text with linebreaks would be converted by below code):
950
951 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'
953
954Blanks are inserted before the 1st character:
955
956 NOTE: not stringent ('...' stands for text):
957
958 beg1............................end1 => no blank before 'beg1'
959 beg2....<pb/>...................end2 => no blank before 'beg2'
960 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
961 beg4....<test>ok</test>.........end4 => blank before 'beg4'
962
963 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
964 ^
965 |_blank between 'end3' and 'beg4'
966
967
968## Notes on segfault prevention
969
binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
(see notes on 'PerlIO layers' in 'man XML::LibXML').
Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
See in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.