blob: cbd0b1e6f9cf2df54163d25449dafdfab834c406 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Peter Hardersd892a582020-02-12 15:45:22 +010014use XML::CompactTree::XS;
15use XML::LibXML::Reader;
Peter Hardersd892a582020-02-12 15:45:22 +010016
Akron4f67cd42020-07-02 12:27:58 +020017use FindBin;
18BEGIN {
19 unshift @INC, "$FindBin::Bin/../lib";
20};
21
Marc Kupietz8a954e52021-02-16 22:03:07 +010022use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020023use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020024use KorAP::XML::TEI::Tokenizer::Conservative;
25use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron7501ca02020-08-01 21:05:25 +020026use KorAP::XML::TEI::Annotations::Collector;
Akrona10ad592020-08-03 11:20:23 +020027use KorAP::XML::TEI::Data;
Akron85717512020-07-08 11:19:19 +020028use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020029use KorAP::XML::TEI::Header;
Peter Hardersd892a582020-02-12 15:45:22 +010030
Marc Kupietz1e882fb2020-09-09 00:05:46 +020031eval {
32 require KorAP::XML::TEI::Tokenizer::KorAP;
33 1;
34};
Peter Harders1c5ce152020-07-22 18:02:50 +020035
Marc Kupietza1421f02021-02-18 15:32:38 +010036our $VERSION = '1.00';
Peter Harders6f526a32020-06-29 21:44:41 +020037
Akrond949e182020-02-14 12:23:57 +010038our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
39
Akronb3649472020-09-29 08:24:46 +020040# Set to 1 for minimal more debug output (no need to be parametrized)
41use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Hardersd892a582020-02-12 15:45:22 +010042
# Command line options with their default values
my $_root_dir   = '.';               # name of root directory inside zip file
my $input_fname = '';                # input file (yet only TEI I5 Format accepted)
my $tokenizer_call;                  # Temporary argument for testing purposes
my $tokenizer_korap;                 # use KorAP-tokenizer
my $tokenizer_intern;                # use intern tokenization (default = no)
my $use_tokenizer_sentence_splits;   # use KorAP tokenizer to split s (default=no)
my $inline_tokens     = 'tokens#morpho';
my $inline_structures = 'struct#structure';
my $_tok_dir          = 'base';
my $_data_file        = 'data';
my $_header_file      = 'header';
my $log_level         = 'notice';

# Parse options from the command line
# (options not passed keep the defaults assigned above)
GetOptions(
  'root|r=s'                        => \$_root_dir,
  'input|i=s'                       => \$input_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'base-foundry=s'                  => \$_tok_dir,
  'data-file=s'                     => \$_data_file,
  'header-file=s'                   => \$_header_file,
  'log|l=s'                         => \$log_level,

  # Print the full help text to STDOUT
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version message to STDOUT
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
73
Marc Kupietz44b1f252020-11-26 16:31:40 +010074binmode(STDERR, ":encoding(UTF-8)");
Akron3378dfd2020-08-01 15:01:36 +020075Log::Any::Adapter->set('Stderr', log_level => $log_level);
76
Akronb3649472020-09-29 08:24:46 +020077$log->notice('Debugging is activated') if DEBUG;
78
Peter Harders6f526a32020-06-29 21:44:41 +020079#
80# ~~~ parameter (mandatory) ~~~
81#
Peter Harders6f526a32020-06-29 21:44:41 +020082my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
Akron0c41ab32020-09-29 07:33:33 +020083# optional
Peter Harders6f526a32020-06-29 21:44:41 +020084my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and evtl. add an '.*' between them
Akron0c41ab32020-09-29 07:33:33 +020085# optional
Peter Harders6f526a32020-06-29 21:44:41 +020086my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analog
Akron0c41ab32020-09-29 07:33:33 +020087# mandatory
Peter Harders6f526a32020-06-29 21:44:41 +020088my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analog
Akron09e0b2c2020-07-28 15:57:01 +020089
Akron0c41ab32020-09-29 07:33:33 +020090
## extern tokenization

# True (1), if any external tokenizer was requested on the command line
my $_GEN_TOK_EXT = ($tokenizer_call || $tokenizer_korap) ? 1 : 0;

# Sentence splitting relies on the KorAP tokenizer
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
}

# Instantiate the requested external tokenizer
# (an explicit tokenizer call takes precedence over the KorAP tokenizer)
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally (inside an eval),
  # so fail with a clear message in case it is not available
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal("KorAP tokenizer (KorAP::XML::TEI::Tokenizer::KorAP) is not available");
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};

# Name of the file the external tokenization is written to
my $_tok_file_ext = "tokens.xml";
##
108
Akron0c41ab32020-09-29 07:33:33 +0200109
Akron4e3c7e32021-02-18 15:19:53 +0100110#
111# ~~~ constants ~~~
112#
113
114
Akron8b511f92020-07-09 17:28:08 +0200115## intern tokenization
Peter Hardersf9c51242020-07-21 02:37:44 +0200116my $_GEN_TOK_INT = $tokenizer_intern; # simple tokenization (recommended for testing)
Akron0c41ab32020-09-29 07:33:33 +0200117my $_tok_file_con = "tokens_conservative.xml";
118my $_tok_file_agg = "tokens_aggressive.xml";
119my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
120my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Peter Harders41c35622020-07-12 01:16:22 +0200121##
122
Peter Harders6f526a32020-06-29 21:44:41 +0200123## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
124my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKEN_TAG}'s (default: 1)
Akron1a5271a2021-02-18 13:18:15 +0100125
126
# Name of the directory and the file containing all inline structure information
# except for $_TOKENS_TAG information
129my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
130$_structure_file .= '.xml';
131
132
# Name of the directory and the file containing all inline token information
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
135my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
136$_tokens_file .= '.xml';
137
Peter Harders6f526a32020-06-29 21:44:41 +0200138my $_TOKENS_TAG = "w"; # name of tag containing all information stored in $_tokens_file
139
Akron4e3c7e32021-02-18 15:19:53 +0100140# Handling inline annotations (inside $_TOKENS_TAG)
141my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
Akron09e0b2c2020-07-28 15:57:01 +0200142
Peter Harders6f526a32020-06-29 21:44:41 +0200143
144#
145# ~~~ variables ~~~
146#
147
Akron7501ca02020-08-01 21:05:25 +0200148# Initialize Token- and Structure-Collector
149my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
150my $structures = KorAP::XML::TEI::Annotations::Collector->new;
Akron09e0b2c2020-07-28 15:57:01 +0200151
152
Akrona10ad592020-08-03 11:20:23 +0200153# Initialize Data-Collector
154my $data = KorAP::XML::TEI::Data->new;
155
156
Akron85717512020-07-08 11:19:19 +0200157# Initialize zipper
Akron3bdc0a32020-08-03 12:12:56 +0200158my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
Peter Harders6f526a32020-06-29 21:44:41 +0200159my $input_fh; # input file handle (default: stdin)
160
Peter Harders6f526a32020-06-29 21:44:41 +0200161my $dir; # text directory (below $_root_dir)
Peter Harders6f526a32020-06-29 21:44:41 +0200162
Akron0c41ab32020-09-29 07:33:33 +0200163my ( $text_id,
164 $text_id_esc ); # '$text_id_esc' = escaped version of $text_id
Peter Harders6f526a32020-06-29 21:44:41 +0200165
Peter Harders6f526a32020-06-29 21:44:41 +0200166my ( $reader, # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
167 $tree_data ); # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
168
169# these are only used inside recursive function 'retr_info'
Akron4e3c7e32021-02-18 15:19:53 +0100170my ( $_IDX, # value is set dependent on DEBUG - for extracting array of child elements from element in $tree_data
Peter Harders6f526a32020-06-29 21:44:41 +0200171 $e, # element from $tree_data
Peter Harders6f526a32020-06-29 21:44:41 +0200172 ## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
173 $add_one, # ...
Akron7501ca02020-08-01 21:05:25 +0200174 $fval, # ...
Peter Harders41c35622020-07-12 01:16:22 +0200175 %ws); # hash for indices of whitespace-nodes (needed to recorrect from-values)
           # idea: when closing an element, check if its from-index minus 1 refers to a whitespace-node
           #  (means: 'from-index - 1' is a key in %ws).
           #  if this is _not_ the case, then the from-value is one too high => correct it by subtracting 1
179
Akron7501ca02020-08-01 21:05:25 +0200180my $c; # index variables used in loops
Peter Harders6f526a32020-06-29 21:44:41 +0200181
Peter Harders6f526a32020-06-29 21:44:41 +0200182
183#
184# ~~~ main ~~~
185#
186
187# ~ initializations ~
188
Akron4e3c7e32021-02-18 15:19:53 +0100189# Include line numbers in elements of $tree_data for debugging
190DEBUG ? ($_IDX = 5) : ($_IDX = 4);
Peter Harders6f526a32020-06-29 21:44:41 +0200191
Akron7501ca02020-08-01 21:05:25 +0200192$fval = 0;
Peter Harders6f526a32020-06-29 21:44:41 +0200193
Akronec2cef22020-07-31 10:00:15 +0200194# Normalize regex for header parsing
195for ($_CORP_HEADER_BEG,
196 $_DOC_HEADER_BEG,
197 $_TEXT_HEADER_BEG) {
198 s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
199};
Peter Hardersd892a582020-02-12 15:45:22 +0100200
Peter Hardersd892a582020-02-12 15:45:22 +0100201
Peter Harders6f526a32020-06-29 21:44:41 +0200202# ~ read input and write output (text by text) ~
Peter Hardersd892a582020-02-12 15:45:22 +0100203
Akron347be812020-09-29 07:52:52 +0200204my ( $pfx, $sfx );
Peter Hardersd892a582020-02-12 15:45:22 +0100205
Akron347be812020-09-29 07:52:52 +0200206my $tl = 0; # text line (needed for whitespace handling)
Peter Hardersd892a582020-02-12 15:45:22 +0100207
Akron347be812020-09-29 07:52:52 +0200208$input_fh = *STDIN; # input file handle (default: stdin)
Peter Hardersd892a582020-02-12 15:45:22 +0100209
Akron347be812020-09-29 07:52:52 +0200210# Maybe not necessary
211$data->reset;
Peter Hardersd892a582020-02-12 15:45:22 +0100212
Akron347be812020-09-29 07:52:52 +0200213$dir = "";
Peter Hardersd892a582020-02-12 15:45:22 +0100214
Akron347be812020-09-29 07:52:52 +0200215if ( $input_fname ne '' ){
216 unless (open($input_fh, '<', $input_fname)) {
217 die $log->fatal("File '$input_fname' could not be opened.");
218 };
219}
Peter Harders6f526a32020-06-29 21:44:41 +0200220
Akronf8088e62021-02-18 16:18:59 +0100221# Prevents segfaulting (see notes on segfault prevention)
Akron347be812020-09-29 07:52:52 +0200222binmode $input_fh;
Peter Harders6f526a32020-06-29 21:44:41 +0200223
Akron347be812020-09-29 07:52:52 +0200224my $pos;
Akroneaa96232020-10-15 17:06:15 +0200225my $input_enc = 'UTF-8';
Akron347be812020-09-29 07:52:52 +0200226my $l = length('</' . $_TEXT_BODY) + 1;
Peter Harders6f526a32020-06-29 21:44:41 +0200227
Akron347be812020-09-29 07:52:52 +0200228# ~ loop (reading input document) ~
Peter Harders6f526a32020-06-29 21:44:41 +0200229
# Read the input line by line:
# - the XML declaration sets the input encoding,
# - header lines are handed over to KorAP::XML::TEI::Header and written to the zip,
# - text bodies are buffered, parsed with XML::LibXML::Reader and
#   written as data, tokenization, structure and token files.
MAIN: while ( <$input_fh> ) {

  # Remove (possibly multi-line) XML comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding based on the XML declaration
  # ($1 holds the quote character, $2 the encoding name)
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^'"]+)\1/ ) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ) {

    # ~ start of text body ~

    $pfx = $1;
    $sfx = $2;

    # The opening tag of the text body has to stand on a line of its own
    if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files
        # (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)

        # The closing tag of the text body has to stand on a line of its own
        # ($l is the length of the closing tag)
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # $dir is only set, if a text header was parsed before
        if ($dir ne "") {

          $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );

          # See notes on whitespace handling
          my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

          # XCT_LINE_NUMBERS is only needed for debugging
          # (see XML::CompactTree::XS)
          $param |= XCT_LINE_NUMBERS if DEBUG;
          $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);

          $structures->reset;

          $tokens->reset if $_TOKENS_PROC;

          # ~ whitespace related issue ~
          $add_one = 0;
          %ws = ();

          # ~ recursion ~
          retr_info(1, \$tree_data->[2] ); # parse input data

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
          };

          # ~ write data.xml ~
          $data->to_zip(
            $zipper->new_stream("$dir/${_data_file}.xml"),
            $text_id_esc
          );

          # ~ tokenization ~
          if ($_GEN_TOK_EXT) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
              $text_id_esc
            );
          };

          if ($_GEN_TOK_INT) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
              $text_id_esc
            );

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
              $text_id_esc
            );

            $aggr_tok->reset;
            $cons_tok->reset;
          };

          # Add sentence boundaries based on the preceding tokenization
          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          }

          # ~ write structures ~
          if (!$structures->empty) {
            $structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          if ($_TOKENS_PROC && !$tokens->empty) {
            $tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
              $text_id_esc,
              $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
            );
          };

          $dir = ""; # reinit.

          # Maybe not necessary
          $data->reset;

        } else { # $dir eq ""

          # NOTE(review): $data is an object, so this presumably prints its
          # stringification instead of the text - confirm what should be logged
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
        }

        next MAIN;
      };

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best, to keep the stripping of whitespace and to just remove
      #       the if-clause and to insert a blank by default (with possibly an option
      #       on how newlines in primary text should be handled (stripped or replaced
      #       by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ###  do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
      if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents

        $tl++; # counter for text lines

        s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    };

  } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {

    # ~ start of header ~
    $pfx = $1;
    my $content = "$2\n";

    # Nothing but whitespace may precede the opening header tag
    if ($pfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while
Peter Hardersd892a582020-02-12 15:45:22 +0100440
Akron347be812020-09-29 07:52:52 +0200441$zipper->close;
Peter Harders6f526a32020-06-29 21:44:41 +0200442
Akron347be812020-09-29 07:52:52 +0200443$ext_tok->close if $_GEN_TOK_EXT;
Peter Hardersd892a582020-02-12 15:45:22 +0100444
Akron347be812020-09-29 07:52:52 +0200445exit(0);
Peter Hardersd892a582020-02-12 15:45:22 +0100446
Peter Hardersd892a582020-02-12 15:45:22 +0100447
# Recursively walk the tree data created by XML::CompactTree::XS.
#
# Arguments:
#   1. the recursion level (1 = topmost level inside retr_info(),
#      which should always be the level of tag $_TEXT_BODY)
#   2. a reference to an array reference holding the nodes of that level
#
# Text and whitespace nodes are appended to $data, element nodes are
# registered in $structures (and in $tokens for $_TOKENS_TAG elements)
# together with their from/to offsets and their level.
sub retr_info {
  my ($rl, $nodes_ref) = @_;

  # Placeholder annotation used instead of real <s> annotations,
  # in case sentences are split by the tokenizer
  my $dummy_anno = $use_tokenizer_sentence_splits
    ? $structures->new_dummy_annotation
    : undef;

  # See notes on how 'XML::CompactTree::XS' works and
  # see 'NODE TYPES' in manpage of XML::LibXML::Reader
  NODE: for my $node (@{ ${$nodes_ref} }) {

    my $type = $node->[0];

    # Text node
    if ($type == XML_READER_TYPE_TEXT) {
      $add_one = 1;
      $data->append($node->[1]);
      next NODE;
    }

    # Whitespace node
    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    if ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      # Remember that this from-index belongs to a whitespace-node
      # (the counter value itself is not evaluated anywhere)
      $ws{$data->position}++;

      $add_one = 0;
      $data->append($node->[1]);
      next NODE;
    }

    # Anything but an element node is not yet handled
    if ($type != XML_READER_TYPE_ELEMENT) {
      die $log->fatal('Not yet handled type ($e->[0]=' . $node->[0] . ') ... => Aborting');
    }

    #~~~~
    # from here: tag-node (opening)
    #~~~~

    # $node->[1] represents the tag name
    my $tag = $node->[1];

    my $anno = ($use_tokenizer_sentence_splits && $tag eq "s")
      ? $dummy_anno
      : $structures->add_new_annotation($tag);

    # Add element also to token list
    $tokens->add_annotation($anno) if $_TOKENS_PROC && $tag eq $_TOKENS_TAG;

    # Handle attributes (if attributes exist).
    # With 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
    # [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
    if (defined $node->[3]) {
      my $attrs = $node->[3];

      # '$i' references the 'key' and '$i + 1' the 'value'
      for (my $i = 0; $i < @{$attrs}; $i += 2) {
        $anno->add_attribute(@{$attrs}[$i, $i + 1]);
      }
    }

    # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
    $anno->set_from($data->position + $add_one);

    #~~~~
    # until here: tag-node (opening)
    #~~~~

    # Descend into the array of child-nodes - there is none,
    # if $node->[$_IDX] is not defined (e.g.: <back/>)
    retr_info($rl + 1, \$node->[$_IDX]) if defined $node->[$_IDX];

    #~~~~~
    # from here: tag-node (closing)
    #~~~~~

    # NOTE: use $pos, because the offsets are _between_ the characters
    # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
    my $pos = $data->position;

    # Handle structures and tokens
    $fval = $anno->from;

    # ~ whitespace related issue ~
    # previous node was a text-node => move from-value one to the left
    $anno->set_from($fval - 1) if $fval > 0 && !exists $ws{$fval - 1};

    # in case this fails, check input
    if (($fval - 1) > $pos) {
      die $log->fatal("text_id='$text_id', " .
                        "processing of structures: " .
                        "from-value ($fval) is 2 or more greater " .
                        "than to-value ($pos) => please check. Aborting");
    }

    # TODO: find example for which this case applies
    #  maybe this is not necessary anymore, because the above recorrection of the from-value suffices
    #
    # TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
    #  do testing with bigger corpus excerpt (wikipedia?)
    $anno->set_from($pos) if $fval == $pos + 1;
    $anno->set_to($pos);
    $anno->set_level($rl);

    # Clean up whitespace
    delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};

    #~~~~
    # until here: tag-node (closing)
    #~~~~
  }
};
592
Peter Harders6f526a32020-06-29 21:44:41 +0200593
Akrond949e182020-02-14 12:23:57 +0100594__END__
595
596=pod
597
598=encoding utf8
599
600=head1 NAME
601
602tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
603
604=head1 SYNOPSIS
605
606 cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
607
608=head1 DESCRIPTION
609
Akronee434b12020-07-08 12:53:01 +0200610C<tei2korapxml> is a script to convert TEI P5 and
611L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
612based documents to the
613L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
614If no specific input is defined, data is
Akrond949e182020-02-14 12:23:57 +0100615read from C<STDIN>. If no specific output is defined, data is written
616to C<STDOUT>.
Peter Harders6f526a32020-06-29 21:44:41 +0200617
Akrond949e182020-02-14 12:23:57 +0100618This program is usually called from inside another script.
619
Akronee434b12020-07-08 12:53:01 +0200620=head1 FORMATS
621
622=head2 Input restrictions
623
624=over 2
625
626=item
627
Akronee434b12020-07-08 12:53:01 +0200628TEI P5 formatted input with certain restrictions:
629
630=over 4
631
632=item
633
634B<mandatory>: text-header with integrated textsigle, text-body
635
636=item
637
638B<optional>: corp-header with integrated corpsigle,
639doc-header with integrated docsigle
640
641=back
642
643=item
644
All tokens inside the primary text must not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~>).
652
653=back
654
655=head2 Notes on the output
656
657=over 2
658
659=item
660
661zip file output (default on C<stdout>) with utf8 encoded entries
662(which together form the KorAP-XML format)
663
664=back
665
Akrond949e182020-02-14 12:23:57 +0100666=head1 INSTALLATION
667
668C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
669these bindings are available, the preferred way to install the script is
670to use L<cpanm|App::cpanminus>.
671
672 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
673
674In case everything went well, the C<tei2korapxml> tool will
675be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200676
Akrond949e182020-02-14 12:23:57 +0100677Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
678
679=head1 OPTIONS
680
681=over 2
682
Akron4e603a52020-07-27 14:23:49 +0200683=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100684
Akron4e603a52020-07-27 14:23:49 +0200685The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100686
687=item B<--help|-h>
688
689Print help information.
690
691=item B<--version|-v>
692
693Print version information.
694
Akron4e603a52020-07-27 14:23:49 +0200695=item B<--tokenizer-call|-tc>
696
697Call an external tokenizer process that tokenizes
698a single line from STDIN and outputs one token per line.
699
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200700=item B<--tokenizer-korap|-tk>
701
702Use the standard KorAP/DeReKo tokenizer.
703
Akron6d7b8e42020-09-29 07:37:41 +0200704=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200705
706Tokenize the data using two embedded tokenizers,
707that will take an I<aggressive> and a I<conservative>
708approach.
709
Akron1a5271a2021-02-18 13:18:15 +0100710=item B<--inline-tokens> <foundry>#[<file>]
711
712Define the foundry and file (without extension)
713to store inline token information in.
714If L</KORAPXMLTEI_INLINE> is set, this will contain
715annotations as well.
716Defaults to C<tokens> and C<morpho>.
717
Akrondd0be8f2021-02-18 19:29:41 +0100718=item B<--inline-structures> <foundry>#[<file>]
719
720Define the foundry and file (without extension)
721to store inline structure information in.
722Defaults to C<struct> and C<structures>.
723
Akron26a71522021-02-19 10:27:37 +0100724=item B<--base-foundry> <foundry>
725
726Define the base foundry to store newly generated
727token information in.
728Defaults to C<base>.
729
730=item B<--data-file> <file>
731
732Define the file (without extension)
733to store primary data information in.
734Defaults to C<data>.
735
736=item B<--header-file> <file>
737
738Define the file name (without extension)
739to store header information on
740the corpus, document, and text level in.
741Defaults to C<header>.
742
Marc Kupietz985da0c2021-02-15 19:29:50 +0100743=item B<--use-tokenizer-sentence-splits|-s>
744
745Replace existing sentence boundary information with (or add) the boundaries
746provided by the tokenizer (currently only supported for the KorAP tokenizer).
747
Akron3378dfd2020-08-01 15:01:36 +0200748=item B<--log|-l>
749
750Loglevel for I<Log::Any>. Defaults to C<notice>.
751
Akrond949e182020-02-14 12:23:57 +0100752=back
753
Akronb3649472020-09-29 08:24:46 +0200754=head1 ENVIRONMENT VARIABLES
755
756=over 2
757
758=item B<KORAPXMLTEI_DEBUG>
759
760Activate minimal debugging.
761Defaults to C<false>.
762
763=item B<KORAPXMLTEI_INLINE>
764
765Process inline annotations, if present.
766Defaults to C<false>.
767
768=back
769
Akrond949e182020-02-14 12:23:57 +0100770=head1 COPYRIGHT AND LICENSE
771
Marc Kupietze955ecc2021-02-17 17:42:01 +0100772Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100773
774Author: Peter Harders
775
Akronaabd0952020-09-29 07:35:08 +0200776Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100777
778L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
779Corpus Analysis Platform at the
780L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
781member of the
782L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
783
784This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100785L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100786
787=cut
Akronf8088e62021-02-18 16:18:59 +0100788
789# NOTES
790
791## Notes on how 'XML::CompactTree::XS' works
792
793Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
794
795Print out name of 'node2' for the above example:
796
797echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
798
799Exploring the structure of $data ( = reference to below array ):
800
801[ 0: XML_READER_TYPE_DOCUMENT,
802 1: ?
Akron91577922021-02-19 10:32:54 +0100803 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akronf8088e62021-02-18 16:18:59 +0100804 1: 'node'
805 2: ?
806 3: HASH (attributes)
807 4: 1 (line number)
808 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
809 1: 'node1'
810 2: ?
811 3: undefined (no attributes)
812 4: 1 (line number)
813 5: [ 0: [ 0: XML_READER_TYPE_TEXT
814 1: 'some '
815 ]
816 1: [ 0: XML_READER_TYPE_ELEMENT
817 1: 'n'
818 2: ?
819 3: undefined (no attributes)
820 4: 1 (line number)
821 5: undefined (no child-nodes)
822 ]
823 2: [ 0: XML_READER_TYPE_TEXT
824 1: ' text'
825 ]
826 ]
827 ]
828 1: [ 0: XML_READER_TYPE_ELEMENT
829 1: 'node2'
830 2: ?
831 3: undefined (no attributes)
832 4: 1 (line number)
833 5: [ 0: [ 0: XML_READER_TYPE_TEXT
834 1: 'more-text'
835 ]
836 ]
837 ]
838 ]
839 ]
840 ]
841]
842
843$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
844
845ref($data->[2]) == ARRAY (with 1 element for 'node')
846ref($data->[2]->[0]) == ARRAY (with 6 elements)
847
848$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
849$data->[2]->[0]->[1] == 'node'
850ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
851$data->[2]->[0]->[4] == 1 (line number)
852ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
853 # child-nodes of actual node (see $_IDX)
854
855ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
856$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
857$data->[2]->[0]->[5]->[0]->[1] == 'node1'
858$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
859$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
860ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
861
862ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
863$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
864$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
865
866ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
867$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
868$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
869$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
870$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
871$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
872
873ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
874$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
875$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
876
877
878retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
879Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
880${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
881
882
883## Notes on whitespace handling
884
885Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
886(see function 'retr_info()').
887
888Definition of significant and insignificant whitespace
889(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
890
891Significant whitespace is part of the document content and should be preserved.
892Insignificant whitespace is used when editing XML documents for readability.
893These whitespaces are typically not intended for inclusion in the delivery of the document.
894
895### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
896
897The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
898 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
899
900When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
901 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
902 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
903
904echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
905
906
907Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
908
909Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
910 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
911
912The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
913 it's from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
914
915The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
916 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
917
918When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
919 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
920 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
921 the last read 'non-tag'-node has to be corrected (see [1]).
922
923For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
924 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
925
926[1]
927Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
928 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
929 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
930
931[2]
932When comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled in both cases as a
933 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
934
935The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
936 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
937
938Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
939 and to-indices for the tags 'a' and 'b' both 12, which is the start-index of the token 'tok3'.
940
941
942## Notes on whitespace fixing
943
944The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
945 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
946
947It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
948 example further down and notes on 'Input restrictions' in the manpage).
949
950Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
951
952Examples (how primary text with linebreaks would be converted by below code):
953
954 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
955 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
956
957Blanks are inserted before the 1st character:
958
959 NOTE: not stringent ('...' stands for text):
960
961 beg1............................end1 => no blank before 'beg1'
962 beg2....<pb/>...................end2 => no blank before 'beg2'
963 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
964 beg4....<test>ok</test>.........end4 => blank before 'beg4'
965
966 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
967 ^
968 |_blank between 'end3' and 'beg4'
969
970
971## Notes on segfault prevention
972
Akron91577922021-02-19 10:32:54 +0100973binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100974(see notes on 'PerlIO layers' in 'man XML::LibXML').
975Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
976See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
977see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.