#!/usr/bin/env perl
use strict;
use warnings;

# Logging, usage output and command line parsing
use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);

use File::Basename qw(dirname);

use Encode qw(decode);

# XML parsing of the text body
use XML::CompactTree::XS;
use XML::LibXML::Reader;

# Put the library directory next to this script in front of @INC
# before any of the KorAP modules are loaded
use FindBin;
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
}

use KorAP::XML::TEI qw(remove_xml_comments replace_entities);
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Annotations::Collector;
use KorAP::XML::TEI::Data;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;

# The KorAP tokenizer is loaded on a best-effort basis:
# a failing require is caught here and not reported
eval {
  require KorAP::XML::TEI::Tokenizer::KorAP;
  1;
};

our $VERSION     = '0.03';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Debug switch (no need to be parametrized): taken from the environment
# variable KORAPXMLTEI_DEBUG, defaults to 0
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Parse options from the command line.
#
# Every option is bound to a lexical variable that is declared inline, so
# the variables stay visible for the rest of the script.
# If option parsing fails (e.g. unknown option or missing value),
# GetOptions returns false; in that case the usage is printed and the
# script exits with status 2 instead of silently continuing with an
# unintended configuration.
GetOptions(

  # name of root directory inside zip file
  'root|r=s' => \(my $_root_dir = '.'),

  # input file (yet only TEI I5 Format accepted)
  'input|i=s' => \(my $input_fname = ''),

  # Temporary argument for testing purposes
  'tokenizer-call|tc=s' => \(my $tokenizer_call),

  # use KorAP-tokenizer
  'tokenizer-korap|tk' => \(my $tokenizer_korap),

  # use intern tokenization (default = no)
  'tokenizer-internal|ti' => \(my $tokenizer_intern),

  # use KorAP tokenizer to split s (default=no)
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),

  # "<directory>#<file>" for the inline token annotations
  'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),

  # log level passed to the Stderr log adapter
  'log|l=s' => \(my $log_level = 'notice'),

  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(2);

# Log messages may contain non-ASCII characters (e.g. text sigles)
binmode(STDERR, ":encoding(UTF-8)");
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;

#
# ~~~ parameter (mandatory) ~~~
#

# Tag (without attributes), which contains the primary text
my $_TEXT_BODY = "text";

# Start patterns of the headers: just keep the correct order of the
# attributes and evtl. add an '.*' between them.

# optional
my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\"";

# optional
my $_DOC_HEADER_BEG = "idsHeader type=\"document\"";

# mandatory
my $_TEXT_HEADER_BEG = "idsHeader type=\"text\"";


## extern tokenization

# True if any external tokenizer (custom call or KorAP tokenizer) was requested
my $_GEN_TOK_EXT = ($tokenizer_call || $tokenizer_korap) ? 1 : 0;

if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
}

# A custom tokenizer call takes precedence over the KorAP tokenizer
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}
elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally at startup
  # (inside an eval), so it may be missing here. Fail with a clear message
  # instead of an obscure "Can't locate object method" error.
  unless ('KorAP::XML::TEI::Tokenizer::KorAP'->can('new')) {
    die $log->fatal("KorAP tokenizer requested (-tk), but KorAP::XML::TEI::Tokenizer::KorAP could not be loaded");
  }

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# Name of the file written by the external tokenizer
my $_tok_file_ext = "tokens.xml";
##

#
# ~~~ constants ~~~
#


## intern tokenization

# Simple tokenization (recommended for testing)
my $_GEN_TOK_INT = $tokenizer_intern;

# File names for the two internal tokenizations
my $_tok_file_con = 'tokens_conservative.xml';
my $_tok_file_agg = 'tokens_aggressive.xml';

my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##


# Name of directory for storing tokenization files
my $_tok_dir = 'base';

# Name of files containing the text, document and corpus header
my $_header_file = 'header.xml';

# Name of file containing the primary text data (tokens)
my $_data_file = 'data.xml';

# Name of directory containing the $_structure_file
my $_structure_dir = 'struct';

# Name of file containing all tags (except ${_TOKENS_TAG}'s) related
# information (= their names and byte offsets in $_data)
my $_structure_file = 'structure.xml';

## TODO: optional (different annotation tools can produce more zip-files
## for feeding into KorAP-XML-Krill)

# on/off: processing of ${_TOKENS_TAG}'s (default: 1)
my $_TOKENS_PROC = 1;


# Name of the directory and the file containing all inline token
# information, i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set.
# The option value has the form "<directory>#<file>"; appending '#morpho'
# provides the file name in case the option value contains no '#'.
my ($_tokens_dir, $_tokens_file) = split /#/, $inline_tokens . '#morpho';
$_tokens_file .= '.xml';

# Name of tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';

# Handling inline annotations (inside $_TOKENS_TAG),
# switched on by the environment variable KORAPXMLTEI_INLINE
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;


#
# ~~~ variables ~~~
#

# Collectors for token and structure annotations
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;


# Collector for the primary text data
my $data = KorAP::XML::TEI::Data->new;


# Zipper writing below $_root_dir
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);

# Input file handle (default: stdin)
my $input_fh;

# Text directory (below $_root_dir)
my $dir;

my $text_id;

# Escaped version of $text_id
my $text_id_esc;

# Instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
my $reader;

# Result of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
my $tree_data;

# The following variables are only used inside the recursive
# function 'retr_info'.

# Index of the array of child elements inside an element of $tree_data
# (the value is set dependent on DEBUG)
my $_IDX;

# Element from $tree_data
my $e;

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $fval;

# Hash for indices of whitespace-nodes (needed to recorrect from-values).
# Idea: when closing an element, check if its from-index minus 1 refers to
# a whitespace-node (means: 'from-index - 1' is a key in %ws).
# If this is _not_ the case, then the from-value is one too high
# => correct it by subtracting 1.
my %ws;

# Index variable used in loops
my $c;


#
# ~~~ main ~~~
#

# ~ initializations ~

# In debug mode the elements of $tree_data include line numbers,
# so the array of child elements is found at a different index
$_IDX = DEBUG ? 5 : 4;

$fval = 0;

# Normalize regex for header parsing:
# insert '[^>]*' directly after the tag name of each header pattern
foreach my $header_beg ($_CORP_HEADER_BEG, $_DOC_HEADER_BEG, $_TEXT_HEADER_BEG) {
  $header_beg =~ s!^(\S+)(.*)$!$1\[\^>\]*$2!;
}


# ~ read input and write output (text by text) ~

# Content before and after a matched tag on the same input line
my ( $pfx, $sfx );

# Text line counter (needed for whitespace handling)
my $tl = 0;

# Read from stdin unless an input file was given
$input_fh = *STDIN;

# Maybe not necessary
$data->reset;

$dir = "";

if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

# Position of the closing text-body tag inside the current line
my $pos;

# Input encoding; may be overwritten by the XML declaration of the input
my $input_enc = 'UTF-8';

# Length of the closing text-body tag including the trailing '>'
my $l = length("</${_TEXT_BODY}>");

# ~ loop (reading input document) ~
#
# The input is read line by line. A line matching a header start is handed
# to KorAP::XML::TEI::Header, which parses the header from the input handle;
# the header is then written to the zip. A line containing the opening
# text-body tag starts an inner loop that buffers the text body up to the
# closing tag; the buffer is then parsed and all output files of the text
# are written to the zip.

MAIN: while ( <$input_fh> ) {

  # remove HTML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding from the XML declaration.
  # The value is delimited by the same quote character on both sides.
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/ ) {
    $input_enc = $2;
    next;
  }

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ) {

    # ~ start of text body ~

    $pfx = $1;
    $sfx = $2;

    # The opening tag has to be on a line of its own
    if ($pfx !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    }

    # text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or
        # tokenization files (s.a.: $_tok_file_ext, $_tok_file_con, $_tok_file_agg)

        # The closing tag has to be on a line of its own, too
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        }

        if ($dir ne "") {

          $reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );

          # See notes on whitespace handling
          my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

          # XCT_LINE_NUMBERS is only needed for debugging
          # (see XML::CompactTree::XS)
          $param |= XCT_LINE_NUMBERS if DEBUG;
          $tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);

          $structures->reset;

          $tokens->reset if $_TOKENS_PROC;

          # ~ whitespace related issue ~
          $add_one = 0;
          %ws = ();

          # ~ recursion ~
          retr_info(1, \$tree_data->[2] ); # parse input data

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/$_data_file");
          }

          # ~ write data.xml ~
          $data->to_zip(
            $zipper->new_stream("$dir/${_data_file}"),
            $text_id_esc
          );

          # ~ tokenization ~
          if ($_GEN_TOK_EXT) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_ext"),
              $text_id_esc
            );
          }

          if ($_GEN_TOK_INT) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_con"),
              $text_id_esc
            );

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/$_tok_file_agg"),
              $text_id_esc
            );

            $aggr_tok->reset;
            $cons_tok->reset;
          }

          # Has to run after the external tokenization above and before
          # the structures are written
          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          }

          # ~ write structures ~
          if (!$structures->empty) {
            $structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
              $text_id_esc,
              2 # = structure serialization
            );
          }

          # ~ write tokens ~
          if ($_TOKENS_PROC && !$tokens->empty) {
            $tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
              $text_id_esc,
              $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
            );
          }

          $dir = ""; # reinit.

          # Maybe not necessary
          $data->reset;

        }
        else { # $dir eq ""

          # Log the collected primary data, not the collector object itself
          # (interpolating the object would presumably only print its
          # reference address)
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . ($data->data // ''));
        }

        next MAIN;
      }

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best, to keep the stripping of whitespace and to just
      # remove the if-clause and to insert a blank by default (with possibly
      # an option on how newlines in primary text should be handled
      # (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
      ### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        $tl++; # counter for text lines

        # insert blank before 1st character (for 2nd line and consecutive lines)
        s/^(.)/ $1/ if $tl > 1;
      }
      ###

      # add line to buffer
      $buf_in .= $_;
    }

  }
  elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {

    # ~ start of header ~
    $pfx = $1;
    my $content = "$2\n";

    # Nothing but whitespace is allowed in front of the header tag
    if ($pfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    }

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file;

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: main(): text_id=$text_id");

        $tl = 0; # reset (needed for ~ whitespace handling ~)
      }
    }
  }
} #end: while

$zipper->close;

$ext_tok->close if $_GEN_TOK_EXT;

exit(0);


# Walk recursively through the node list of the parsed text body
# (called from main()).
#
# Text and whitespace nodes are appended to $data; for every element node
# an annotation with from/to offsets, nesting level and attributes is
# recorded in $structures (and additionally in $tokens for $_TOKENS_TAG
# elements, if $_TOKENS_PROC is set).
#
# Parameters:
#   1. recursion level (1 = topmost level inside retr_info()
#      = should always be level of tag $_TEXT_BODY)
#   2. reference to an array reference holding the nodes of this level
#
# Besides the collectors, the file-level variables $e, $c, $fval, $add_one
# and %ws are used; $add_one and %ws carry state between nodes and
# recursion levels (~ whitespace related issue ~).
sub retr_info {
  my ($rl, $nodes) = @_;

  # With sentence splitting by the tokenizer, 's' elements of the input
  # are attached to a dummy annotation instead of a newly added one
  my $dummy_anno;
  $dummy_anno = $structures->new_dummy_annotation() if $use_tokenizer_sentence_splits;

  # See NOTES ON HOW

  # $nodes is a reference to an array reference
  foreach $e (@{${$nodes}}) {

    # Node type (see 'NODE TYPES' in manpage of XML::LibXML::Reader)
    my $type = $e->[0];

    #~~~~~
    # text- and whitespace-nodes
    #~~~~~

    # (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
    if ($type == XML_READER_TYPE_TEXT || $type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

      if ($type == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {

        # ~ whitespace-node ~
        $add_one = 0;

        # state, that this from-index belongs to a whitespace-node
        # ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
        $ws{$data->position}++;
      }
      else {

        # ~ text-node ~
        $add_one = 1;
      }

      # ~ update $data ~
      $data->append($e->[1]);

      next;
    }

    # Everything else than an element-node is not yet handled
    if ($type != XML_READER_TYPE_ELEMENT) {
      die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
    }

    #~~~~
    # element-node: opening tag
    #~~~~

    # $e->[1] represents the tag name
    my $tag = $e->[1];

    # ~ handle structures ~
    my $anno = ($use_tokenizer_sentence_splits && $tag eq "s")
      ? $dummy_anno
      : $structures->add_new_annotation($tag);

    # ~ handle tokens ~
    # Add element also to token list
    $tokens->add_annotation($anno) if $_TOKENS_PROC && $tag eq $_TOKENS_TAG;

    # ~ handle attributes ~
    # With 'XCT_ATTRIBUTE_ARRAY', $e->[3] is an array reference of the form
    # [ name1, value1, name2, value2, ....] of attribute names and
    # corresponding values (only defined, if attributes exist).
    if (defined $e->[3]) {

      # '$c' references the 'key' and '$c+1' the 'value'
      for ($c = 0; $c < @{$e->[3]}; $c += 2) {
        $anno->add_attribute(@{$e->[3]}[$c, $c + 1]);
      }
    }

    # ~ index 'from' ~
    # this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
    $anno->set_from($data->position + $add_one);

    # ~~ RECURSION ~~
    # Only descend, if there is an array of child-nodes
    # (there is none for e.g.: <back/>)
    retr_info($rl + 1, \$e->[$_IDX]) if defined $e->[$_IDX];

    #~~~~~
    # element-node: closing tag
    #~~~~~

    # note: the offsets are _between_ the characters
    # (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
    my $to_pos = $data->position;

    # ~ handle structures and tokens ~
    $fval = $anno->from;

    # ~ whitespace related issue ~
    # 'from - 1' is no whitespace-node, so the previous node was a text-node
    if ($fval > 0 && !exists $ws{$fval - 1}) {
      $anno->set_from($fval - 1);
    }

    # in case this fails, check input
    if (($fval - 1) > $to_pos) {
      die $log->fatal("text_id='$text_id', " .
                        "processing of structures: " .
                        "from-value ($fval) is 2 or more greater " .
                        "than to-value ($to_pos) => please check. Aborting");
    }

    # TODO: find example for which this case applies
    # maybe this is not necessary anymore, because the above recorrection of the from-value suffices
    #
    # TODO: check, if it's better to remove this line and change above check to 'if ( $fval - 1) >= $pos;
    # do testing with bigger corpus excerpt (wikipedia?)
    $anno->set_from($to_pos) if $fval == $to_pos + 1;
    $anno->set_to($to_pos);
    $anno->set_level($rl);

    # ~ whitespace related issue ~
    # clean up
    delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};

  } # end: foreach iteration

} # end: sub retr_info

__END__

=pod

=encoding utf8

=head1 NAME

tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML

=head1 SYNOPSIS

  cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip

=head1 DESCRIPTION

C<tei2korapxml> is a script to convert TEI P5 and
L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
based documents to the
L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
If no specific input is defined, data is
read from C<STDIN>. If no specific output is defined, data is written
to C<STDOUT>.

This program is usually called from inside another script.

Akronee434b12020-07-08 12:53:01 +0200636=head1 FORMATS
637
638=head2 Input restrictions
639
640=over 2
641
642=item
643
Akronee434b12020-07-08 12:53:01 +0200644TEI P5 formatted input with certain restrictions:
645
646=over 4
647
648=item
649
650B<mandatory>: text-header with integrated textsigle, text-body
651
652=item
653
654B<optional>: corp-header with integrated corpsigle,
655doc-header with integrated docsigle
656
657=back
658
659=item
660
Akron0c41ab32020-09-29 07:33:33 +0200661Tokens inside the primary text must not be
Akronee434b12020-07-08 12:53:01 +0200662separated by newlines, because newlines are removed
Akron0c41ab32020-09-29 07:33:33 +0200663(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akronee434b12020-07-08 12:53:01 +0200664into blanks between 2 tokens could lead to additional blanks,
665where there should be none (e.g.: punctuation characters like C<,> or
666C<.> should not be separated from their predecessor token).
667(see also code section C<~ whitespace handling ~>).
668
669=back
670
671=head2 Notes on the output
672
673=over 2
674
675=item
676
677zip file output (default on C<stdout>) with utf8 encoded entries
678(which together form the KorAP-XML format)
679
680=back
681
Akrond949e182020-02-14 12:23:57 +0100682=head1 INSTALLATION
683
684C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
685these bindings are available, the preferred way to install the script is
686to use L<cpanm|App::cpanminus>.
687
688 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
689
690In case everything went well, the C<tei2korapxml> tool will
691be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200692
Akrond949e182020-02-14 12:23:57 +0100693Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
694
695=head1 OPTIONS
696
697=over 2
698
Akron4e603a52020-07-27 14:23:49 +0200699=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100700
Akron4e603a52020-07-27 14:23:49 +0200701The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100702
703=item B<--help|-h>
704
705Print help information.
706
707=item B<--version|-v>
708
709Print version information.
710
Akron4e603a52020-07-27 14:23:49 +0200711=item B<--tokenizer-call|-tc>
712
713Call an external tokenizer process that tokenizes
714a single line from STDIN and outputs one token per line.
715
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200716=item B<--tokenizer-korap|-tk>
717
718Use the standard KorAP/DeReKo tokenizer.
719
Akron6d7b8e42020-09-29 07:37:41 +0200720=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200721
722Tokenize the data using two embedded tokenizers
723that take an I<aggressive> and a I<conservative>
724approach, respectively.
725
Akron1a5271a2021-02-18 13:18:15 +0100726=item B<--inline-tokens> <foundry>#[<file>]
727
728Define the foundry and file (without extension)
729to store inline token information in.
730If L</KORAPXMLTEI_INLINE> is set, this will contain
731annotations as well.
732Defaults to C<tokens> and C<morpho>.
733
Marc Kupietz985da0c2021-02-15 19:29:50 +0100734=item B<--use-tokenizer-sentence-splits|-s>
735
736Replace existing with, or add new, sentence boundary information
737provided by the KorAP tokenizer (currently the only tokenizer supporting this).
738
Akron3378dfd2020-08-01 15:01:36 +0200739=item B<--log|-l>
740
741Loglevel for I<Log::Any>. Defaults to C<notice>.
742
Akrond949e182020-02-14 12:23:57 +0100743=back
744
Akronb3649472020-09-29 08:24:46 +0200745=head1 ENVIRONMENT VARIABLES
746
747=over 2
748
749=item B<KORAPXMLTEI_DEBUG>
750
751Activate minimal debugging.
752Defaults to C<false>.
753
754=item B<KORAPXMLTEI_INLINE>
755
756Process inline annotations, if present.
757Defaults to C<false>.
758
759=back
760
Akrond949e182020-02-14 12:23:57 +0100761=head1 COPYRIGHT AND LICENSE
762
Marc Kupietze955ecc2021-02-17 17:42:01 +0100763Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100764
765Author: Peter Harders
766
Akronaabd0952020-09-29 07:35:08 +0200767Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100768
769L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
770Corpus Analysis Platform at the
771L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
772member of the
773L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
774
775This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100776L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100777
778=cut
Akronf8088e62021-02-18 16:18:59 +0100779
780# NOTES
781
782## Notes on how 'XML::CompactTree::XS' works
783
784Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
785
786Print out name of 'node2' for the above example:
787
788echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
789
790Exploring the structure of $data ( = reference to below array ):
791
792[ 0: XML_READER_TYPE_DOCUMENT,
793 1: ?
794 2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see main(): retr_info( \$tree_data->[2] ))
795 1: 'node'
796 2: ?
797 3: HASH (attributes)
798 4: 1 (line number)
799 5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
800 1: 'node1'
801 2: ?
802 3: undefined (no attributes)
803 4: 1 (line number)
804 5: [ 0: [ 0: XML_READER_TYPE_TEXT
805 1: 'some '
806 ]
807 1: [ 0: XML_READER_TYPE_ELEMENT
808 1: 'n'
809 2: ?
810 3: undefined (no attributes)
811 4: 1 (line number)
812 5: undefined (no child-nodes)
813 ]
814 2: [ 0: XML_READER_TYPE_TEXT
815 1: ' text'
816 ]
817 ]
818 ]
819 1: [ 0: XML_READER_TYPE_ELEMENT
820 1: 'node2'
821 2: ?
822 3: undefined (no attributes)
823 4: 1 (line number)
824 5: [ 0: [ 0: XML_READER_TYPE_TEXT
825 1: 'more-text'
826 ]
827 ]
828 ]
829 ]
830 ]
831 ]
832]
833
834$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
835
836ref($data->[2]) == ARRAY (with 1 element for 'node')
837ref($data->[2]->[0]) == ARRAY (with 6 elements)
838
839$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
840$data->[2]->[0]->[1] == 'node'
841ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
842$data->[2]->[0]->[4] == 1 (line number)
843ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
844 # child-nodes of actual node (see $_IDX)
845
846ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
847$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
848$data->[2]->[0]->[5]->[0]->[1] == 'node1'
849$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
850$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
851ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
852
853ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
854$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
855$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
856
857ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
858$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
859$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
860$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
861$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
862$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
863
864ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
865$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
866$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
867
868
869retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
870Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
871${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
872
873
874## Notes on whitespace handling
875
876Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
877(see function 'retr_info()').
878
879Definition of significant and insignificant whitespace
880(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
881
882Significant whitespace is part of the document content and should be preserved.
883Insignificant whitespace is used when editing XML documents for readability.
884These whitespaces are typically not intended for inclusion in the delivery of the document.
885
886### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
887
888The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
889 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
890
891When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
892 '</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
893 (XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
894
895echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' | perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
896
897
898Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
899
900Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
901 'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
902
903The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
904 its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
905
906The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
907 enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
908
909When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
910 So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
911 the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
912 the last read 'non-tag'-node has to be corrected (see [1]).
913
914For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
915 additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
916
917[1]
918Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
919 In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
920 (see above code fragment '... not exists $ws{ $fval - 1 } ...').
921
922[2]
923In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled as a
924 whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
925
926The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
927 (even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
928
929Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
930 and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
931
932
933## Notes on whitespace fixing
934
935The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
936 into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
937
938It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
939 example further down and notes on 'Input restrictions' in the manpage).
940
941Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
942
943Examples (how primary text with linebreaks would be converted by below code):
944
945 '...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
946 '...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
947
948Blanks are inserted before the 1st character:
949
950 NOTE: not stringent ('...' stands for text):
951
952 beg1............................end1 => no blank before 'beg1'
953 beg2....<pb/>...................end2 => no blank before 'beg2'
954 beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
955 beg4....<test>ok</test>.........end4 => blank before 'beg4'
956
957 => beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
958 ^
959 |_blank between 'end3' and 'beg4'
960
961
962## Notes on segfault prevention
963
964binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside 'main()'
965(see notes on 'PerlIO layers' in 'man XML::LibXML').
966Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
967See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
968See in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.