Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 8d15173adabe055c6fc1e0ae1b96036262c6e797 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	31	eval {
				32	require KorAP::XML::TEI::Tokenizer::KorAP;
				33	1;
				34	};
Peter Harders	1c5ce15	2020-07-22 18:02:50 +0200	[diff] [blame]	35
Marc Kupietz	a1421f0	2021-02-18 15:32:38 +0100	[diff] [blame]	36	our $VERSION = '1.00';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	37
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	38	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				39
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	40	# Set KORAPXMLTEI_DEBUG to 1 for slightly more debug output (no need to be parametrized)
				41	use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	42
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	43	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	44	GetOptions(
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	45	"root\|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
				46	"input\|i=s" => \(my $input_fname = ''), # input file (yet only TEI I5 Format accepted)
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	47	'tokenizer-call\|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	48	'tokenizer-korap\|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	49	'tokenizer-internal\|ti' => \(my $_GEN_TOK_INT), # use intern tokenization (default = no)
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	50	'use-tokenizer-sentence-splits\|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	51	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	52	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	53	'base-foundry=s' => \(my $_tok_dir = 'base'),
				54	'data-file=s' => \(my $_data_file = 'data'),
				55	'header-file=s' => \(my $_header_file = 'header'),
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	56	'tokens-file=s' => \(my $_tok_file_ext = 'tokens'),
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	57	'log\|l=s' => \(my $log_level = 'notice'),
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	58	'help\|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	59	pod2usage(
				60	-verbose => 99,
				61	-sections => 'NAME\|DESCRIPTION\|SYNOPSIS\|ARGUMENTS\|OPTIONS',
				62	-msg => $VERSION_MSG,
				63	-output => '-'
				64	)
				65	},
				66	'version\|v' => sub {
				67	pod2usage(
				68	-verbose => 0,
				69	-msg => $VERSION_MSG,
				70	-output => '-'
				71	)
				72	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	73	);
				74
Marc Kupietz	44b1f25	2020-11-26 16:31:40 +0100	[diff] [blame]	75	binmode(STDERR, ":encoding(UTF-8)");
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	76	Log::Any::Adapter->set('Stderr', log_level => $log_level);
				77
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	78	$log->notice('Debugging is activated') if DEBUG;
				79
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	80	#
				81	# ~~~ parameter (mandatory) ~~~
				82	#
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	83	my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	84	# optional
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	85	my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and evtl. add an '.*' between them
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	86	# optional
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	87	my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analog
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	88	# mandatory
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	89	my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analog
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	90
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	91
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	92	## external tokenization
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	93	my $_GEN_TOK_EXT = $tokenizer_call \|\| $tokenizer_korap ? 1 : 0;
				94
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	95	if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
				96	die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it");
				97	}
				98
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	99	my $ext_tok;
				100	if ($tokenizer_call) {
				101	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
				102	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	103
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	104	elsif ($tokenizer_korap) {
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	105	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	106	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	107	##
				108
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	109
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	110	#
				111	# ~~~ constants ~~~
				112	#
				113
				114
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	115	## internal tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	116	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				117	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	118	##
				119
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	120	## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
				121	my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKEN_TAG}'s (default: 1)
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	122
				123
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	124	# Name of the directory and the file containing all inline structure information
				125	# except for $_TOKENS_TAG information
				126	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
				127	$_structure_file .= '.xml';
				128
				129
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	130	# Name of the directory and the file containing all inline token information
				131	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				132	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
				133	$_tokens_file .= '.xml';
				134
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	135	my $_TOKENS_TAG = "w"; # name of tag containing all information stored in $_tokens_file
				136
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	137	# Handling inline annotations (inside $_TOKENS_TAG)
				138	my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	139
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	140
				141	#
				142	# ~~~ variables ~~~
				143	#
				144
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	145	# Initialize Token- and Structure-Collector
				146	my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
				147	my $structures = KorAP::XML::TEI::Annotations::Collector->new;
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	148
				149
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	150	# Initialize Data-Collector
				151	my $data = KorAP::XML::TEI::Data->new;
				152
				153
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	154	# Initialize zipper
Akron	3bdc0a3	2020-08-03 12:12:56 +0200	[diff] [blame]	155	my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	156	my $input_fh; # input file handle (default: stdin)
				157
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	158	my $dir; # text directory (below $_root_dir)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	159
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	160	my ( $text_id,
				161	$text_id_esc ); # '$text_id_esc' = escaped version of $text_id
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	162
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	163	# these are only used inside recursive function 'retr_info'
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	164	my ( $_IDX, # value is set dependent on DEBUG - for extracting array of child elements from element in $tree_data
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	165	$e, # element from $tree_data
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	166	## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
				167	$add_one, # ...
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	168	$fval, # ...
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	169	%ws); # hash for indices of whitespace-nodes (needed to recorrect from-values)
				170	# idea: when closing an element, check whether its from-index minus 1 refers to a whitespace-node
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	171	# (means: 'from-index - 1' is a key in %ws).
				172	# if this is _not_ the case, then the from-value is one too high => correct it by subtracting 1
				173
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	174	my $c; # index variables used in loops
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	175
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	176
				177	#
				178	# ~~~ main ~~~
				179	#
				180
				181	# ~ initializations ~
				182
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	183	# Include line numbers in elements of $tree_data for debugging
				184	DEBUG ? ($_IDX = 5) : ($_IDX = 4);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	185
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	186	$fval = 0;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	187
Akron	ec2cef2	2020-07-31 10:00:15 +0200	[diff] [blame]	188	# Normalize regex for header parsing
				189	for ($_CORP_HEADER_BEG,
				190	$_DOC_HEADER_BEG,
				191	$_TEXT_HEADER_BEG) {
				192	s!^([^\s]+)(.)$!$1\[\^>\]$2!;
				193	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	194
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	195
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	196	# ~ read input and write output (text by text) ~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	197
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	198	my $tl = 0; # text line (needed for whitespace handling)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	199
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	200	$input_fh = *STDIN; # input file handle (default: stdin)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	201
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	202	# Maybe not necessary
				203	$data->reset;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	204
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	205	$dir = '';
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	206
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	207	if ( $input_fname ne '' ){
				208	unless (open($input_fh, '<', $input_fname)) {
				209	die $log->fatal("File '$input_fname' could not be opened.");
				210	};
				211	}
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	212
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	213	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	214	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	215
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	216	my $sfx;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	217	my $pos;
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	218	my $input_enc = 'UTF-8';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	219	my $l = length('</' . $_TEXT_BODY) + 1;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	220
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	221	# ~ loop (reading input document) ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	222
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	223	MAIN: while ( <$input_fh> ){
				224
				225	$_ = remove_xml_comments( $input_fh, $_ ); # remove HTML (multi-line) comments (<!--...-->)
				226
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	227	# Set input encoding
				228	if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
				229	$input_enc = $2;
				230	next;
				231	};
				232
				233	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	234	$_ = replace_entities($_);
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	235
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	236	if ( index($_, $_TEXT_BODY) >= 0 && m#^(.)<${_TEXT_BODY}(?: [^>])?>(.*)$# ){
				237
				238	# ~ start of text body ~
				239
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	240	$sfx = $2;
				241
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	242	if ($1 !~ /^\s$/ \|\| $sfx !~ /^\s$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	243	die $log->fatal("input line number $.: " .
				244	"line with opening text-body tag '${_TEXT_BODY}' " .
				245	"contains additional information ... => Aborting (line=$_)");
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	246	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	247
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	248	# text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
				249	my $buf_in = '';
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	250
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	251	# Iterate over all lines in the text body
				252	while (<$input_fh>) {
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	253
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	254	$_ = remove_xml_comments( $input_fh, $_ );
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	255	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	256	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	257
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	258	# ~ end of text body ~
				259	if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	260
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	261	# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	262
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	263	if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
				264	die $log->fatal("input line number $.: " .
				265	"line with closing text-body tag '${_TEXT_BODY}'".
				266	" contains additional information ... => Aborting (line=$_)");
				267	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	268
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	269	if ($dir eq '') {
				270	$log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($data->data, 0, 200));
				271	next MAIN;
				272	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	273
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	274	my $reader = XML::LibXML::Reader->new(
				275	string => "<text>$buf_in</text>",
				276	huge => 1
				277	);
				278
				279	# See notes on whitespace handling
				280	my $param = XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_ATTRIBUTE_ARRAY;
				281
				282	# XCT_LINE_NUMBERS is only needed for debugging
				283	# (see XML::CompactTree::XS)
				284	$param \|= XCT_LINE_NUMBERS if DEBUG;
				285	my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);
				286
				287	$structures->reset;
				288
				289	$tokens->reset if $_TOKENS_PROC;
				290
				291	# ~ whitespace related issue ~
				292	$add_one = 0;
				293	%ws = ();
				294
				295	# ~ recursion ~
				296	retr_info(1, \$tree_data->[2] ); # parse input data
				297
				298	if (DEBUG) {
				299	$log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
				300	};
				301
				302	# ~ write data.xml ~
				303	$data->to_zip(
				304	$zipper->new_stream("$dir/${_data_file}.xml"),
				305	$text_id_esc
				306	);
				307
				308	# ~ tokenization ~
				309	if ($_GEN_TOK_EXT) {
				310
				311	# Tokenize and output
				312	$ext_tok->tokenize($data->data)->to_zip(
				313	$zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
				314	$text_id_esc
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	315	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	316	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	317
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	318	if ($_GEN_TOK_INT) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	319
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	320	# Tokenize and output
				321	$cons_tok->tokenize($data->data)->to_zip(
				322	$zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	323	$text_id_esc
				324	);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	325
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	326	$aggr_tok->tokenize($data->data)->to_zip(
				327	$zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
				328	$text_id_esc
				329	);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	330
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	331	$aggr_tok->reset;
				332	$cons_tok->reset;
				333	};
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	334
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	335	if ($use_tokenizer_sentence_splits) {
				336	$ext_tok->sentencize_from_previous_input($structures);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	337	}
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	338
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame^]	339	# ~ write structures ~
				340	if (!$structures->empty) {
				341	$structures->to_zip(
				342	$zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
				343	$text_id_esc,
				344	2 # = structure serialization
				345	);
				346	};
				347
				348	# ~ write tokens ~
				349	if ($_TOKENS_PROC && !$tokens->empty) {
				350	$tokens->to_zip(
				351	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
				352	$text_id_esc,
				353	$_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
				354	);
				355	};
				356
				357	# reinit.
				358	$dir = '';
				359
				360	# Maybe not necessary
				361	$data->reset;
				362
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	363	next MAIN;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	364	};
				365
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	366	# ~ inside text body ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	367
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	368	# ~ whitespace handling ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	369
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	370	# Fix whitespaces (see notes on whitespace fixing)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	371
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	372	# TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
				373	# an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
				374
				375	# Remove consecutive whitespace at beginning and end (mostly one newline)
				376	s/^\s+//; s/\s+$//;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	377
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	378	### NOTE: this is only relevant, if a text consists of more than one line
				379	### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
				380	### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
				381	if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	382
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	383	$tl++; # counter for text lines
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	384
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	385	s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
				386	}
				387	###
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	388
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	389	# add line to buffer
				390	$buf_in .= $_;
				391	};
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	392
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	393	} elsif (m#^(.)(<(?:${_TEXT_HEADER_BEG}\|${_DOC_HEADER_BEG}\|${_CORP_HEADER_BEG}).)$#) {
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	394
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	395	# ~ start of header ~
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	396	my $content = "$2\n";
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	397
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	398	if ($1 !~ /^\s*$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	399	die $log->fatal("input line number $.: " .
				400	"line with opening header tag" .
				401	" is not in expected format ... => Aborting (line=$_)");
				402	};
				403
				404	# Parse header
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	405	my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	406
				407	# Header was parseable
				408	if ($header) {
				409
				410	# Write header to zip
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	411	my $file = $header->dir . '/' . $_header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	412
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	413	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	414
				415	$header->to_zip($zipper->new_stream($file));
				416
				417	# Header is for text level
				418	if ($header->type eq 'text') {
				419
				420	# Remember dir and sigles
				421	$dir = $header->dir;
				422	$text_id = $header->id;
				423	$text_id_esc = $header->id_esc;
				424
				425	# log output for seeing progression
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	426	$log->notice("$0: text_id=$text_id");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	427
				428	$tl = 0; # reset (needed for ~ whitespace handling ~)
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	429	}
				430	}
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	431	}
				432	} #end: while
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	433
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	434	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	435
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	436	$ext_tok->close if $_GEN_TOK_EXT;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	437
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	438	exit(0);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	439
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	440
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	441	# Recursively called function to handle XML tree data
				442	sub retr_info {
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	443	# recursion level
				444	# (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
				445	my $rl = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	446
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	447	my $dummy_anno;
				448	if ($use_tokenizer_sentence_splits) {
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	449	$dummy_anno = $structures->new_dummy_annotation;
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	450	}
				451
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	452	# Iteration through all array elements
				453	# ($_[0] is a reference to an array reference)
				454	# See notes on how 'XML::CompactTree::XS' works and
				455	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
				456	foreach $e (@{${$_[0]}}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	457
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	458	# Element node
				459	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	460
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	461	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	462	# from here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	463	#~~~~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	464
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	465	my $anno;
				466
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	467	# $e->[1] represents the tag name
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	468	if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
				469	$anno = $dummy_anno;
				470	} else {
				471	$anno = $structures->add_new_annotation($e->[1]);
				472	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	473
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	474
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	475	# Add element also to token list
				476	if ($_TOKENS_PROC && $e->[1] eq $_TOKENS_TAG) {
				477	$tokens->add_annotation($anno);
				478	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	479
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	480	# Handle attributes (if attributes exist)
				481	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	482
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	483	# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
				484	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				485	# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
				486	for ($c = 0; $c < @{$e->[3]}; $c += 2) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	487
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	488	# '$c' references the 'key' and '$c+1' the 'value'
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	489	$anno->add_attribute(
				490	@{$e->[3]}[$c, $c + 1]
				491	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	492	};
				493	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	494
				495	# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	496	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	497
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	498
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	499	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	500	# until here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	501	#~~~~
				502
				503
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	504	# Call function recursively
				505	# do no recursion, if $e->[$_IDX] is not defined
				506	# (because we have no array of child-nodes, e.g.: <back/>)
				507	if (defined $e->[$_IDX]) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	508
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	509	# Recursion with array of child-nodes
				510	retr_info($rl+1, \$e->[$_IDX]);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	511	}
				512
				513
				514	#~~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	515	# from here: tag-node (closing)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	516	#~~~~~
				517
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	518	# NOTE: use $pos, because the offsets are _between_ the characters
				519	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	520	my $pos = $data->position;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	521
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	522	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	523
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	524	$fval = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	525
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	526	# ~ whitespace related issue ~
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	527	if ($fval > 0 && not exists $ws{$fval - 1}) {
				528
				529	# ~ previous node was a text-node ~
				530	$anno->set_from($fval - 1);
				531	}
				532
				533	# in case this fails, check input
				534	if (($fval - 1) > $pos) {
				535	die $log->fatal("text_id='$text_id', " .
				536	"processing of structures: " .
				537	"from-value ($fval) is 2 or more greater " .
				538	"than to-value ($pos) => please check. Aborting");
				539	};
				540
				541	# TODO: find example for which this case applies
				542	# maybe this is not necessary anymore, because the above recorrection of the from-value suffices
				543	#
				544	# TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
				545	# do testing with bigger corpus excerpt (wikipedia?)
				546	$anno->set_from($pos) if $fval == $pos + 1;
				547	$anno->set_to($pos);
				548	$anno->set_level($rl);
				549
				550	# Clean up whitespace
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	551	delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	552
				553
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	554	#~~~~
				555	# until here: tag-node (closing)
				556	#~~~~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	557	}
				558
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	559	# Text node
				560	elsif ($e->[0] == XML_READER_TYPE_TEXT){
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	561
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	562	$add_one = 1;
				563	$data->append($e->[1]);
				564	}
				565
				566	# Whitespace node
				567	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				568	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				569
				570	# state, that this from-index belongs to a whitespace-node
				571	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				572	$ws{$data->position}++;
				573
				574	$add_one = 0;
				575	$data->append($e->[1]);
				576	}
				577
				578	# not yet handled type
				579	else {
				580
				581	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				582	};
				583	};
				584	};
				585
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	586
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	587	__END__
				588
				589	=pod
				590
				591	=encoding utf8
				592
				593	=head1 NAME
				594
				595	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				596
				597	=head1 SYNOPSIS
				598
				599	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				600
				601	=head1 DESCRIPTION
				602
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	603	C<tei2korapxml> is a script to convert TEI P5 and
				604	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				605	based documents to the
				606	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				607	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	608	read from C<STDIN>. If no specific output is defined, data is written
				609	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	610
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	611	This program is usually called from inside another script.
				612
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	613	=head1 FORMATS
				614
				615	=head2 Input restrictions
				616
				617	=over 2
				618
				619	=item
				620
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	621	TEI P5 formatted input with certain restrictions:
				622
				623	=over 4
				624
				625	=item
				626
				627	B<mandatory>: text-header with integrated textsigle, text-body
				628
				629	=item
				630
				631	B<optional>: corp-header with integrated corpsigle,
				632	doc-header with integrated docsigle
				633
				634	=back
				635
				636	=item
				637
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	638	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	639	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	640	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	641	into blanks between 2 tokens could lead to additional blanks,
				642	where there should be none (e.g.: punctuation characters like C<,> or
				643	C<.> should not be separated from their predecessor token).
				644	(see also code section C<~ whitespace handling ~>).
				645
				646	=back
				647
				648	=head2 Notes on the output
				649
				650	=over 2
				651
				652	=item
				653
				654	zip file output (default on C<stdout>) with utf8 encoded entries
				655	(which together form the KorAP-XML format)
				656
				657	=back
				658
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	659	=head1 INSTALLATION
				660
				661	C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
				662	these bindings are available, the preferred way to install the script is
				663	to use L<cpanm\|App::cpanminus>.
				664
				665	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				666
				667	In case everything went well, the C<tei2korapxml> tool will
				668	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	669
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	670	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				671
				672	=head1 OPTIONS
				673
				674	=over 2
				675
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	676	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	677
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	678	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	679
				680	=item B<--help\|-h>
				681
				682	Print help information.
				683
				684	=item B<--version\|-v>
				685
				686	Print version information.
				687
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	688	=item B<--tokenizer-call\|-tc>
				689
				690	Call an external tokenizer process, that will tokenize
				691	a single line from STDIN and output one token per line.
				692
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	693	=item B<--tokenizer-korap\|-tk>
				694
				695	Use the standard KorAP/DeReKo tokenizer.
				696
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	697	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	698
				699	Tokenize the data using two embedded tokenizers,
				700	that will take an I<aggressive> and a I<conservative>
				701	approach.
				702
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	703	=item B<--inline-tokens> <foundry>#[<file>]
				704
				705	Define the foundry and file (without extension)
				706	to store inline token information in.
				707	If L</KORAPXMLTEI_INLINE> is set, this will contain
				708	annotations as well.
				709	Defaults to C<tokens> and C<morpho>.
				710
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	711	=item B<--inline-structures> <foundry>#[<file>]
				712
				713	Define the foundry and file (without extension)
				714	to store inline structure information in.
				715	Defaults to C<struct> and C<structures>.
				716
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	717	=item B<--base-foundry> <foundry>
				718
				719	Define the base foundry to store newly generated
				720	token information in.
				721	Defaults to C<base>.
				722
				723	=item B<--data-file> <file>
				724
				725	Define the file (without extension)
				726	to store primary data information in.
				727	Defaults to C<data>.
				728
				729	=item B<--header-file> <file>
				730
				731	Define the file name (without extension)
				732	to store header information on
				733	the corpus, document, and text level in.
				734	Defaults to C<header>.
				735
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	736	=item B<--use-tokenizer-sentence-splits\|-s>
				737
				738	Replace existing with, or add new, sentence boundary information
				739	provided by the KorAP tokenizer (currently the only tokenizer supporting this).
				740
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	741	=item B<--tokens-file> <file>
				742
				743	Define the file (without extension)
				744	to store generated token information in
				745	(either from the KorAP tokenizer or an externally called tokenizer).
				746	Defaults to C<tokens>.
				747
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	748	=item B<--log\|-l>
				749
				750	Loglevel for I<Log::Any>. Defaults to C<notice>.
				751
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	752	=back
				753
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	754	=head1 ENVIRONMENT VARIABLES
				755
				756	=over 2
				757
				758	=item B<KORAPXMLTEI_DEBUG>
				759
				760	Activate minimal debugging.
				761	Defaults to C<false>.
				762
				763	=item B<KORAPXMLTEI_INLINE>
				764
				765	Process inline annotations, if present.
				766	Defaults to C<false>.
				767
				768	=back
				769
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	770	=head1 COPYRIGHT AND LICENSE
				771
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	772	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	773
				774	Author: Peter Harders
				775
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	776	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	777
				778	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				779	Corpus Analysis Platform at the
				780	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				781	member of the
				782	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				783
				784	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	785	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	786
				787	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	788
				789	# NOTES
				790
				791	## Notes on how 'XML::CompactTree::XS' works
				792
				793	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				794
				795	Print out name of 'node2' for the above example:
				796
				797	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				798
				799	Exploring the structure of $data ( = reference to below array ):
				800
				801	[ 0: XML_READER_TYPE_DOCUMENT,
				802	1: ?
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	803	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	804	1: 'node'
				805	2: ?
				806	3: HASH (attributes)
				807	4: 1 (line number)
				808	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				809	1: 'node1'
				810	2: ?
				811	3: undefined (no attributes)
				812	4: 1 (line number)
				813	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				814	1: 'some '
				815	]
				816	1: [ 0: XML_READER_TYPE_ELEMENT
				817	1: 'n'
				818	2: ?
				819	3: undefined (no attributes)
				820	4: 1 (line number)
				821	5: undefined (no child-nodes)
				822	]
				823	2: [ 0: XML_READER_TYPE_TEXT
				824	1: ' text'
				825	]
				826	]
				827	]
				828	1: [ 0: XML_READER_TYPE_ELEMENT
				829	1: 'node2'
				830	2: ?
				831	3: undefined (no attributes)
				832	4: 1 (line number)
				833	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				834	1: 'more-text'
				835	]
				836	]
				837	]
				838	]
				839	]
				840	]
				841	]
				842
				843	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				844
				845	ref($data->[2]) == ARRAY (with 1 element for 'node')
				846	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				847
				848	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				849	$data->[2]->[0]->[1] == 'node'
				850	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				851	$data->[2]->[0]->[4] == 1 (line number)
				852	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
				853	# child-nodes of actual node (see $_IDX)
				854
				855	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				856	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				857	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				858	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				859	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				860	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				861
				862	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				863	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				864	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				865
				866	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				867	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				868	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				869	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				870	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				871	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				872
				873	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				874	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				875	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				876
				877
				878	retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
				879	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				880	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				881
				882
				883	## Notes on whitespace handling
				884
				885	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
				886	(see function 'retr_info()').
				887
				888	Definition of significant and insignificant whitespace
				889	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				890
				891	Significant whitespace is part of the document content and should be preserved.
				892	Insignificant whitespace is used when editing XML documents for readability.
				893	These whitespaces are typically not intended for inclusion in the delivery of the document.
				894
				895	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				896
				897	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				898	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				899
				900	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				901	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				902	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				903
				904	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				905
				906
				907	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				908
				909	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				910	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				911
				912	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				913	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				914
				915	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				916	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				917
				918	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				919	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				920	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				921	the last read 'non-tag'-node has to be corrected (see [1]).
				922
				923	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				924	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				925
				926	[1]
				927	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				928	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
				929	(see above code fragment '... not exists $ws{ $fval - 1 } ...').
				930
				931	[2]
				932	Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is in both cases handled as a
				933	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				934
				935	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				936	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				937
				938	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
				939	and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
				940
				941
				942	## Notes on whitespace fixing
				943
				944	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				945	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				946
				947	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				948	example further down and notes on 'Input restrictions' in the manpage).
				949
				950	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				951
				952	Examples (how primary text with linebreaks would be converted by below code):
				953
				954	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				955	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
				956
				957	Blanks are inserted before the 1st character:
				958
				959	NOTE: not stringent ('...' stands for text):
				960
				961	beg1............................end1 => no blank before 'beg1'
				962	beg2....<pb/>...................end2 => no blank before 'beg2'
				963	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				964	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				965
				966	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				967	^
				968	\|_blank between 'end3' and 'beg4'
				969
				970
				971	## Notes on segfault prevention
				972
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	973	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	974	(see notes on 'PerlIO layers' in 'man XML::LibXML').
				975	Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
				976	See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
				977	See in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.