Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 784ed1a0af7a126c36cc782c07e29eddd1fc28a4 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	31	eval {
				32	require KorAP::XML::TEI::Tokenizer::KorAP;
				33	1;
				34	};
Peter Harders	1c5ce15	2020-07-22 18:02:50 +0200	[diff] [blame]	35
Marc Kupietz	a1421f0	2021-02-18 15:32:38 +0100	[diff] [blame]	36	our $VERSION = '1.00';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	37
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	38	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				39
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	40	# Set the environment variable KORAPXMLTEI_DEBUG to 1 for slightly more debug output (no need to be parametrized)
				41	use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	42
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	43	# Parse options from the command line. NOTE(review): the '\|' in the option specifications looks like escaping residue for '|' - confirm against the upstream file
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	44	GetOptions(
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	45	"root\|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
				46	"input\|i=s" => \(my $input_fname = ''), # input file (so far only the TEI I5 format is accepted; STDIN is read if empty)
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	47	'tokenizer-call\|tc=s' => \(my $tokenizer_call), # call of an external tokenizer (temporary argument for testing purposes)
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	48	'tokenizer-korap\|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	49	'tokenizer-internal\|ti' => \(my $_GEN_TOK_INT), # use internal tokenization (default = no)
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	50	'use-tokenizer-sentence-splits\|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split sentences (default = no)
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	51	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'), # '<directory>#<file>' for inline token annotations
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	52	'inline-structures=s' => \(my $inline_structures = 'struct#structure'), # '<directory>#<file>' for inline structure annotations
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	53	'base-foundry=s' => \(my $_tok_dir = 'base'), # directory (below the text directory) for tokenization output
				54	'data-file=s' => \(my $_data_file = 'data'), # base name (without '.xml') of the primary data file
				55	'header-file=s' => \(my $_header_file = 'header'), # base name (without '.xml') of the header file
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	56	'tokens-file=s' => \(my $_tok_file_ext = 'tokens'), # base name (without '.xml') of the file written for the external tokenizer
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	57	'log\|l=s' => \(my $log_level = 'notice'), # log level passed to the Stderr log adapter
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	58	'help\|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	59	pod2usage(
				60	-verbose => 99,
				61	-sections => 'NAME\|DESCRIPTION\|SYNOPSIS\|ARGUMENTS\|OPTIONS',
				62	-msg => $VERSION_MSG,
				63	-output => '-'
				64	)
				65	},
				66	'version\|v' => sub {
				67	pod2usage(
				68	-verbose => 0,
				69	-msg => $VERSION_MSG,
				70	-output => '-'
				71	)
				72	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	73	);
				74
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame^]	75	# Establish logger (STDERR is set to UTF-8, the adapter uses the requested log level)
Marc Kupietz	44b1f25	2020-11-26 16:31:40 +0100	[diff] [blame]	76	binmode(STDERR, ":encoding(UTF-8)");
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	77	Log::Any::Adapter->set('Stderr', log_level => $log_level);
				78
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	79	$log->notice('Debugging is activated') if DEBUG;
				80
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	81	# Name of the tag (without attributes) that contains the primary text
				82	my $_TEXT_BODY = 'text';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	83	# optional
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	84
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	85	# TODO: IDS-specific (and redundant)
				86	my $_HEADER_TAG = 'idsHeader';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	87
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	88	if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
				89	die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it"); # NOTE(review): the message lacks its closing ')'
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame^]	90	};
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	91
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	92	my $ext_tok; # external tokenizer object (stays undefined if neither tokenizer option is given)
				93	if ($tokenizer_call) {
				94	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
				95	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	96
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	97	elsif ($tokenizer_korap) {
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	98	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits); # NOTE(review): the module is only loaded inside an unchecked eval/require, so a failed load presumably surfaces here - confirm
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	99	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	100	##
				101
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	102
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	103	#
				104	# ~~~ constants ~~~
				105	#
				106
				107
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	108	## internal tokenization
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame^]	109	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				110	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	111	##
				112
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame^]	113	# Processing of $_TOKENS_TAG elements - on/off (default: 1)
				114	my $_TOKENS_PROC = 1;
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	115
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	116	# Name of the directory and the file containing all inline structure information
				117	# except for $_TOKENS_TAG information
				118	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure'; # the appended '#structure' supplies the file name if the option value contains no '#'
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	119
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	120	# Name of the directory and the file containing all inline token information
				121	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				122	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho'; # the appended '#morpho' supplies the file name if the option value contains no '#'
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	123
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame^]	124	# name of the tag containing all information stored in $_tokens_file
				125	my $_TOKENS_TAG = "w";
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	126
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	127	# Handling inline annotations (inside $_TOKENS_TAG); switched on by a true value of the environment variable KORAPXMLTEI_INLINE
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame^]	128	my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
				129
				130	# Initialize Token- and Structure-Collector
				131	my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
				132	my $structures = KorAP::XML::TEI::Annotations::Collector->new;
				133
				134	# Initialize Data-Collector
				135	my $data = KorAP::XML::TEI::Data->new;
				136
				137	# Initialize zipper (constructed with the root directory option)
				138	my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	139
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	140
				141	#
				142	# ~~~ variables ~~~
				143	#
				144
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	145
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	146	my $input_fh; # input file handle (default: stdin)
				147
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	148	my $dir; # text directory (below $_root_dir)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	149
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	150	my ( $text_id,
				151	$text_id_esc ); # '$text_id_esc' = escaped version of $text_id
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	152
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	153	# these are used inside the recursive function 'retr_info' (and initialized in the main part)
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	154	my ( $_IDX, # value is set dependent on DEBUG - for extracting array of child elements from element in $tree_data
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	155	$e, # element from $tree_data
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	156	## variables for handling ~ whitespace related issue ~ (it is sometimes necessary to correct the from-values for some tags)
				157	$add_one, # ...
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	158	$fval, # ...
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	159	%ws); # hash for indices of whitespace-nodes (needed to correct from-values)
				160	# idea: when closing an element, check if its from-index minus 1 refers to a whitespace-node
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	161	# (means: 'from-index - 1' is a key in %ws).
				162	# if this is _not_ the case, then the from-value is one too high => correct it by subtracting 1
				163
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	164	my $c; # index variable used in the attribute loop of 'retr_info'
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	165
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	166
				167	#
				168	# ~~~ main ~~~
				169	#
				170
				171	# ~ initializations ~
				172
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	173	# Index of the child-node array in elements of $tree_data (one higher when line numbers are included for debugging)
				174	DEBUG ? ($_IDX = 5) : ($_IDX = 4);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	175
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	176	$fval = 0;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	177
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	178	# ~ read input and write output (text by text) ~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	179
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	180	my $tl = 0; # text line (needed for whitespace handling)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	181
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	182	$input_fh = *STDIN; # input file handle (default: stdin)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	183
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	184	# Maybe not necessary
				185	$data->reset;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	186
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	187	$dir = '';
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	188
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	189	if ( $input_fname ne '' ){
				190	unless (open($input_fh, '<', $input_fname)) {
				191	die $log->fatal("File '$input_fname' could not be opened.");
				192	};
				193	}
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	194
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	195	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	196	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	197
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	198	my $sfx; # remainder of the line after the opening text-body tag
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	199	my $pos; # position of the closing text-body tag within the current line
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	200	my $input_enc = 'UTF-8'; # default; replaced by the encoding named in the XML declaration, if one is found
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	201	my $l = length('</' . $_TEXT_BODY) + 1; # length of the closing text-body tag ('</' + tag name + one character for '>')
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	202
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	203	# ~ loop (reading input document) ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	204
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	205	MAIN: while ( <$input_fh> ){
				206
				207	$_ = remove_xml_comments( $input_fh, $_ ); # remove (multi-line) XML comments (<!--...-->)
				208
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	209	# Set input encoding from the XML declaration (the declaration line itself is then skipped)
				210	if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) { # NOTE(review): inside a bracketed character class '\1' is not a backreference - confirm the intended pattern
				211	$input_enc = $2;
				212	next;
				213	};
				214
				215	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	216	$_ = replace_entities($_);
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	217
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	218	if ( index($_, $_TEXT_BODY) >= 0 && m#^(.)<${_TEXT_BODY}(?: [^>])?>(.*)$# ){ # NOTE(review): '(.)' and '[^>]' match one character only here and the checks below use '\s' without quantifier - quantifiers look lost, confirm against upstream
				219
				220	# ~ start of text body ~
				221
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	222	$sfx = $2;
				223
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	224	if ($1 !~ /^\s$/ \|\| $sfx !~ /^\s$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	225	die $log->fatal("input line number $.: " .
				226	"line with opening text-body tag '${_TEXT_BODY}' " .
				227	"contains additional information ... => Aborting (line=$_)");
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	228	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	229
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	230	# text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
				231	my $buf_in = '';
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	232
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	233	# Iterate over all lines in the text body
				234	while (<$input_fh>) {
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	235
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	236	$_ = remove_xml_comments( $input_fh, $_ );
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	237	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	238	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	239
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	240	# ~ end of text body ~
				241	if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	242
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	243	# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	244
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	245	if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
				246	die $log->fatal("input line number $.: " .
				247	"line with closing text-body tag '${_TEXT_BODY}'".
				248	" contains additional information ... => Aborting (line=$_)");
				249	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	250
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	251	if ($dir eq '') {
				252	$log->warn("Maybe empty textSigle => skipping this text ...\ndata=" . substr($data->data, 0, 200));
				253	next MAIN;
				254	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	255
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	256	my $reader = XML::LibXML::Reader->new(
				257	string => "<text>$buf_in</text>",
				258	huge => 1
				259	);
				260
				261	# See notes on whitespace handling
				262	my $param = XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_ATTRIBUTE_ARRAY;
				263
				264	# XCT_LINE_NUMBERS is only needed for debugging
				265	# (see XML::CompactTree::XS)
				266	$param \|= XCT_LINE_NUMBERS if DEBUG;
				267	my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);
				268
				269	$structures->reset;
				270
				271	$tokens->reset if $_TOKENS_PROC;
				272
				273	# ~ whitespace related issue ~
				274	$add_one = 0;
				275	%ws = ();
				276
				277	# ~ recursion ~
				278	retr_info(1, \$tree_data->[2] ); # parse input data
				279
				280	if (DEBUG) {
				281	$log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
				282	};
				283
				284	# ~ write data.xml ~
				285	$data->to_zip(
				286	$zipper->new_stream("$dir/${_data_file}.xml"),
				287	$text_id_esc
				288	);
				289
				290	# ~ tokenization ~
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	291	if ($ext_tok) {
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	292
				293	# Tokenize and output
				294	$ext_tok->tokenize($data->data)->to_zip(
				295	$zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
				296	$text_id_esc
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	297	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	298	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	299
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	300	if ($_GEN_TOK_INT) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	301
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	302	# Tokenize and output
				303	$cons_tok->tokenize($data->data)->to_zip(
				304	$zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	305	$text_id_esc
				306	);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	307
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	308	$aggr_tok->tokenize($data->data)->to_zip(
				309	$zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
				310	$text_id_esc
				311	);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	312
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	313	$aggr_tok->reset;
				314	$cons_tok->reset;
				315	};
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	316
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	317	if ($use_tokenizer_sentence_splits) {
				318	$ext_tok->sentencize_from_previous_input($structures);
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	319	};
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	320
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	321	# ~ write structures ~
				322	if (!$structures->empty) {
				323	$structures->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame^]	324	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	325	$text_id_esc,
				326	2 # = structure serialization
				327	);
				328	};
				329
				330	# ~ write tokens ~
				331	if ($_TOKENS_PROC && !$tokens->empty) {
				332	$tokens->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame^]	333	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	334	$text_id_esc,
				335	$_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
				336	);
				337	};
				338
				339	# Reinitialize for the next text
				340	$dir = '';
				341
				342	# Maybe not necessary
				343	$data->reset;
				344
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	345	next MAIN;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	346	};
				347
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	348	# ~ inside text body ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	349
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	350	# ~ whitespace handling ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	351
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	352	# Fix whitespaces (see notes on whitespace fixing)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	353
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	354	# TODO: Maybe it's best to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
				355	# an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
				356
				357	# Remove consecutive whitespace at beginning and end (mostly one newline)
				358	s/^\s+//; s/\s+$//;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	359
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	360	### NOTE: this is only relevant, if a text consists of more than one line
				361	### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
				362	### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
				363	if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	364
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	365	$tl++; # counter for text lines
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	366
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	367	s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
				368	}
				369	###
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	370
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	371	# add line to buffer
				372	$buf_in .= $_;
				373	};
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	374
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	375	} elsif (m#^(.)(\<${_HEADER_TAG}[^>]?type=["'].*)$#) {
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	376
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	377	# ~ start of header ~
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	378	my $content = "$2\n";
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	379
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	380	if ($1 !~ /^\s*$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	381	die $log->fatal("input line number $.: " .
				382	"line with opening header tag" .
				383	" is not in expected format ... => Aborting (line=$_)");
				384	};
				385
				386	# Parse header
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	387	my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	388
				389	# Header was parseable
				390	if ($header) {
				391
				392	# Write header to zip
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	393	my $file = $header->dir . '/' . $_header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	394
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	395	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	396
				397	$header->to_zip($zipper->new_stream($file));
				398
				399	# Header is for text level
				400	if ($header->type eq 'text') {
				401
				402	# Remember dir and sigles
				403	$dir = $header->dir;
				404	$text_id = $header->id;
				405	$text_id_esc = $header->id_esc;
				406
				407	# log output for seeing progression
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	408	$log->notice("$0: text_id=$text_id");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	409
				410	$tl = 0; # reset (needed for ~ whitespace handling ~)
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	411	}
				412	}
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	413	}
				414	} #end: while
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	415
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	416	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	417
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	418	$ext_tok->close if $ext_tok;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	419
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	420	exit(0);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	421
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	422
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	423	# Recursively called function to handle XML tree data: appends text to $data and records annotations in $structures (and $tokens)
				424	sub retr_info {
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	425	# First argument: recursion level
				426	# (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
				427	my $rl = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	428
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	429	# Iteration through all array elements of the second argument
				430	# (after the shift above, $_[0] is a reference to an array reference)
				431	# See notes on how 'XML::CompactTree::XS' works and
				432	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
				433	foreach $e (@{${$_[0]}}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	434
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	435	# Element node
				436	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	437
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	438	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	439	# from here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	440	#~~~~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	441
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	442	# $e->[1] represents the tag name
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	443	# Skip 's' tags of the input (their children are still processed) - presumably because sentence boundaries are added from the tokenizer afterwards
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	444	if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	445	if (defined $e->[$_IDX]) {
				446	retr_info($rl+1, \$e->[$_IDX]);
				447	}
				448	next;
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	449	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	450
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	451	my $anno = $structures->add_new_annotation($e->[1]);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	452
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	453	# Add element also to token list
				454	if ($_TOKENS_PROC && $e->[1] eq $_TOKENS_TAG) {
				455	$tokens->add_annotation($anno);
				456	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	457
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	458	# Handle attributes (if attributes exist)
				459	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	460
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	461	# with 'XCT_ATTRIBUTE_ARRAY', $e->[3] is an array reference of the form
				462	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				463	# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
				464	for ($c = 0; $c < @{$e->[3]}; $c += 2) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	465
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	466	# '$c' references the 'key' and '$c+1' the 'value'
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	467	$anno->add_attribute(
				468	@{$e->[3]}[$c, $c + 1]
				469	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	470	};
				471	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	472
				473	# this is where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	474	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	475
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	476
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	477	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	478	# until here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	479	#~~~~
				480
				481
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	482	# Call function recursively
				483	# do not recurse, if $e->[$_IDX] is not defined
				484	# (because we have no array of child-nodes, e.g.: <back/>)
				485	if (defined $e->[$_IDX]) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	486
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	487	# Recursion with array of child-nodes
				488	retr_info($rl+1, \$e->[$_IDX]);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	489	}
				490
				491
				492	#~~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	493	# from here: tag-node (closing)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	494	#~~~~~
				495
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	496	# NOTE: use $pos, because the offsets are _between_ the characters
				497	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	498	my $pos = $data->position; # lexical $pos; shadows the file-level variable of the same name
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	499
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	500	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	501
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	502	$fval = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	503
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	504	# ~ whitespace related issue ~
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	505	if ($fval > 0 && not exists $ws{$fval - 1}) {
				506
				507	# ~ previous node was a text-node ~
				508	$anno->set_from($fval - 1);
				509	}
				510
				511	# in case this fails, check input
				512	if (($fval - 1) > $pos) {
				513	die $log->fatal("text_id='$text_id', " .
				514	"processing of structures: " .
				515	"from-value ($fval) is 2 or more greater " .
				516	"than to-value ($pos) => please check. Aborting");
				517	};
				518
				519	# TODO: find example for which this case applies
				520	# maybe this is not necessary anymore, because the above correction of the from-value suffices
				521	#
				522	# TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;'
				523	# do testing with bigger corpus excerpt (wikipedia?)
				524	$anno->set_from($pos) if $fval == $pos + 1;
				525	$anno->set_to($pos);
				526	$anno->set_level($rl);
				527
				528	# Clean up whitespace
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	529	delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	530
				531
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	532	#~~~~
				533	# until here: tag-node (closing)
				534	#~~~~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	535	}
				536
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	537	# Text node
				538	elsif ($e->[0] == XML_READER_TYPE_TEXT){
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	539
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	540	$add_one = 1;
				541	$data->append($e->[1]);
				542	}
				543
				544	# Whitespace node
				545	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				546	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				547
				548	# state, that this from-index belongs to a whitespace-node
				549	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				550	$ws{$data->position}++;
				551
				552	$add_one = 0;
				553	$data->append($e->[1]);
				554	}
				555
				556	# not yet handled type
				557	else {
				558
				559	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				560	};
				561	};
				562	};
				563
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	564
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	565	__END__
				566
				567	=pod
				568
				569	=encoding utf8
				570
				571	=head1 NAME
				572
				573	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				574
				575	=head1 SYNOPSIS
				576
				577	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				578
				579	=head1 DESCRIPTION
				580
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	581	C<tei2korapxml> is a script to convert TEI P5 and
				582	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				583	based documents to the
				584	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				585	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	586	read from C<STDIN>. If no specific output is defined, data is written
				587	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	588
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	589	This program is usually called from inside another script.
				590
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	591	=head1 FORMATS
				592
				593	=head2 Input restrictions
				594
				595	=over 2
				596
				597	=item
				598
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	599	TEI P5 formatted input with certain restrictions:
				600
				601	=over 4
				602
				603	=item
				604
				605	B<mandatory>: text-header with integrated textsigle, text-body
				606
				607	=item
				608
				609	B<optional>: corp-header with integrated corpsigle,
				610	doc-header with integrated docsigle
				611
				612	=back
				613
				614	=item
				615
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	616	Tokens inside the primary text must not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	617	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	618	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	619	into blanks between 2 tokens could lead to additional blanks,
				620	where there should be none (e.g.: punctuation characters like C<,> or
				621	C<.> should not be separated from their predecessor token).
				622	(see also code section C<~ whitespace handling ~>).
				623
				624	=back
				625
				626	=head2 Notes on the output
				627
				628	=over 2
				629
				630	=item
				631
				632	zip file output (default on C<stdout>) with utf8 encoded entries
				633	(which together form the KorAP-XML format)
				634
				635	=back
				636
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	637	=head1 INSTALLATION
				638
				639	C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
				640	these bindings are available, the preferred way to install the script is
				641	to use L<cpanm\|App::cpanminus>.
				642
				643	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				644
				645	In case everything went well, the C<tei2korapxml> tool will
				646	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	647
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	648	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				649
				650	=head1 OPTIONS
				651
				652	=over 2
				653
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	654	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	655
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	656	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	657
				658	=item B<--help\|-h>
				659
				660	Print help information.
				661
				662	=item B<--version\|-v>
				663
				664	Print version information.
				665
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	666	=item B<--tokenizer-call\|-tc>
				667
				668	Call an external tokenizer process, that will tokenize
				669	a single line from STDIN and output one token per line.
				670
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	671	=item B<--tokenizer-korap\|-tk>
				672
				673	Use the standard KorAP/DeReKo tokenizer.
				674
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	675	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	676
				677	Tokenize the data using two embedded tokenizers,
				678	that will take an I<aggressive> and a I<conservative>
				679	approach.
				680
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	681	=item B<--inline-tokens> <foundry>#[<file>]
				682
				683	Define the foundry and file (without extension)
				684	to store inline token information in.
				685	If L</KORAPXMLTEI_INLINE> is set, this will contain
				686	annotations as well.
				687	Defaults to C<tokens> and C<morpho>.
				688
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	689	=item B<--inline-structures> <foundry>#[<file>]
				690
				691	Define the foundry and file (without extension)
				692	to store inline structure information in.
				693	Defaults to C<struct> and C<structures>.
				694
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	695	=item B<--base-foundry> <foundry>
				696
				697	Define the base foundry to store newly generated
				698	token information in.
				699	Defaults to C<base>.
				700
				701	=item B<--data-file> <file>
				702
				703	Define the file (without extension)
				704	to store primary data information in.
				705	Defaults to C<data>.
				706
				707	=item B<--header-file> <file>
				708
				709	Define the file name (without extension)
				710	to store header information on
				711	the corpus, document, and text level in.
				712	Defaults to C<header>.
				713
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	714	=item B<--use-tokenizer-sentence-splits\|-s>
				715
				716	Replace existing with, or add new, sentence boundary information
				717	provided by the KorAP tokenizer (currently the only tokenizer supported for this).
				718
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	719	=item B<--tokens-file> <file>
				720
				721	Define the file (without extension)
				722	to store generated token information in
				723	(either from the KorAP tokenizer or an externally called tokenizer).
				724	Defaults to C<tokens>.
				725
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	726	=item B<--log\|-l>
				727
				728	Loglevel for I<Log::Any>. Defaults to C<notice>.
				729
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	730	=back
				731
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	732	=head1 ENVIRONMENT VARIABLES
				733
				734	=over 2
				735
				736	=item B<KORAPXMLTEI_DEBUG>
				737
				738	Activate minimal debugging.
				739	Defaults to C<false>.
				740
				741	=item B<KORAPXMLTEI_INLINE>
				742
				743	Process inline annotations, if present.
				744	Defaults to C<false>.
				745
				746	=back
				747
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	748	=head1 COPYRIGHT AND LICENSE
				749
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	750	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	751
				752	Author: Peter Harders
				753
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	754	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	755
				756	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				757	Corpus Analysis Platform at the
				758	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				759	member of the
				760	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				761
				762	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	763	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	764
				765	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	766
				767	# NOTES
				768
				769	## Notes on how 'XML::CompactTree::XS' works
				770
				771	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				772
				773	Print out name of 'node2' for the above example:
				774
				775	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				776
				777	Exploring the structure of $data ( = reference to below array ):
				778
				779	[ 0: XML_READER_TYPE_DOCUMENT,
				780	1: ?
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	781	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	782	1: 'node'
				783	2: ?
				784	3: HASH (attributes)
				785	4: 1 (line number)
				786	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				787	1: 'node1'
				788	2: ?
				789	3: undefined (no attributes)
				790	4: 1 (line number)
				791	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				792	1: 'some '
				793	]
				794	1: [ 0: XML_READER_TYPE_ELEMENT
				795	1: 'n'
				796	2: ?
				797	3: undefined (no attributes)
				798	4: 1 (line number)
				799	5: undefined (no child-nodes)
				800	]
				801	2: [ 0: XML_READER_TYPE_TEXT
				802	1: ' text'
				803	]
				804	]
				805	]
				806	1: [ 0: XML_READER_TYPE_ELEMENT
				807	1: 'node2'
				808	2: ?
				809	3: undefined (no attributes)
				810	4: 1 (line number)
				811	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				812	1: 'more-text'
				813	]
				814	]
				815	]
				816	]
				817	]
				818	]
				819	]
				820
				821	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				822
				823	ref($data->[2]) == ARRAY (with 1 element for 'node')
				824	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				825
				826	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				827	$data->[2]->[0]->[1] == 'node'
				828	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				829	$data->[2]->[0]->[4] == 1 (line number)
				830	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
				831	# child-nodes of actual node (see $_IDX)
				832
				833	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				834	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				835	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				836	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				837	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				838	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				839
				840	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				841	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				842	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				843
				844	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				845	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				846	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				847	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				848	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				849	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				850
				851	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				852	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				853	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				854
				855
				856	retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
				857	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				858	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				859
				860
				861	## Notes on whitespace handling
				862
				863	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
				864	(see function 'retr_info()').
				865
				866	Definition of significant and insignificant whitespace
				867	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				868
				869	Significant whitespace is part of the document content and should be preserved.
				870	Insignificant whitespace is used when editing XML documents for readability.
				871	These whitespaces are typically not intended for inclusion in the delivery of the document.
				872
				873	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				874
				875	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				876	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				877
				878	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				879	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				880	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				881
				882	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				883
				884
				885	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				886
				887	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				888	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				889
				890	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				891	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				892
				893	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				894	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				895
				896	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				897	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				898	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				899	the last read 'non-tag'-node has to be corrected (see [1]).
				900
				901	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				902	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				903
				904	[1]
				905	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				906	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
				907	(see above code fragment '... not exists $ws{ $fval - 1 } ...').
				908
				909	[2]
				910	In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled as a
				911	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				912
				913	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				914	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				915
				916	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
				917	and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
				918
				919
				920	## Notes on whitespace fixing
				921
				922	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				923	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				924
				925	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				926	example further down and notes on 'Input restrictions' in the manpage).
				927
				928	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				929
				930	Examples (how primary text with linebreaks would be converted by below code):
				931
				932	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				933	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
				934
				935	Blanks are inserted before the 1st character:
				936
				937	NOTE: not stringent ('...' stands for text):
				938
				939	beg1............................end1 => no blank before 'beg1'
				940	beg2....<pb/>...................end2 => no blank before 'beg2'
				941	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				942	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				943
				944	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				945	^
				946	\|_blank between 'end3' and 'beg4'
				947
				948
				949	## Notes on segfault prevention
				950
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	951	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	952	(see notes on 'PerlIO layers' in 'man XML::LibXML').
				953	Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
				954	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
				955	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.