Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 9aca1f62c4b8fbfb3ddc2cfc0ff034ec9cab5483 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
# The KorAP tokenizer is an optional dependency: try to load it, but ignore
# any failure here (errors raised while loading are swallowed by the eval).
eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 };

our $VERSION     = '1.00';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Compile-time debug switch. Its value is taken from the environment variable
# KORAPXMLTEI_DEBUG and defaults to 0 when that variable is not defined
# (no need to be parametrized).
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	42
# Parse options from the command line.
#
# Each option is bound to a file-scoped lexical that is declared, together
# with its default value, directly in the option list. If GetOptions()
# reports a problem (e.g. an unknown option or a missing value), a brief
# usage message is printed and the script exits with status 2 instead of
# running on with partially applied settings.
GetOptions(

  # Name of root directory inside zip file
  'root|r=s' => \(my $_root_dir = '.'),

  # Input file (yet only TEI I5 format accepted); '' means: read from STDIN
  'input|i=s' => \(my $input_fname = ''),

  # Temporary argument for testing purposes
  'tokenizer-call|tc=s' => \(my $tokenizer_call),

  # Use KorAP tokenizer
  'tokenizer-korap|tk' => \(my $tokenizer_korap),

  # Use intern tokenization (default = no)
  'tokenizer-internal|ti' => \(my $_GEN_TOK_INT),

  # Use KorAP tokenizer to split s (default = no)
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),

  # "directory#file" (file without extension) for inline token information
  'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),

  # "directory#file" (file without extension) for inline structure information
  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),

  # Directory (below the text directory) the tokenization files are written to
  'base-foundry=s' => \(my $_tok_dir = 'base'),

  # Names (without '.xml') of the data, header and external tokenization files
  'data-file=s'   => \(my $_data_file    = 'data'),
  'header-file=s' => \(my $_header_file  = 'header'),
  'tokens-file=s' => \(my $_tok_file_ext = 'tokens'),

  # Log level passed to the Log::Any adapter
  'log|l=s' => \(my $log_level = 'notice'),

  # Print the documentation sections listed below to STDOUT
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version message to STDOUT
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
) or pod2usage(-verbose => 0, -exitval => 2);
				74
# Emit everything written to STDERR (including log output) as UTF-8
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;

#
# ~~~ parameter (mandatory) ~~~
#

# Tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# Opening tags of the headers on corpus, document and text level.
# Just keep the correct order of the attributes and, if needed,
# add an '.*' between them.

# optional
my $_CORP_HEADER_BEG = 'idsHeader type="corpus"';

# optional
my $_DOC_HEADER_BEG = 'idsHeader type="document"';

# mandatory
my $_TEXT_HEADER_BEG = 'idsHeader type="text"';


## extern tokenization

# 1 if any external tokenizer (custom call or KorAP tokenizer) was requested
my $_GEN_TOK_EXT = ($tokenizer_call || $tokenizer_korap) ? 1 : 0;

if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
}

# External tokenizer object; stays undefined if none was requested.
# A custom tokenizer call takes precedence over the KorAP tokenizer.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}
elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally (inside an eval),
  # so fail with an explicit message if it is not available
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal(
      'KorAP tokenizer requested, but ' .
      'KorAP::XML::TEI::Tokenizer::KorAP could not be loaded'
    );
  }

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}
##
				108
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	109
#
# ~~~ constants ~~~
#

## intern tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
##

## TODO: optional (different annotation tools can produce more zip-files
## for feeding into KorAP-XML-Krill)

# on/off: processing of ${_TOKENS_TAG}'s (default: 1)
my $_TOKENS_PROC = 1;


# Directory and file that receive all inline structure information
# (everything except the $_TOKENS_TAG information).
# The option value has the form "directory#file"; appending '#structure'
# supplies the default file name when the value contains no '#'.
my ($_structure_dir, $_structure_file) = split /#/, "$inline_structures#structure";
$_structure_file = "${_structure_file}.xml";


# Directory and file that receive all inline token information,
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set.
# Same "directory#file" format, with 'morpho' as default file name.
my ($_tokens_dir, $_tokens_file) = split /#/, "$inline_tokens#morpho";
$_tokens_file = "${_tokens_file}.xml";

# Name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';

# Handling inline annotations (inside $_TOKENS_TAG):
# 1 if the environment variable KORAPXMLTEI_INLINE is true, otherwise 0
my $_INLINE_ANNOT = 0;
$_INLINE_ANNOT = 1 if $ENV{KORAPXMLTEI_INLINE};


#
# ~~~ variables ~~~
#

# Collectors for token and structure annotations
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;


# Collector for the primary data
my $data = KorAP::XML::TEI::Data->new;


# Zipper, set up with the root directory given on the command line
my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);

# Input file handle (default: stdin)
my $input_fh;

# Text directory (below $_root_dir)
my $dir;

# Id of the current text and its escaped version
my $text_id;
my $text_id_esc;

# The following variables are only used inside the recursive function
# 'retr_info'

# Index of the array of child elements within an element of $tree_data
# (its value depends on DEBUG)
my $_IDX;

# Element from $tree_data
my $e;

# Variables for handling the ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $fval;

# Hash for indices of whitespace-nodes (needed to correct from-values).
# Idea: when closing an element, check if its from-index minus 1 refers to a
# whitespace-node (means: 'from-index - 1' is a key in %ws).
# If this is _not_ the case, then the from-value is one too high
# => correct it by subtracting 1
my %ws;

# Index variable used in loops
my $c;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	175
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	176
				177	#
				178	# ~~~ main ~~~
				179	#
				180
				181	# ~ initializations ~
				182
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	183	# Include line numbers in elements of $tree_data for debugging
				184	DEBUG ? ($_IDX = 5) : ($_IDX = 4);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	185
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	186	$fval = 0;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	187
Akron	ec2cef2	2020-07-31 10:00:15 +0200	[diff] [blame]	188	# Normalize regex for header parsing
				189	for ($_CORP_HEADER_BEG,
				190	$_DOC_HEADER_BEG,
				191	$_TEXT_HEADER_BEG) {
				192	s!^([^\s]+)(.)$!$1\[\^>\]$2!;
				193	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	194
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	195
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	196	# ~ read input and write output (text by text) ~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	197
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	198	my $tl = 0; # text line (needed for whitespace handling)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	199
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	200	$input_fh = *STDIN; # input file handle (default: stdin)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	201
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	202	# Maybe not necessary
				203	$data->reset;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	204
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	205	$dir = "";
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	206
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	207	if ( $input_fname ne '' ){
				208	unless (open($input_fh, '<', $input_fname)) {
				209	die $log->fatal("File '$input_fname' could not be opened.");
				210	};
				211	}
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	212
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	213	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	214	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	215
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame^]	216	my $sfx;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	217	my $pos;
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	218	my $input_enc = 'UTF-8';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	219	my $l = length('</' . $_TEXT_BODY) + 1;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	220
# ~ loop (reading input document) ~

# The input is processed line by line. Three kinds of lines are acted upon:
#   * the XML declaration (sets the input encoding),
#   * the opening tag of a text body (all lines up to the closing tag are
#     collected, parsed and written to the zip archive),
#   * the opening tag of a corpus, document or text header
#     (the header is parsed and written to the zip archive).
# All other lines are skipped.
MAIN: while ( <$input_fh> ) {

  # Remove (multi-line) XML comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding from the XML declaration
  # (the value is delimited by matching single or double quotes)
  if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/ ) {
    $input_enc = $2;
    next;
  }

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ) {

    # ~ start of text body ~

    # Remember the suffix, as the capture variables do not survive
    # the following pattern matches
    $sfx = $2;

    # Only whitespace is accepted before and after the opening tag
    if ($1 !~ /^\s*$/ || $sfx !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    }

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $buf_in = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # ~ end of text body ~
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # Write data.xml, structure.xml and possibly morpho.xml
        # and/or tokenization files

        # Only whitespace is accepted around the closing tag
        # ($l is the length of the closing tag)
        if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        }

        # $dir is only set after a text header was parsed successfully
        if ($dir ne '') {

          my $reader = XML::LibXML::Reader->new(
            string => "<text>$buf_in</text>",
            huge   => 1
          );

          # See notes on whitespace handling
          my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

          # XCT_LINE_NUMBERS is only needed for debugging
          # (see XML::CompactTree::XS)
          $param |= XCT_LINE_NUMBERS if DEBUG;
          my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

          $structures->reset;

          $tokens->reset if $_TOKENS_PROC;

          # ~ whitespace related issue ~
          $add_one = 0;
          %ws = ();

          # ~ recursion ~
          # Parse input data
          retr_info(1, \$tree_data->[2]);

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
          }

          # ~ write data.xml ~
          $data->to_zip(
            $zipper->new_stream("$dir/${_data_file}.xml"),
            $text_id_esc
          );

          # ~ tokenization ~
          if ($_GEN_TOK_EXT) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
              $text_id_esc
            );
          }

          if ($_GEN_TOK_INT) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            );

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            );

            $aggr_tok->reset;
            $cons_tok->reset;
          }

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          }

          # ~ write structures ~
          if (!$structures->empty) {
            $structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
              $text_id_esc,
              2 # = structure serialization
            );
          }

          # ~ write tokens ~
          if ($_TOKENS_PROC && !$tokens->empty) {
            $tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
              $text_id_esc,
              $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
            );
          }

          # Reinitialize, so that the next text body requires a new text header
          $dir = '';

          # Maybe not necessary
          $data->reset;

        } else { # $dir eq ""

          # NOTE(review): "$data" interpolates the collector object itself,
          # not the collected text - confirm that this is intended
          $log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
        }

        next MAIN;
      }

      # ~ inside text body ~

      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO: Maybe it's best to keep the stripping of whitespace, to just
      # remove the if-clause and to insert a blank by default (with possibly
      # an option on how newlines in primary text should be handled
      # (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      ### NOTE: this is only relevant, if a text consists of more than one line
      ### TODO: find a better solution, or create a warning, if a text has
      ### more than one line ($tl > 1)
      ### do testing with 2 different corpora (one with only one-line texts,
      ### the other with several lines per text)

      # Line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Counter for text lines
        $tl++;

        # Insert blank before 1st character
        # (for 2nd line and consecutive lines)
        s/^(.)/ $1/ if $tl > 1;
      }
      ###

      # Add line to buffer
      $buf_in .= $_;
    }

  } elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}|${_DOC_HEADER_BEG}|${_CORP_HEADER_BEG}).*)$#) {

    # ~ start of header ~

    # Opening header tag together with the rest of the line
    my $content = "$2\n";

    # Only whitespace is accepted before the opening header tag
    if ($1 !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening header tag" .
                        " is not in expected format ... => Aborting (line=$_)");
    }

    # Parse header (the parser is handed the input handle as well)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $_header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # Log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        # Reset (needed for ~ whitespace handling ~)
        $tl = 0;
      }
    }
  }
} #end: while
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	434
# All input has been processed: close the zipper ...
$zipper->close;

# ... and the external tokenizer, if one was in use
if ($_GEN_TOK_EXT) {
  $ext_tok->close;
}

exit 0;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	440
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	441
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	442	# Recursively called function to handle XML tree data
				443	sub retr_info {
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	444	# recursion level
				445	# (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
				446	my $rl = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	447
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	448	my $dummy_anno;
				449	if ($use_tokenizer_sentence_splits) {
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	450	$dummy_anno = $structures->new_dummy_annotation;
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	451	}
				452
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	453	# Iteration through all array elements
				454	# ($_[0] is a reference to an array reference)
				455	# See notes on how 'XML::CompactTree::XS' works and
				456	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
				457	foreach $e (@{${$_[0]}}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	458
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	459	# Element node
				460	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	461
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	462	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	463	# from here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	464	#~~~~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	465
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	466	my $anno;
				467
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	468	# $e->[1] represents the tag name
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	469	if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
				470	$anno = $dummy_anno;
				471	} else {
				472	$anno = $structures->add_new_annotation($e->[1]);
				473	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	474
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	475
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	476	# Add element also to token list
				477	if ($_TOKENS_PROC && $e->[1] eq $_TOKENS_TAG) {
				478	$tokens->add_annotation($anno);
				479	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	480
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	481	# Handle attributes (if attributes exist)
				482	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	483
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	484	# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
				485	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				486	# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
				487	for ($c = 0; $c < @{$e->[3]}; $c += 2) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	488
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	489	# '$c' references the 'key' and '$c+1' the 'value'
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	490	$anno->add_attribute(
				491	@{$e->[3]}[$c, $c + 1]
				492	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	493	};
				494	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	495
				496	# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	497	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	498
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	499
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	500	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	501	# until here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	502	#~~~~
				503
				504
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	505	# Call function recursively
				506	# do no recursion, if $e->[$_IDX] is not defined
				507	# (because we have no array of child-nodes, e.g.: <back/>)
				508	if (defined $e->[$_IDX]) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	509
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	510	# Recursion with array of child-nodes
				511	retr_info($rl+1, \$e->[$_IDX]);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	512	}
				513
				514
				515	#~~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	516	# from here: tag-node (closing)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	517	#~~~~~
				518
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	519	# NOTE: use $pos, because the offsets are _between_ the characters
				520	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	521	my $pos = $data->position;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	522
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	523	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	524
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	525	$fval = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	526
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	527	# ~ whitespace related issue ~
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	528	if ($fval > 0 && not exists $ws{$fval - 1}) {
				529
				530	# ~ previous node was a text-node ~
				531	$anno->set_from($fval - 1);
				532	}
				533
				534	# in case this fails, check input
				535	if (($fval - 1) > $pos) {
				536	die $log->fatal("text_id='$text_id', " .
				537	"processing of structures: " .
				538	"from-value ($fval) is 2 or more greater " .
				539	"than to-value ($pos) => please check. Aborting");
				540	};
				541
				542	# TODO: find example for which this case applies
				543	# maybe this is not necessary anymore, because the above recorrection of the from-value suffices
				544	#
				545	# TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
				546	# do testing with bigger corpus excerpt (wikipedia?)
				547	$anno->set_from($pos) if $fval == $pos + 1;
				548	$anno->set_to($pos);
				549	$anno->set_level($rl);
				550
				551	# Clean up whitespace
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	552	delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	553
				554
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	555	#~~~~
				556	# until here: tag-node (closing)
				557	#~~~~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	558	}
				559
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	560	# Text node
				561	elsif ($e->[0] == XML_READER_TYPE_TEXT){
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	562
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	563	$add_one = 1;
				564	$data->append($e->[1]);
				565	}
				566
				567	# Whitespace node
				568	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				569	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				570
				571	# state, that this from-index belongs to a whitespace-node
				572	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				573	$ws{$data->position}++;
				574
				575	$add_one = 0;
				576	$data->append($e->[1]);
				577	}
				578
				579	# not yet handled type
				580	else {
				581
				582	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				583	};
				584	};
				585	};
				586
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	587
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	588	__END__
				589
				590	=pod
				591
				592	=encoding utf8
				593
				594	=head1 NAME
				595
				596	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				597
				598	=head1 SYNOPSIS
				599
				600	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				601
				602	=head1 DESCRIPTION
				603
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	604	C<tei2korapxml> is a script to convert TEI P5 and
				605	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				606	based documents to the
				607	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				608	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	609	read from C<STDIN>. If no specific output is defined, data is written
				610	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	611
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	612	This program is usually called from inside another script.
				613
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	614	=head1 FORMATS
				615
				616	=head2 Input restrictions
				617
				618	=over 2
				619
				620	=item
				621
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	622	TEI P5 formatted input with certain restrictions:
				623
				624	=over 4
				625
				626	=item
				627
				628	B<mandatory>: text-header with integrated textsigle, text-body
				629
				630	=item
				631
				632	B<optional>: corp-header with integrated corpsigle,
				633	doc-header with integrated docsigle
				634
				635	=back
				636
				637	=item
				638
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	639	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	640	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	641	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	642	into blanks between 2 tokens could lead to additional blanks,
				643	where there should be none (e.g.: punctuation characters like C<,> or
				644	C<.> should not be separated from their predecessor token).
				645	(see also code section C<~ whitespace handling ~>).
				646
				647	=back
				648
				649	=head2 Notes on the output
				650
				651	=over 2
				652
				653	=item
				654
				655	zip file output (default on C<stdout>) with utf8 encoded entries
				656	(which together form the KorAP-XML format)
				657
				658	=back
				659
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	660	=head1 INSTALLATION
				661
				662	C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
				663	these bindings are available, the preferred way to install the script is
				664	to use L<cpanm\|App::cpanminus>.
				665
				666	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				667
				668	In case everything went well, the C<tei2korapxml> tool will
				669	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	670
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	671	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				672
				673	=head1 OPTIONS
				674
				675	=over 2
				676
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	677	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	678
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	679	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	680
				681	=item B<--help\|-h>
				682
				683	Print help information.
				684
				685	=item B<--version\|-v>
				686
				687	Print version information.
				688
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	689	=item B<--tokenizer-call\|-tc>
				690
				691	Call an external tokenizer process, that will tokenize
				692	a single line from STDIN and output one token per line.
				693
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	694	=item B<--tokenizer-korap\|-tk>
				695
				696	Use the standard KorAP/DeReKo tokenizer.
				697
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	698	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	699
				700	Tokenize the data using two embedded tokenizers,
				701	that will take an I<aggressive> and a I<conservative>
				702	approach.
				703
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	704	=item B<--inline-tokens> <foundry>#[<file>]
				705
				706	Define the foundry and file (without extension)
				707	to store inline token information in.
				708	If L</KORAPXMLTEI_INLINE> is set, this will contain
				709	annotations as well.
				710	Defaults to C<tokens> and C<morpho>.
				711
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	712	=item B<--inline-structures> <foundry>#[<file>]
				713
				714	Define the foundry and file (without extension)
				715	to store inline structure information in.
				716	Defaults to C<struct> and C<structures>.
				717
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	718	=item B<--base-foundry> <foundry>
				719
				720	Define the base foundry to store newly generated
				721	token information in.
				722	Defaults to C<base>.
				723
				724	=item B<--data-file> <file>
				725
				726	Define the file (without extension)
				727	to store primary data information in.
				728	Defaults to C<data>.
				729
				730	=item B<--header-file> <file>
				731
				732	Define the file name (without extension)
				733	to store header information on
				734	the corpus, document, and text level in.
				735	Defaults to C<header>.
				736
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	737	=item B<--use-tokenizer-sentence-splits\|-s>
				738
				739	Replace existing sentence boundary information with, or add new, sentence boundaries
				740	provided by the tokenizer (currently only supported for the KorAP tokenizer).
				741
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	742	=item B<--tokens-file> <file>
				743
				744	Define the file (without extension)
				745	to store generated token information in
				746	(either from the KorAP tokenizer or an externally called tokenizer).
				747	Defaults to C<tokens>.
				748
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	749	=item B<--log\|-l>
				750
				751	Loglevel for I<Log::Any>. Defaults to C<notice>.
				752
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	753	=back
				754
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	755	=head1 ENVIRONMENT VARIABLES
				756
				757	=over 2
				758
				759	=item B<KORAPXMLTEI_DEBUG>
				760
				761	Activate minimal debugging.
				762	Defaults to C<false>.
				763
				764	=item B<KORAPXMLTEI_INLINE>
				765
				766	Process inline annotations, if present.
				767	Defaults to C<false>.
				768
				769	=back
				770
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	771	=head1 COPYRIGHT AND LICENSE
				772
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	773	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	774
				775	Author: Peter Harders
				776
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	777	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	778
				779	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				780	Corpus Analysis Platform at the
				781	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				782	member of the
				783	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				784
				785	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	786	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	787
				788	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	789
				790	# NOTES
				791
				792	## Notes on how 'XML::CompactTree::XS' works
				793
				794	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				795
				796	Print out name of 'node2' for the above example:
				797
				798	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				799
				800	Exploring the structure of $data ( = reference to below array ):
				801
				802	[ 0: XML_READER_TYPE_DOCUMENT,
				803	1: ?
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	804	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	805	1: 'node'
				806	2: ?
				807	3: HASH (attributes)
				808	4: 1 (line number)
				809	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				810	1: 'node1'
				811	2: ?
				812	3: undefined (no attributes)
				813	4: 1 (line number)
				814	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				815	1: 'some '
				816	]
				817	1: [ 0: XML_READER_TYPE_ELEMENT
				818	1: 'n'
				819	2: ?
				820	3: undefined (no attributes)
				821	4: 1 (line number)
				822	5: undefined (no child-nodes)
				823	]
				824	2: [ 0: XML_READER_TYPE_TEXT
				825	1: ' text'
				826	]
				827	]
				828	]
				829	1: [ 0: XML_READER_TYPE_ELEMENT
				830	1: 'node2'
				831	2: ?
				832	3: undefined (no attributes)
				833	4: 1 (line number)
				834	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				835	1: 'more-text'
				836	]
				837	]
				838	]
				839	]
				840	]
				841	]
				842	]
				843
				844	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				845
				846	ref($data->[2]) == ARRAY (with 1 element for 'node')
				847	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				848
				849	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				850	$data->[2]->[0]->[1] == 'node'
				851	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				852	$data->[2]->[0]->[4] == 1 (line number)
				853	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
				854	# child-nodes of actual node (see $_IDX)
				855
				856	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				857	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				858	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				859	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				860	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				861	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				862
				863	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				864	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				865	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				866
				867	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				868	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				869	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				870	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				871	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				872	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				873
				874	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				875	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				876	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				877
				878
				879	retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
				880	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				881	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				882
				883
				884	## Notes on whitespace handling
				885
				886	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
				887	(see function 'retr_info()').
				888
				889	Definition of significant and insignificant whitespace
				890	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				891
				892	Significant whitespace is part of the document content and should be preserved.
				893	Insignificant whitespace is used when editing XML documents for readability.
				894	These whitespaces are typically not intended for inclusion in the delivery of the document.
				895
				896	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				897
				898	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				899	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				900
				901	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				902	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				903	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				904
				905	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				906
				907
				908	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				909
				910	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				911	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				912
				913	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				914	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				915
				916	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				917	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				918
				919	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				920	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				921	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				922	the last read 'non-tag'-node has to be corrected (see [1]).
				923
				924	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				925	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				926
				927	[1]
				928	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				929	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
				930	(see above code fragment '... not exists $ws{ $fval - 1 } ...').
				931
				932	[2]
				933	In both of the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled as a
				934	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				935
				936	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				937	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				938
				939	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
				940	and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
				941
				942
				943	## Notes on whitespace fixing
				944
				945	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				946	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				947
				948	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				949	example further down and notes on 'Input restrictions' in the manpage).
				950
				951	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				952
				953	Examples (how primary text with linebreaks would be converted by below code):
				954
				955	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				956	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
				957
				958	Blanks are inserted before the 1st character:
				959
				960	NOTE: not stringent ('...' stands for text):
				961
				962	beg1............................end1 => no blank before 'beg1'
				963	beg2....<pb/>...................end2 => no blank before 'beg2'
				964	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				965	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				966
				967	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				968	^
				969	\|_blank between 'end3' and 'beg4'
				970
				971
				972	## Notes on segfault prevention
				973
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	974	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	975	(see notes on 'PerlIO layers' in 'man XML::LibXML').
				976	Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
				977	See perluniintro: You can switch encodings on an already opened stream by using "binmode()".
				978	See perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.