Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 152eaaca3ba0eb633b9242fb5e370e80e719377a [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	31	eval {
				32	require KorAP::XML::TEI::Tokenizer::KorAP;
				33	1;
				34	};
Peter Harders	1c5ce15	2020-07-22 18:02:50 +0200	[diff] [blame]	35
Marc Kupietz	a1421f0	2021-02-18 15:32:38 +0100	[diff] [blame]	36	our $VERSION = '1.00';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	37
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	38	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				39
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	40	# Set KORAPXMLTEI_DEBUG to 1 for some additional debug output (no need to be parametrized)
				41	use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	42
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	43	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	44	GetOptions(
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	45	"root\|r=s" => \(my $_root_dir = '.'), # name of root directory inside zip file
				46	"input\|i=s" => \(my $input_fname = ''), # input file (currently only the TEI I5 format is accepted)
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	47	'tokenizer-call\|tc=s' => \(my $tokenizer_call), # Temporary argument for testing purposes
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	48	'tokenizer-korap\|tk' => \(my $tokenizer_korap), # use KorAP-tokenizer
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame^]	49	'tokenizer-internal\|ti' => \(my $_GEN_TOK_INT), # use internal tokenization (default = no)
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	50	'use-tokenizer-sentence-splits\|s' => (\my $use_tokenizer_sentence_splits), # use KorAP tokenizer to split s (default=no)
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	51	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	52	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	53	'base-foundry=s' => \(my $_tok_dir = 'base'),
				54	'data-file=s' => \(my $_data_file = 'data'),
				55	'header-file=s' => \(my $_header_file = 'header'),
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame^]	56	'tokens-file=s' => \(my $_tok_file_ext = 'tokens'),
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	57	'log\|l=s' => \(my $log_level = 'notice'),
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	58	'help\|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	59	pod2usage(
				60	-verbose => 99,
				61	-sections => 'NAME\|DESCRIPTION\|SYNOPSIS\|ARGUMENTS\|OPTIONS',
				62	-msg => $VERSION_MSG,
				63	-output => '-'
				64	)
				65	},
				66	'version\|v' => sub {
				67	pod2usage(
				68	-verbose => 0,
				69	-msg => $VERSION_MSG,
				70	-output => '-'
				71	)
				72	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	73	);
				74
Marc Kupietz	44b1f25	2020-11-26 16:31:40 +0100	[diff] [blame]	75	binmode(STDERR, ":encoding(UTF-8)");
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	76	Log::Any::Adapter->set('Stderr', log_level => $log_level);
				77
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	78	$log->notice('Debugging is activated') if DEBUG;
				79
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	80	#
				81	# ~~~ parameter (mandatory) ~~~
				82	#
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	83	my $_TEXT_BODY = "text"; # tag (without attributes), which contains the primary text
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	84	# optional
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	85	my $_CORP_HEADER_BEG = "idsHeader type=\"corpus\""; # just keep the correct order of the attributes and, if necessary, add a '.*' between them
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	86	# optional
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	87	my $_DOC_HEADER_BEG = "idsHeader type=\"document\""; # analog
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	88	# mandatory
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	89	my $_TEXT_HEADER_BEG = "idsHeader type=\"text\""; # analog
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	90
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	91
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	92	## external tokenization
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	93	my $_GEN_TOK_EXT = $tokenizer_call \|\| $tokenizer_korap ? 1 : 0;
				94
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	95	if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
				96	die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it");
				97	}
				98
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	99	my $ext_tok;
				100	if ($tokenizer_call) {
				101	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
				102	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	103
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	104	elsif ($tokenizer_korap) {
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	105	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	106	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	107	##
				108
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	109
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	110	#
				111	# ~~~ constants ~~~
				112	#
				113
				114
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	115	## internal tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	116	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				117	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	118	##
				119
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	120	## TODO: optional (different annotation tools can produce more zip-files for feeding into KorAP-XML-Krill)
				121	my $_TOKENS_PROC = 1; # on/off: processing of ${_TOKENS_TAG} elements (default: 1)
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	122
				123
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	124	# Name of the directory and the file containing all inline structure information
				125	# except for $_TOKENS_TAG information
				126	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
				127	$_structure_file .= '.xml';
				128
				129
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	130	# Name of the directory and the file containing all inline token information,
				131	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				132	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
				133	$_tokens_file .= '.xml';
				134
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	135	my $_TOKENS_TAG = "w"; # name of tag containing all information stored in $_tokens_file
				136
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	137	# Handling inline annotations (inside $_TOKENS_TAG)
				138	my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	139
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	140
				141	#
				142	# ~~~ variables ~~~
				143	#
				144
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	145	# Initialize Token- and Structure-Collector
				146	my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
				147	my $structures = KorAP::XML::TEI::Annotations::Collector->new;
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	148
				149
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	150	# Initialize Data-Collector
				151	my $data = KorAP::XML::TEI::Data->new;
				152
				153
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	154	# Initialize zipper
Akron	3bdc0a3	2020-08-03 12:12:56 +0200	[diff] [blame]	155	my $zipper = KorAP::XML::TEI::Zipper->new($_root_dir);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	156	my $input_fh; # input file handle (default: stdin)
				157
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	158	my $dir; # text directory (below $_root_dir)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	159
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	160	my ( $text_id,
				161	$text_id_esc ); # '$text_id_esc' = escaped version of $text_id
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	162
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	163	my ( $reader, # instance of 'XML::LibXML::Reader->new' (on input '$buf_in')
				164	$tree_data ); # instance of 'XML::CompactTree::XS::readSubtreeToPerl' (on input '$reader')
				165
				166	# these are only used inside recursive function 'retr_info'
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	167	my ( $_IDX, # value is set dependent on DEBUG - for extracting array of child elements from element in $tree_data
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	168	$e, # element from $tree_data
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	169	## variables for handling ~ whitespace related issue ~ (it is sometimes necessary, to correct the from-values for some tags)
				170	$add_one, # ...
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	171	$fval, # ...
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	172	%ws); # hash of indices of whitespace nodes (needed to correct from-values)
				173	# idea: when closing an element, check whether its from-index minus 1 refers to a whitespace node
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	174	# (i.e. 'from-index - 1' is a key in %ws).
				175	# if this is _not_ the case, then the from-value is one too high => correct it by subtracting 1
				176
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	177	my $c; # index variables used in loops
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	178
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	179
				180	#
				181	# ~~~ main ~~~
				182	#
				183
				184	# ~ initializations ~
				185
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	186	# Include line numbers in elements of $tree_data for debugging
				187	DEBUG ? ($_IDX = 5) : ($_IDX = 4);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	188
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	189	$fval = 0;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	190
Akron	ec2cef2	2020-07-31 10:00:15 +0200	[diff] [blame]	191	# Normalize regex for header parsing
				192	for ($_CORP_HEADER_BEG,
				193	$_DOC_HEADER_BEG,
				194	$_TEXT_HEADER_BEG) {
				195	s!^([^\s]+)(.*)$!$1\[\^>\]*$2!;
				196	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	197
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	198
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	199	# ~ read input and write output (text by text) ~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	200
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	201	my ( $pfx, $sfx );
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	202
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	203	my $tl = 0; # text line (needed for whitespace handling)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	204
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	205	$input_fh = *STDIN; # input file handle (default: stdin)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	206
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	207	# Maybe not necessary
				208	$data->reset;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	209
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	210	$dir = "";
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	211
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	212	if ( $input_fname ne '' ){
				213	unless (open($input_fh, '<', $input_fname)) {
				214	die $log->fatal("File '$input_fname' could not be opened.");
				215	};
				216	}
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	217
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	218	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	219	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	220
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	221	my $pos;
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	222	my $input_enc = 'UTF-8';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	223	my $l = length('</' . $_TEXT_BODY) + 1;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	224
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	225	# ~ loop (reading input document) ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	226
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	227	MAIN: while ( <$input_fh> ){
				228
				229	$_ = remove_xml_comments( $input_fh, $_ ); # remove XML (possibly multi-line) comments (<!--...-->)
				230
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	231	# Set input encoding
				232	if ( index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
				233	$input_enc = $2;
				234	next;
				235	};
				236
				237	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	238	$_ = replace_entities($_);
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	239
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	240	if ( index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$# ){
				241
				242	# ~ start of text body ~
				243
				244	$pfx = $1;
				245	$sfx = $2;
				246
				247	if ($pfx !~ /^\s*$/ \|\| $sfx !~ /^\s*$/) {
				248	die $log->fatal("input line number $.: " .
				249	"line with opening text-body tag '${_TEXT_BODY}' " .
				250	"contains additional information ... => Aborting (line=$_)");
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	251	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	252
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	253	# text body data extracted from input document ($input_fh), further processed by XML::LibXML::Reader
				254	my $buf_in = '';
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	255
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	256	# Iterate over all lines in the text body
				257	while (<$input_fh>) {
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	258
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	259	$_ = remove_xml_comments( $input_fh, $_ );
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	260	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	261	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	262
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	263	# ~ end of text body ~
				264	if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	265
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame^]	266	# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	267
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	268	if ((substr($_, 0, $pos) . substr($_, $l + $pos)) !~ /^\s*$/) {
				269	die $log->fatal("input line number $.: " .
				270	"line with closing text-body tag '${_TEXT_BODY}'".
				271	" contains additional information ... => Aborting (line=$_)");
				272	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	273
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	274	if ($dir ne "") {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	275
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	276	$reader = XML::LibXML::Reader->new( string => "<text>$buf_in</text>", huge => 1 );
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	277
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	278	# See notes on whitespace handling
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	279	my $param = XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_ATTRIBUTE_ARRAY;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	280
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	281	# XCT_LINE_NUMBERS is only needed for debugging
				282	# (see XML::CompactTree::XS)
				283	$param \|= XCT_LINE_NUMBERS if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	284	$tree_data = XML::CompactTree::XS::readSubtreeToPerl( $reader, $param);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	285
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	286	$structures->reset;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	287
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	288	$tokens->reset if $_TOKENS_PROC;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	289
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	290	# ~ whitespace related issue ~
				291	$add_one = 0;
				292	%ws = ();
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	293
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	294	# ~ recursion ~
				295	retr_info(1, \$tree_data->[2] ); # parse input data
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	296
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	297	if (DEBUG) {
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	298	$log->debug("Writing (utf8-formatted) xml file $dir/${_data_file}.xml");
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	299	};
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	300
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	301	# ~ write data.xml ~
				302	$data->to_zip(
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	303	$zipper->new_stream("$dir/${_data_file}.xml"),
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	304	$text_id_esc
				305	);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	306
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	307	# ~ tokenization ~
				308	if ($_GEN_TOK_EXT) {
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	309
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	310	# Tokenize and output
				311	$ext_tok->tokenize($data->data)->to_zip(
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame^]	312	$zipper->new_stream("$dir/$_tok_dir/${_tok_file_ext}.xml"),
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	313	$text_id_esc
				314	);
				315	};
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	316
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	317	if ($_GEN_TOK_INT) {
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	318
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	319	# Tokenize and output
				320	$cons_tok->tokenize($data->data)->to_zip(
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame^]	321	$zipper->new_stream("$dir/$_tok_dir/" . $cons_tok->name . '.xml'),
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	322	$text_id_esc
				323	);
Marc Kupietz	74ed7f3	2020-09-09 18:22:07 +0200	[diff] [blame]	324
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	325	$aggr_tok->tokenize($data->data)->to_zip(
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame^]	326	$zipper->new_stream("$dir/$_tok_dir/" . $aggr_tok->name . '.xml'),
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	327	$text_id_esc
				328	);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	329
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	330	$aggr_tok->reset;
				331	$cons_tok->reset;
				332	};
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	333
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	334	if ($use_tokenizer_sentence_splits) {
				335	$ext_tok->sentencize_from_previous_input($structures);
				336	}
				337
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	338	# ~ write structures ~
				339	if (!$structures->empty) {
				340	$structures->to_zip(
				341	$zipper->new_stream("$dir/$_structure_dir/$_structure_file"),
				342	$text_id_esc,
				343	2 # = structure serialization
				344	);
				345	};
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	346
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	347	# ~ write tokens ~
				348	if ($_TOKENS_PROC && !$tokens->empty) {
				349	$tokens->to_zip(
				350	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}"),
				351	$text_id_esc,
				352	$_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
				353	);
				354	};
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	355
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	356	$dir = ""; # reinit.
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	357
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	358	# Maybe not necessary
				359	$data->reset;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	360
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	361	} else { # $dir eq ""
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	362
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	363	$log->warn("Maybe empty textSigle => skipping this text ...\ndata=$data");
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	364	}
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	365
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	366	next MAIN;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	367	};
				368
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	369	# ~ inside text body ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	370
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	371	# ~ whitespace handling ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	372
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	373	# Fix whitespaces (see notes on whitespace fixing)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	374
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	375	# TODO: Maybe it's best, to keep the stripping of whitespace and to just remove the if-clause and to insert a blank by default (with possibly
				376	# an option on how newlines in primary text should be handled (stripped or replaced by a whitespace)).
				377
				378	# Remove consecutive whitespace at beginning and end (mostly one newline)
				379	s/^\s+//; s/\s+$//;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	380
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	381	### NOTE: this is only relevant, if a text consists of more than one line
				382	### TODO: find a better solution, or create a warning, if a text has more than one line ($tl > 1)
				383	### do testing with 2 different corpora (one with only one-line texts, the other with several lines per text)
				384	if (m/<[^>]+>[^<]/) { # line contains at least one tag with at least one character contents
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	385
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	386	$tl++; # counter for text lines
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	387
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	388	s/^(.)/ $1/ if $tl > 1; # insert blank before 1st character (for 2nd line and consecutive lines)
				389	}
				390	###
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	391
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	392	# add line to buffer
				393	$buf_in .= $_;
				394	};
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	395
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	396	} elsif (m#^(.*)(<(?:${_TEXT_HEADER_BEG}\|${_DOC_HEADER_BEG}\|${_CORP_HEADER_BEG}).*)$#) {
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	397
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	398	# ~ start of header ~
				399	$pfx = $1;
				400	my $content = "$2\n";
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	401
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	402	if ($pfx !~ /^\s*$/) {
				403	die $log->fatal("input line number $.: " .
				404	"line with opening header tag" .
				405	" is not in expected format ... => Aborting (line=$_)");
				406	};
				407
				408	# Parse header
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	409	my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	410
				411	# Header was parseable
				412	if ($header) {
				413
				414	# Write header to zip
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	415	my $file = $header->dir . '/' . $_header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	416
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	417	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	418
				419	$header->to_zip($zipper->new_stream($file));
				420
				421	# Header is for text level
				422	if ($header->type eq 'text') {
				423
				424	# Remember dir and sigles
				425	$dir = $header->dir;
				426	$text_id = $header->id;
				427	$text_id_esc = $header->id_esc;
				428
				429	# log output for seeing progression
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	430	$log->notice("$0: text_id=$text_id");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	431
				432	$tl = 0; # reset (needed for ~ whitespace handling ~)
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	433	}
				434	}
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	435	}
				436	} #end: while
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	437
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	438	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	439
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	440	$ext_tok->close if $_GEN_TOK_EXT;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	441
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	442	exit(0);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	443
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	444
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	445	# Recursively called function to handle XML tree data
				446	sub retr_info {
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	447	# recursion level
				448	# (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
				449	my $rl = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	450
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	451	my $dummy_anno;
				452	if ($use_tokenizer_sentence_splits) {
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	453	$dummy_anno = $structures->new_dummy_annotation;
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	454	}
				455
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	456	# Iteration through all array elements
				457	# ($_[0] is a reference to an array reference)
				458	# See notes on how 'XML::CompactTree::XS' works and
				459	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
				460	foreach $e (@{${$_[0]}}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	461
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	462	# Element node
				463	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	464
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	465	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	466	# from here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	467	#~~~~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	468
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	469	my $anno;
				470
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	471	# $e->[1] represents the tag name
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	472	if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
				473	$anno = $dummy_anno;
				474	} else {
				475	$anno = $structures->add_new_annotation($e->[1]);
				476	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	477
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	478
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	479	# Add element also to token list
				480	if ($_TOKENS_PROC && $e->[1] eq $_TOKENS_TAG) {
				481	$tokens->add_annotation($anno);
				482	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	483
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	484	# Handle attributes (if attributes exist)
				485	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	486
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	487	# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
				488	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				489	# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
				490	for ($c = 0; $c < @{$e->[3]}; $c += 2) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	491
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	492	# '$c' references the 'key' and '$c+1' the 'value'
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	493	$anno->add_attribute(
				494	@{$e->[3]}[$c, $c + 1]
				495	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	496	};
				497	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	498
				499	# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	500	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	501
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	502
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	503	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	504	# until here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	505	#~~~~
				506
				507
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	508	# Call function recursively
				509	# do no recursion, if $e->[$_IDX] is not defined
				510	# (because we have no array of child-nodes, e.g.: <back/>)
				511	if (defined $e->[$_IDX]) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	512
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	513	# Recursion with array of child-nodes
				514	retr_info($rl+1, \$e->[$_IDX]);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	515	}
				516
				517
				518	#~~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	519	# from here: tag-node (closing)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	520	#~~~~~
				521
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	522	# NOTE: use $pos, because the offsets are _between_ the characters
				523	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	524	my $pos = $data->position;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	525
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	526	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	527
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	528	$fval = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	529
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	530	# ~ whitespace related issue ~
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	531	if ($fval > 0 && not exists $ws{$fval - 1}) {
				532
				533	# ~ previous node was a text-node ~
				534	$anno->set_from($fval - 1);
				535	}
				536
				537	# in case this fails, check input
				538	if (($fval - 1) > $pos) {
				539	die $log->fatal("text_id='$text_id', " .
				540	"processing of structures: " .
				541	"from-value ($fval) is 2 or more greater " .
				542	"than to-value ($pos) => please check. Aborting");
				543	};
				544
				545	# TODO: find example for which this case applies
				546	# maybe this is not necessary anymore, because the above recorrection of the from-value suffices
				547	#
				548	# TODO: check, if it's better to remove this line and change above check to 'if ($fval - 1) >= $pos;
				549	# do testing with bigger corpus excerpt (wikipedia?)
				550	$anno->set_from($pos) if $fval == $pos + 1;
				551	$anno->set_to($pos);
				552	$anno->set_level($rl);
				553
				554	# Clean up whitespace
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	555	delete $ws{$fval - 1} if $fval > 0 && exists $ws{$fval - 1};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	556
				557
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	558	#~~~~
				559	# until here: tag-node (closing)
				560	#~~~~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	561	}
				562
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	563	# Text node
				564	elsif ($e->[0] == XML_READER_TYPE_TEXT){
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	565
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	566	$add_one = 1;
				567	$data->append($e->[1]);
				568	}
				569
				570	# Whitespace node
				571	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				572	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				573
				574	# state, that this from-index belongs to a whitespace-node
				575	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				576	$ws{$data->position}++;
				577
				578	$add_one = 0;
				579	$data->append($e->[1]);
				580	}
				581
				582	# not yet handled type
				583	else {
				584
				585	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				586	};
				587	};
				588	};
				589
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	590
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	591	__END__
				592
				593	=pod
				594
				595	=encoding utf8
				596
				597	=head1 NAME
				598
				599	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				600
				601	=head1 SYNOPSIS
				602
				603	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				604
				605	=head1 DESCRIPTION
				606
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	607	C<tei2korapxml> is a script to convert TEI P5 and
				608	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				609	based documents to the
				610	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				611	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	612	read from C<STDIN>. If no specific output is defined, data is written
				613	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	614
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	615	This program is usually called from inside another script.
				616
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	617	=head1 FORMATS
				618
				619	=head2 Input restrictions
				620
				621	=over 2
				622
				623	=item
				624
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	625	TEI P5 formatted input with certain restrictions:
				626
				627	=over 4
				628
				629	=item
				630
				631	B<mandatory>: text-header with integrated textsigle, text-body
				632
				633	=item
				634
				635	B<optional>: corp-header with integrated corpsigle,
				636	doc-header with integrated docsigle
				637
				638	=back
				639
				640	=item
				641
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	642	All tokens inside the primary text must not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	643	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	644	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	645	into blanks between 2 tokens could lead to additional blanks,
				646	where there should be none (e.g.: punctuation characters like C<,> or
				647	C<.> should not be separated from their predecessor token).
				648	(see also code section C<~ whitespace handling ~>).
				649
				650	=back
				651
				652	=head2 Notes on the output
				653
				654	=over 2
				655
				656	=item
				657
				658	zip file output (default on C<stdout>) with utf8 encoded entries
				659	(which together form the KorAP-XML format)
				660
				661	=back
				662
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	663	=head1 INSTALLATION
				664
				665	C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
				666	these bindings are available, the preferred way to install the script is
				667	to use L<cpanm\|App::cpanminus>.
				668
				669	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				670
				671	In case everything went well, the C<tei2korapxml> tool will
				672	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	673
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	674	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				675
				676	=head1 OPTIONS
				677
				678	=over 2
				679
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	680	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	681
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	682	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	683
				684	=item B<--help\|-h>
				685
				686	Print help information.
				687
				688	=item B<--version\|-v>
				689
				690	Print version information.
				691
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	692	=item B<--tokenizer-call\|-tc>
				693
				694	Call an external tokenizer process, that will tokenize
				695	a single line from STDIN and output one token per line.
				696
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	697	=item B<--tokenizer-korap\|-tk>
				698
				699	Use the standard KorAP/DeReKo tokenizer.
				700
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	701	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	702
				703	Tokenize the data using two embedded tokenizers,
				704	that will take an I<aggressive> and a I<conservative>
				705	approach.
				706
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	707	=item B<--inline-tokens> <foundry>#[<file>]
				708
				709	Define the foundry and file (without extension)
				710	to store inline token information in.
				711	If L</KORAPXMLTEI_INLINE> is set, this will contain
				712	annotations as well.
				713	Defaults to C<tokens> and C<morpho>.
				714
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	715	=item B<--inline-structures> <foundry>#[<file>]
				716
				717	Define the foundry and file (without extension)
				718	to store inline structure information in.
				719	Defaults to C<struct> and C<structures>.
				720
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	721	=item B<--base-foundry> <foundry>
				722
				723	Define the base foundry to store newly generated
				724	token information in.
				725	Defaults to C<base>.
				726
				727	=item B<--data-file> <file>
				728
				729	Define the file (without extension)
				730	to store primary data information in.
				731	Defaults to C<data>.
				732
				733	=item B<--header-file> <file>
				734
				735	Define the file name (without extension)
				736	to store header information on
				737	the corpus, document, and text level in.
				738	Defaults to C<header>.
				739
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	740	=item B<--use-tokenizer-sentence-splits\|-s>
				741
				742	Replace existing with, or add new, sentence boundary information
				743	provided by the KorAP tokenizer (currently supported only).
				744
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame^]	745	=item B<--tokens-file> <file>
				746
				747	Define the file (without extension)
				748	to store generated token information in
				749	(either from the KorAP tokenizer or an externally called tokenizer).
				750	Defaults to C<tokens>.
				751
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	752	=item B<--log\|-l>
				753
				754	Loglevel for I<Log::Any>. Defaults to C<notice>.
				755
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	756	=back
				757
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	758	=head1 ENVIRONMENT VARIABLES
				759
				760	=over 2
				761
				762	=item B<KORAPXMLTEI_DEBUG>
				763
				764	Activate minimal debugging.
				765	Defaults to C<false>.
				766
				767	=item B<KORAPXMLTEI_INLINE>
				768
				769	Process inline annotations, if present.
				770	Defaults to C<false>.
				771
				772	=back
				773
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	774	=head1 COPYRIGHT AND LICENSE
				775
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	776	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	777
				778	Author: Peter Harders
				779
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	780	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	781
				782	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				783	Corpus Analysis Platform at the
				784	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				785	member of the
				786	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				787
				788	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	789	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	790
				791	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	792
				793	# NOTES
				794
				795	## Notes on how 'XML::CompactTree::XS' works
				796
				797	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				798
				799	Print out name of 'node2' for the above example:
				800
				801	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				802
				803	Exploring the structure of $data ( = reference to below array ):
				804
				805	[ 0: XML_READER_TYPE_DOCUMENT,
				806	1: ?
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	807	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	808	1: 'node'
				809	2: ?
				810	3: HASH (attributes)
				811	4: 1 (line number)
				812	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				813	1: 'node1'
				814	2: ?
				815	3: undefined (no attributes)
				816	4: 1 (line number)
				817	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				818	1: 'some '
				819	]
				820	1: [ 0: XML_READER_TYPE_ELEMENT
				821	1: 'n'
				822	2: ?
				823	3: undefined (no attributes)
				824	4: 1 (line number)
				825	5: undefined (no child-nodes)
				826	]
				827	2: [ 0: XML_READER_TYPE_TEXT
				828	1: ' text'
				829	]
				830	]
				831	]
				832	1: [ 0: XML_READER_TYPE_ELEMENT
				833	1: 'node2'
				834	2: ?
				835	3: undefined (no attributes)
				836	4: 1 (line number)
				837	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				838	1: 'more-text'
				839	]
				840	]
				841	]
				842	]
				843	]
				844	]
				845	]
				846
				847	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				848
				849	ref($data->[2]) == ARRAY (with 1 element for 'node')
				850	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				851
				852	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				853	$data->[2]->[0]->[1] == 'node'
				854	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				855	$data->[2]->[0]->[4] == 1 (line number)
				856	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
				857	# child-nodes of actual node (see $_IDX)
				858
				859	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				860	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				861	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				862	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				863	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				864	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				865
				866	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				867	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				868	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				869
				870	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				871	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				872	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				873	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				874	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				875	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				876
				877	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				878	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				879	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				880
				881
				882	retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
				883	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				884	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				885
				886
				887	## Notes on whitespace handling
				888
				889	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
				890	(see function 'retr_info()').
				891
				892	Definition of significant and insignificant whitespace
				893	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				894
				895	Significant whitespace is part of the document content and should be preserved.
				896	Insignificant whitespace is used when editing XML documents for readability.
				897	These whitespaces are typically not intended for inclusion in the delivery of the document.
				898
				899	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				900
				901	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				902	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				903
				904	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				905	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				906	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				907
				908	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				909
				910
				911	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				912
				913	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				914	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				915
				916	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				917	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				918
				919	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				920	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				921
				922	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				923	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				924	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				925	the last read 'non-tag'-node has to be corrected (see [1]).
				926
				927	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				928	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				929
				930	[1]
				931	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				932	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
				933	(see above code fragment '... not exists $ws{ $fval - 1 } ...').
				934
				935	[2]
				936	In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled as a
				937	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				938
				939	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				940	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				941
				942	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
				943	and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
				944
				945
				946	## Notes on whitespace fixing
				947
				948	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				949	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				950
				951	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				952	example further down and notes on 'Input restrictions' in the manpage).
				953
				954	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				955
				956	Examples (how primary text with linebreaks would be converted by below code):
				957
				958	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				959	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,<w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
				960
				961	Blanks are inserted before the 1st character:
				962
				963	NOTE: not stringent ('...' stands for text):
				964
				965	beg1............................end1 => no blank before 'beg1'
				966	beg2....<pb/>...................end2 => no blank before 'beg2'
				967	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				968	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				969
				970	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				971	^
				972	\|_blank between 'end3' and 'beg4'
				973
				974
				975	## Notes on segfault prevention
				976
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	977	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	978	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				979	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				980	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
				981	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.