Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: e2768226abd02e8d6435a2b2a612d048055ddf3f [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	31	eval {
				32	require KorAP::XML::TEI::Tokenizer::KorAP;
				33	1;
				34	};
Peter Harders	1c5ce15	2020-07-22 18:02:50 +0200	[diff] [blame]	35
Marc Kupietz	a1421f0	2021-02-18 15:32:38 +0100	[diff] [blame]	36	our $VERSION = '1.00';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	37
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	38	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				39
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	40	# Set the environment variable KORAPXMLTEI_DEBUG to 1 for a little more debug output (not exposed as a command-line option)
				41	use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	42
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	43	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	44	GetOptions(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	45	"root|r=s" => \(my $root_dir = '.'),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	46	"input|i=s" => \(my $input_fname = ''),
				47	'tokenizer-call|tc=s' => \(my $tokenizer_call),
				48	'tokenizer-korap|tk' => \(my $tokenizer_korap),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	49	'tokenizer-internal|ti' => \(my $tokenizer_intern),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	50	'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
				51	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
				52	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
				53	'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	54	'base-foundry=s' => \(my $base_dir = 'base'),
				55	'data-file=s' => \(my $data_file = 'data'),
				56	'header-file=s' => \(my $header_file = 'header'),
				57	'tokens-file=s' => \(my $tokens_file = 'tokens'),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	58	'log|l=s' => \(my $log_level = 'notice'),
				59	'help|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	60	pod2usage(
				61	-verbose => 99,
				62	-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
				63	-msg => $VERSION_MSG,
				64	-output => '-'
				65	)
				66	},
				67	'version|v' => sub {
				68	pod2usage(
				69	-verbose => 0,
				70	-msg => $VERSION_MSG,
				71	-output => '-'
				72	)
				73	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	74	);
				75
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	76	# Establish logger
Marc Kupietz	44b1f25	2020-11-26 16:31:40 +0100	[diff] [blame]	77	binmode(STDERR, ":encoding(UTF-8)");
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	78	Log::Any::Adapter->set('Stderr', log_level => $log_level);
				79
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	80	$log->notice('Debugging is activated') if DEBUG;
				81
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	82	# tag (without attributes), which contains the primary text
				83	my $_TEXT_BODY = 'text';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	84	# optional
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	85
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	86	# TODO: IDS-specific (and redundant)
				87	my $_HEADER_TAG = 'idsHeader';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	88
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	89	# name of the tag containing all information stored in $_tokens_file
				90	my $_TOKENS_TAG = 'w';
				91
				92
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	93	if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
				94	die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it");
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	95	};
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	96
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	97	my $ext_tok;
				98	if ($tokenizer_call) {
				99	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
				100	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	101
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	102	elsif ($tokenizer_korap) {
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	103	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	104	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	105	##
				106
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	107
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	108	#
				109	# ~~~ constants ~~~
				110	#
				111
				112
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	113	## intern tokenization
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	114	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				115	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	116	##
				117
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	118	# Name of the directory and the file containing all inline structure information
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	119	# except for $_TOKENS_TAG information
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	120	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	121
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	122	# Name of the directory and the file containing all inline token information
				123	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				124	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	125
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	126	# Handling inline annotations (inside $_TOKENS_TAG)
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	127	my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
				128
				129	# Initialize Token- and Structure-Collector
				130	my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
				131	my $structures = KorAP::XML::TEI::Annotations::Collector->new;
				132
				133	# Initialize Data-Collector
				134	my $data = KorAP::XML::TEI::Data->new;
				135
				136	# Initialize zipper
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	137	my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	138
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	139
				140	#
				141	# ~~~ variables ~~~
				142	#
				143
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame^]	144	# text directory (below $root_dir)
				145	my $dir = '';
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	146
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame^]	147	# Escaped version of text id
				148	my $text_id_esc;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	149
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	150	# element from $tree_data
				151	my $e;
				152
				153	# Keeping track of the current positions in the text
				154	my $pos;
				155
				156	# Default encoding of the text
				157	my $input_enc = 'UTF-8';
				158
				159	# variables for handling ~ whitespace related issue ~
				160	# (it is sometimes necessary, to correct the from-values for some tags)
				161	my $add_one;
				162	my $from = 0;
				163
				164	# text line (needed for whitespace handling)
				165	my $text_line = 0;
				166
				167	# hash for indices of whitespace-nodes
				168	# (needed to recorrect from-values)
				169	# IDEA:
				170	# when closing an element, check if its from-index minus 1 refers to a whitespace-node
				171	# (means: 'from-index - 1' is a key in %ws).
				172	# if this is _not_ the case, then the from-value is one
				173	# too high => correct it by subtracting 1
				174	my %ws;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	175
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	176
				177	#
				178	# ~~~ main ~~~
				179	#
				180
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	181	# ~ read input and write output (text by text) ~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	182
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	183	# Input file handle (default: stdin)
				184	my $input_fh = *STDIN;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	185
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	186	if ($input_fname ne '') {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	187	unless (open($input_fh, '<', $input_fname)) {
				188	die $log->fatal("File '$input_fname' could not be opened.");
				189	};
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	190	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	191
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	192	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	193	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	194
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	195
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	196	# Reading input document
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	197	MAIN: while ( <$input_fh> ){
				198
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	199	# remove XML (multi-line) comments (<!--...-->)
				200	$_ = remove_xml_comments( $input_fh, $_ );
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	201
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	202	# Set input encoding
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	203	if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	204	$input_enc = $2;
				205	next;
				206	};
				207
				208	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	209	$_ = replace_entities($_);
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	210
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	211	# Start of Text body
				212	if (index($_, $_TEXT_BODY) >= 0 && m#^(.)<${_TEXT_BODY}(?: [^>])?>(.*)$#){
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	213
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	214	my $suffix = $2;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	215
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	216	if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	217	die $log->fatal("input line number $.: " .
				218	"line with opening text-body tag '${_TEXT_BODY}' " .
				219	"contains additional information ... => Aborting (line=$_)");
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	220	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	221
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	222	# Text body data extracted from input document ($input_fh),
				223	# further processed by XML::LibXML::Reader
				224	my $text_buffer = '';
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	225
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	226	# Iterate over all lines in the text body
				227	while (<$input_fh>) {
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	228
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	229	$_ = remove_xml_comments( $input_fh, $_ );
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	230	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	231	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	232
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	233	# End of text body
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	234	if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	235
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	236	# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	237
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	238	if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	239	die $log->fatal("input line number $.: " .
				240	"line with closing text-body tag '${_TEXT_BODY}'".
				241	" contains additional information ... => Aborting (line=$_)");
				242	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	243
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	244	if ($dir eq '') {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	245	$log->warn(
				246	"Maybe empty textSigle => skipping this text ...\n" .
				247	'data=' . substr($data->data, 0, 200)
				248	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	249	next MAIN;
				250	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	251
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	252	my $reader = XML::LibXML::Reader->new(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	253	string => "<text>$text_buffer</text>",
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	254	huge => 1
				255	);
				256
				257	# See notes on whitespace handling
				258	my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
				259
				260	# XCT_LINE_NUMBERS is only needed for debugging
				261	# (see XML::CompactTree::XS)
				262	$param |= XCT_LINE_NUMBERS if DEBUG;
				263	my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);
				264
				265	$structures->reset;
				266
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	267	$tokens->reset unless $skip_inline_tokens;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	268
				269	# ~ whitespace related issue ~
				270	$add_one = 0;
				271	%ws = ();
				272
				273	# ~ recursion ~
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	274	descend(1, $tree_data->[2]); # parse input data
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	275
				276	if (DEBUG) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	277	$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	278	};
				279
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	280	# Write data.xml
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	281	$data->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	282	$zipper->new_stream("$dir/${data_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	283	$text_id_esc
				284	);
				285
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	286	# Tokenize with external tokenizer
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	287	if ($ext_tok) {
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	288
				289	# Tokenize and output
				290	$ext_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	291	$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	292	$text_id_esc
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	293	);
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	294
				295	if ($use_tokenizer_sentence_splits) {
				296	$ext_tok->sentencize_from_previous_input($structures);
				297	};
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	298	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	299
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	300	# Tokenize with internal tokenizer
				301	if ($tokenizer_intern) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	302
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	303	# Tokenize and output
				304	$cons_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	305	$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	306	$text_id_esc
				307	);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	308
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	309	$aggr_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	310	$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	311	$text_id_esc
				312	);
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	313
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	314	$aggr_tok->reset;
				315	$cons_tok->reset;
				316	};
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	317
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	318	# ~ write structures ~
				319	if (!$structures->empty) {
				320	$structures->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	321	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	322	$text_id_esc,
				323	2 # = structure serialization
				324	);
				325	};
				326
				327	# ~ write tokens ~
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	328	unless ($skip_inline_tokens || $tokens->empty) {
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	329	$tokens->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	330	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	331	$text_id_esc,
				332	$_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
				333	);
				334	};
				335
				336	# reinit.
				337	$dir = '';
				338
				339	# Maybe not necessary
				340	$data->reset;
				341
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	342	next MAIN;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	343	};
				344
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	345
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	346	# ~ whitespace handling ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	347
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	348	# Fix whitespaces (see notes on whitespace fixing)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	349
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	350	# TODO:
				351	# Maybe it's best, to keep the stripping of whitespace and
				352	# to just remove the if-clause and to insert a blank by default
				353	# (with possibly an option on how newlines in primary text should
				354	# be handled (stripped or replaced by a whitespace)).
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	355
				356	# Remove consecutive whitespace at beginning and end (mostly one newline)
				357	s/^\s+//; s/\s+$//;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	358
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	359	# NOTE:
				360	# this is only relevant, if a text consists of more than one line
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	361
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	362	# TODO:
				363	# find a better solution, or create a warning, if a text has more
				364	# than one line ($text_line > 1)
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	365
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	366	# TODO:
				367	# do testing with 2 different corpora
				368	# (one with only one-line texts, the other with several lines per text)
				369
				370	# line contains at least one tag with at least one character contents
				371	if (m/<[^>]+>[^<]/) {
				372
				373	# Increment counter for text lines
				374	$text_line++;
				375
				376	# insert blank before 1st character
				377	#(for 2nd line and consecutive lines)
				378	s/^(.)/ $1/ if $text_line > 1;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	379	}
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	380
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	381	# add line to buffer
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	382	$text_buffer .= $_;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	383	};
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	384
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	385	} elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	386
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	387	# ~ start of header ~
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	388	my $content = "$2\n";
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	389
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	390	if ($1 !~ /^\s*$/) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	391	die $log->fatal(
				392	"input line number $.: " .
				393	'line with opening header tag is not in expected format ... ' .
				394	"=> Aborting (line=$_)");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	395	};
				396
				397	# Parse header
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	398	my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	399
				400	# Header was parseable
				401	if ($header) {
				402
				403	# Write header to zip
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	404	my $file = $header->dir . '/' . $header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	405
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	406	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	407
				408	$header->to_zip($zipper->new_stream($file));
				409
				410	# Header is for text level
				411	if ($header->type eq 'text') {
				412
				413	# Remember dir and sigles
				414	$dir = $header->dir;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	415	$text_id_esc = $header->id_esc;
				416
				417	# log output for seeing progression
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame^]	418	$log->notice("$0: text_id=$text_id_esc");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	419
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	420	# Reset counter for text lines
				421	# (needed for whitespace handling)
				422	$text_line = 0;
				423	};
				424	};
				425	};
				426	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	427
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	428	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	429
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	430	$ext_tok->close if $ext_tok;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	431
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	432	close $input_fh;
				433
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	434	exit(0);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	435
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	436
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	437	# Recursively called function to handle XML tree data
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	438	sub descend {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	439
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	440	# recursion level
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	441	# (1 = topmost level inside descend() = should always be level of tag $_TEXT_BODY)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	442	my $depth = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	443
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	444	# Iteration through all array elements
				445	# ($_[0] is a reference to an array reference)
				446	# See notes on how 'XML::CompactTree::XS' works and
				447	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
Akron	3556c75	2021-02-24 09:53:24 +0100	[diff] [blame]	448	foreach $e (@{$_[0]}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	449
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	450	# Element node
				451	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	452
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	453	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	454	# from here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	455	#~~~~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	456
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	457	# Get the child index depending on the debug state.
				458	# This is likely to be optimized away by the compiler.
				459	my $children = $e->[DEBUG ? 5 : 4];
				460
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	461	# $e->[1] represents the tag name
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	462	# Skip sentences
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	463	if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	464	descend($depth+1, $children) if defined $children;
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	465	next;
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	466	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	467
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	468	my $anno = $structures->add_new_annotation($e->[1]);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	469
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	470	# Add element also to token list
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	471	if (!$skip_inline_tokens && $e->[1] eq $_TOKENS_TAG) {
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	472	$tokens->add_annotation($anno);
				473	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	474
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	475	# Handle attributes (if attributes exist)
				476	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	477
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	478	# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
				479	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				480	# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	481	for (local $_ = 0; $_ < @{$e->[3]}; $_ += 2) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	482
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	483	# '$_' references the 'key' and '$_+1' the 'value'
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	484	$anno->add_attribute(
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	485	@{$e->[3]}[$_, $_ + 1]
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	486	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	487	};
				488	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	489
				490	# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	491	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	492
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	493
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	494	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	495	# until here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	496	#~~~~
				497
				498
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	499	# Call function recursively
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	500	# do not recurse, if $children is not defined
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	501	# (because we have no array of child-nodes, e.g.: <back/>)
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	502	descend($depth+1, $children) if defined $children;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	503
				504
				505	#~~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	506	# from here: tag-node (closing)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	507	#~~~~~
				508
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	509	# NOTE: use $pos, because the offsets are _between_ the characters
				510	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	511	my $pos = $data->position;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	512
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	513	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	514
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	515	$from = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	516
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	517	# ~ whitespace related issue ~
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	518	if ($from > 0 && not exists $ws{$from - 1}) {
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	519
				520	# ~ previous node was a text-node ~
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	521	$anno->set_from($from - 1);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	522	};
				523
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	524	# in case this fails, check input
				525	if (($from - 1) > $pos) {
				526	die $log->fatal(
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame^]	527	"text_id='$text_id_esc', " .
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	528	'processing of structures: ' .
				529	"from-value ($from) is 2 or more greater " .
				530	"than to-value ($pos) => please check. Aborting"
				531	);
				532	};
				533
				534	# TODO:
				535	# find example for which this case applies
				536	# maybe this is not necessary anymore, because the
				537	# above recorrection of the from-value suffices
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	538	#
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	539	# TODO:
				540	# check, if it's better to remove this line and
				541	# change above check to 'if ($from - 1) >= $pos;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	542	# do testing with bigger corpus excerpt (wikipedia?)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	543	$anno->set_from($pos) if $from == $pos + 1;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	544	$anno->set_to($pos);
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	545	$anno->set_level($depth);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	546
				547	# Clean up whitespace
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	548	delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	549
				550
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	551	#~~~~
				552	# until here: tag-node (closing)
				553	#~~~~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	554	}
				555
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	556	# Text node
				557	elsif ($e->[0] == XML_READER_TYPE_TEXT){
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	558
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	559	$add_one = 1;
				560	$data->append($e->[1]);
				561	}
				562
				563	# Whitespace node
				564	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				565	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				566
				567	# state, that this from-index belongs to a whitespace-node
				568	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				569	$ws{$data->position}++;
				570
				571	$add_one = 0;
				572	$data->append($e->[1]);
				573	}
				574
				575	# not yet handled type
				576	else {
				577
				578	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				579	};
				580	};
				581	};
				582
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	583
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	584	__END__
				585
				586	=pod
				587
				588	=encoding utf8
				589
				590	=head1 NAME
				591
				592	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				593
				594	=head1 SYNOPSIS
				595
				596	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				597
				598	=head1 DESCRIPTION
				599
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	600	C<tei2korapxml> is a script to convert TEI P5 and
				601	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				602	based documents to the
				603	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				604	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	605	read from C<STDIN>. If no specific output is defined, data is written
				606	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	607
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	608	This program is usually called from inside another script.
				609
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	610	=head1 FORMATS
				611
				612	=head2 Input restrictions
				613
				614	=over 2
				615
				616	=item
				617
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	618	TEI P5 formatted input with certain restrictions:
				619
				620	=over 4
				621
				622	=item
				623
				624	B<mandatory>: text-header with integrated textsigle, text-body
				625
				626	=item
				627
				628	B<optional>: corp-header with integrated corpsigle,
				629	doc-header with integrated docsigle
				630
				631	=back
				632
				633	=item
				634
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	635	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	636	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	637	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	638	into blanks between 2 tokens could lead to additional blanks,
				639	where there should be none (e.g.: punctuation characters like C<,> or
				640	C<.> should not be separated from their predecessor token).
				641	(see also code section C<~ whitespace handling ~>).
				642
				643	=back
				644
				645	=head2 Notes on the output
				646
				647	=over 2
				648
				649	=item
				650
				651	zip file output (default on C<stdout>) with utf8 encoded entries
				652	(which together form the KorAP-XML format)
				653
				654	=back
				655
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	656	=head1 INSTALLATION
				657
				658	C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
				659	these bindings are available, the preferred way to install the script is
				660	to use L<cpanm\|App::cpanminus>.
				661
				662	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				663
				664	In case everything went well, the C<tei2korapxml> tool will
				665	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	666
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	667	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				668
				669	=head1 OPTIONS
				670
				671	=over 2
				672
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	673	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	674
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	675	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	676
				677	=item B<--help\|-h>
				678
				679	Print help information.
				680
				681	=item B<--version\|-v>
				682
				683	Print version information.
				684
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	685	=item B<--tokenizer-call\|-tc>
				686
				687	Call an external tokenizer process, that will tokenize
				688	a single line from STDIN and outputs one token per line.
				689
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	690	=item B<--tokenizer-korap\|-tk>
				691
				692	Use the standard KorAP/DeReKo tokenizer.
				693
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	694	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	695
				696	Tokenize the data using two embedded tokenizers,
				697	that will take an I<aggressive> and a I<conservative>
				698	approach.
				699
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	700	=item B<--skip-inline-tokens>
				701
				702	Boolean flag indicating that inline tokens should not
				703	be processed. Defaults to false (meaning inline tokens will be processed).
				704
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	705	=item B<--inline-tokens> <foundry>#[<file>]
				706
				707	Define the foundry and file (without extension)
				708	to store inline token information in.
				709	If L</KORAPXMLTEI_INLINE> is set, this will contain
				710	annotations as well.
				711	Defaults to C<tokens> and C<morpho>.
				712
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	713	=item B<--inline-structures> <foundry>#[<file>]
				714
				715	Define the foundry and file (without extension)
				716	to store inline structure information in.
				717	Defaults to C<struct> and C<structures>.
				718
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	719	=item B<--base-foundry> <foundry>
				720
				721	Define the base foundry to store newly generated
				722	token information in.
				723	Defaults to C<base>.
				724
				725	=item B<--data-file> <file>
				726
				727	Define the file (without extension)
				728	to store primary data information in.
				729	Defaults to C<data>.
				730
				731	=item B<--header-file> <file>
				732
				733	Define the file name (without extension)
				734	to store header information on
				735	the corpus, document, and text level in.
				736	Defaults to C<header>.
				737
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	738	=item B<--use-tokenizer-sentence-splits\|-s>
				739
				740	Replace existing sentence boundary information with, or add, the
				741	sentence boundaries provided by the KorAP tokenizer (currently the only tokenizer for which this is supported).
				742
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	743	=item B<--tokens-file> <file>
				744
				745	Define the file (without extension)
				746	to store generated token information in
				747	(either from the KorAP tokenizer or an externally called tokenizer).
				748	Defaults to C<tokens>.
				749
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	750	=item B<--log\|-l>
				751
				752	Loglevel for I<Log::Any>. Defaults to C<notice>.
				753
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	754	=back
				755
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	756	=head1 ENVIRONMENT VARIABLES
				757
				758	=over 2
				759
				760	=item B<KORAPXMLTEI_DEBUG>
				761
				762	Activate minimal debugging.
				763	Defaults to C<false>.
				764
				765	=item B<KORAPXMLTEI_INLINE>
				766
				767	Process inline annotations, if present.
				768	Defaults to C<false>.
				769
				770	=back
				771
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	772	=head1 COPYRIGHT AND LICENSE
				773
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	774	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	775
				776	Author: Peter Harders
				777
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	778	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	779
				780	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				781	Corpus Analysis Platform at the
				782	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				783	member of the
				784	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				785
				786	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	787	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	788
				789	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	790
				791	# NOTES
				792
				793	## Notes on how 'XML::CompactTree::XS' works
				794
				795	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				796
				797	Print out name of 'node2' for the above example:
				798
				799	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				800
				801	Exploring the structure of $data ( = reference to below array ):
				802
				803	[ 0: XML_READER_TYPE_DOCUMENT,
				804	1: ?
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	805	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	806	1: 'node'
				807	2: ?
				808	3: HASH (attributes)
				809	4: 1 (line number)
				810	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				811	1: 'node1'
				812	2: ?
				813	3: undefined (no attributes)
				814	4: 1 (line number)
				815	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				816	1: 'some '
				817	]
				818	1: [ 0: XML_READER_TYPE_ELEMENT
				819	1: 'n'
				820	2: ?
				821	3: undefined (no attributes)
				822	4: 1 (line number)
				823	5: undefined (no child-nodes)
				824	]
				825	2: [ 0: XML_READER_TYPE_TEXT
				826	1: ' text'
				827	]
				828	]
				829	]
				830	1: [ 0: XML_READER_TYPE_ELEMENT
				831	1: 'node2'
				832	2: ?
				833	3: undefined (no attributes)
				834	4: 1 (line number)
				835	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				836	1: 'more-text'
				837	]
				838	]
				839	]
				840	]
				841	]
				842	]
				843	]
				844
				845	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				846
				847	ref($data->[2]) == ARRAY (with 1 element for 'node')
				848	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				849
				850	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				851	$data->[2]->[0]->[1] == 'node'
				852	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				853	$data->[2]->[0]->[4] == 1 (line number)
				854	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	855	# child-nodes of actual node (see $children)
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	856
				857	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				858	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				859	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				860	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				861	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				862	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				863
				864	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				865	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				866	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				867
				868	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				869	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				870	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				871	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				872	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				873	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				874
				875	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				876	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				877	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				878
				879
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	880	descend() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	881	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				882	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				883
				884
				885	## Notes on whitespace handling
				886
				887	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	888	(see function 'descend()').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	889
				890	Definition of significant and insignificant whitespace
				891	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				892
				893	Significant whitespace is part of the document content and should be preserved.
				894	Insignificant whitespace is used when editing XML documents for readability.
				895	These whitespaces are typically not intended for inclusion in the delivery of the document.
				896
				897	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				898
				899	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				900	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				901
				902	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				903	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				904	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				905
				906	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				907
				908
				909	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				910
				911	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				912	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				913
				914	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				915	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				916
				917	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				918	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				919
				920	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				921	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				922	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				923	the last read 'non-tag'-node has to be corrected (see [1]).
				924
				925	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				926	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				927
				928	[1]
				929	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				930	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	931	(see above code fragment '... not exists $ws{ $from - 1 } ...').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	932
				933	[2]
				934	In both of the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled as a
				935	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				936
				937	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				938	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				939
				940	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
				941	and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
				942
				943
				944	## Notes on whitespace fixing
				945
				946	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				947	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				948
				949	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				950	example further down and notes on 'Input restrictions' in the manpage).
				951
				952	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				953
				954	Examples (how primary text with linebreaks would be converted by below code):
				955
				956	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				957	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
				958
				959	Blanks are inserted before the 1st character:
				960
				961	NOTE: not stringent ('...' stands for text):
				962
				963	beg1............................end1 => no blank before 'beg1'
				964	beg2....<pb/>...................end2 => no blank before 'beg2'
				965	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				966	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				967
				968	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				969	^
				970	\|_blank between 'end3' and 'beg4'
				971
				972
				973	## Notes on segfault prevention
				974
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	975	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	976	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				977	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				978	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				979	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.