Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 6e44f95efedee12e43d75d9cc55cc6df6f66c3fd [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	31	eval {
				32	require KorAP::XML::TEI::Tokenizer::KorAP;
				33	1;
				34	};
Peter Harders	1c5ce15	2020-07-22 18:02:50 +0200	[diff] [blame]	35
Marc Kupietz	a1421f0	2021-02-18 15:32:38 +0100	[diff] [blame]	36	our $VERSION = '1.00';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	37
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	38	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				39
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	40	# Set the environment variable KORAPXMLTEI_DEBUG to 1 for slightly more debug output (no command line parameter needed)
				41	use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	42
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	43	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	44	GetOptions(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	45	"root|r=s" => \(my $root_dir = '.'), # handed to the zipper
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	46	"input|i=s" => \(my $input_fname = ''), # empty string: read from STDIN
				47	'tokenizer-call|tc=s' => \(my $tokenizer_call), # handed to Tokenizer::External
				48	'tokenizer-korap|tk' => \(my $tokenizer_korap), # use Tokenizer::KorAP
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	49	'tokenizer-internal|ti' => \(my $tokenizer_intern), # run conservative and aggressive tokenizer
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	50	'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits), # only valid together with -tk
				51	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'), # format: directory#file
				52	'inline-structures=s' => \(my $inline_structures = 'struct#structure'), # format: directory#file
				53	'skip-inline-tokens' => \(my $skip_inline_tokens = 0), # do not collect/write inline tokens
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	54	'base-foundry=s' => \(my $base_dir = 'base'), # directory for tokenizer output
				55	'data-file=s' => \(my $data_file = 'data'), # file name without '.xml'
				56	'header-file=s' => \(my $header_file = 'header'), # file name without '.xml'
				57	'tokens-file=s' => \(my $tokens_file = 'tokens'), # file name without '.xml'
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	58	'log|l=s' => \(my $log_level = 'notice'), # log level of the Stderr adapter
				59	'help|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	60	pod2usage(
				61	-verbose => 99,
				62	-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
				63	-msg => $VERSION_MSG,
				64	-output => '-'
				65	)
				66	},
				67	'version|v' => sub {
				68	pod2usage(
				69	-verbose => 0,
				70	-msg => $VERSION_MSG,
				71	-output => '-'
				72	)
				73	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	74	);
				75
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	76	# Establish logger (messages are written to STDERR, encoded as UTF-8)
Marc Kupietz	44b1f25	2020-11-26 16:31:40 +0100	[diff] [blame]	77	binmode(STDERR, ":encoding(UTF-8)");
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	78	Log::Any::Adapter->set('Stderr', log_level => $log_level);
				79
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	80	$log->notice('Debugging is activated') if DEBUG;
				81
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	82	# Name of the tag (without attributes) that encloses the primary text
				83	my $_TEXT_BODY = 'text';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	84	# optional
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	85
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	86	# TODO: IDS-specific (and redundant)
				87	my $_HEADER_TAG = 'idsHeader';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	88
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	89	# Name of the tag whose annotations are additionally stored in $_tokens_file
				90	my $_TOKENS_TAG = 'w';
				91
				92
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	93	if ($use_tokenizer_sentence_splits && !$tokenizer_korap) { # -s is only valid together with -tk
				94	die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	95	};
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	96
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	97	my $ext_tok; # external tokenizer; stays undefined unless -tc or -tk is given
				98	if ($tokenizer_call) { # -tc takes precedence over -tk
				99	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
				100	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	101
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	102	elsif ($tokenizer_korap) {
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	103	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	104	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	105	##
				106
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	107
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	108	#
				109	# ~~~ constants ~~~
				110	#
				111
				112
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	113	## internal tokenization (both tokenizers are only run if -ti is given)
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	114	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				115	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	116	##
				117
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	118	# Name of the directory and the file containing all inline structure information
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	119	# (the appended default supplies the file name, if only a directory was passed)
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	120	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	121
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	122	# Name of the directory and the file containing all inline token information
				123	# i.e. tokens of the $_TOKENS_TAG, unless --skip-inline-tokens is given
				124	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	125
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	126	# Handling inline annotations (inside $_TOKENS_TAG), switched on by the environment variable KORAPXMLTEI_INLINE
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	127	my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
				128
				129	# Initialize Token- and Structure-Collector
				130	my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
				131	my $structures = KorAP::XML::TEI::Annotations::Collector->new;
				132
				133	# Initialize Data-Collector
				134	my $data = KorAP::XML::TEI::Data->new;
				135
				136	# Initialize zipper
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	137	my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	138
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	139
				140	#
				141	# ~~~ variables ~~~
				142	#
				143
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	144	# text directory (below $root_dir); empty until a text-level header was parsed
				145	my $dir = '';
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	146
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	147	# Escaped version of text id (taken from the text-level header)
				148	my $text_id_esc;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	149
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	150	# element from $tree_data
				151	my $e;
				152
				153	# Offset of the closing text-body tag within the current input line
				154	my $pos;
				155
				156	# Default encoding of the text (overridden by the encoding of an XML declaration)
				157	my $input_enc = 'UTF-8';
				158
				159	# variables for handling ~ whitespace related issue ~
				160	# (it is sometimes necessary to correct the from-values for some tags)
				161	my $add_one;
				162	my $from = 0;
				163
				164	# text line (needed for whitespace handling)
				165	my $text_line = 0;
				166
				167	# hash for indices of whitespace-nodes
				168	# (needed to recorrect from-values)
				169	# IDEA:
				170	# when closing an element, check if its from-index minus 1 refers to a whitespace-node
				171	# (means: 'from-index - 1' is a key in %ws).
				172	# if this is _not_ the case, then the from-value is one
				173	# too high => correct it by subtracting 1
				174	my %ws;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	175
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	176
				177	#
				178	# ~~~ main ~~~
				179	#
				180
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	181	# ~ read input and write output (text by text) ~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	182
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	183	# Input file handle (default: stdin, replaced if an input file name was passed)
				184	my $input_fh = *STDIN;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	185
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	186	if ($input_fname ne '') {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	187	unless (open($input_fh, '<', $input_fname)) {
				188	die $log->fatal("File '$input_fname' could not be opened.");
				189	};
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	190	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	191
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	192	# Read raw bytes (lines are decoded explicitly later on); prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	193	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	194
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	195
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	196	# Reading input document
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	197	MAIN: while ( <$input_fh> ){
				198
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	199	# remove XML (multi-line) comments (<!--...-->)
				200	$_ = remove_xml_comments( $input_fh, $_ );
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	201
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	202	# Set input encoding from the XML declaration (the declaration line itself is skipped)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	203	if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	204	$input_enc = $2;
				205	next;
				206	};
				207
				208	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	209	$_ = replace_entities($_);
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	210
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	211	# Start of text body (the opening tag has to stand alone on its line)
				212	if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#){
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	213
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	214	my $suffix = $2;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	215
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	216	if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	217	die $log->fatal("input line number $.: " .
				218	"line with opening text-body tag '${_TEXT_BODY}' " .
				219	"contains additional information ... => Aborting (line=$_)");
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	220	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	221
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	222	# Text body data extracted from input document ($input_fh),
				223	# further processed by XML::LibXML::Reader
				224	my $text_buffer = '';
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	225
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	226	# Iterate over all lines in the text body
				227	while (<$input_fh>) {
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	228
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	229	$_ = remove_xml_comments( $input_fh, $_ );
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	230	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	231	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	232
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	233	# End of text body
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	234	if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	235
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	236	# write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	237
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	238	if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	239	die $log->fatal("input line number $.: " .
				240	"line with closing text-body tag '${_TEXT_BODY}'".
				241	" contains additional information ... => Aborting (line=$_)");
				242	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	243
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	244	if ($dir eq '') { # no text-level header was seen for this text body
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	245	$log->warn(
				246	"Maybe empty textSigle => skipping this text ...\n" .
				247	'data=' . substr($data->data, 0, 200)
				248	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	249	next MAIN;
				250	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	251
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	252	my $reader = XML::LibXML::Reader->new(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	253	string => "<text>$text_buffer</text>",
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	254	huge => 1
				255	);
				256
				257	# See notes on whitespace handling
				258	my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;
				259
				260	# XCT_LINE_NUMBERS is only needed for debugging
				261	# (see XML::CompactTree::XS)
				262	$param |= XCT_LINE_NUMBERS if DEBUG;
				263	my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);
				264
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	265	# ~ whitespace related issue ~
				266	$add_one = 0;
				267	%ws = ();
				268
				269	# ~ recursion ~
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	270	descend(1, $tree_data->[2]); # parse input data
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	271
				272	if (DEBUG) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	273	$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	274	};
				275
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	276	# Write data.xml
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	277	$data->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	278	$zipper->new_stream("$dir/${data_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	279	$text_id_esc
				280	);
				281
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	282	# Tokenize with external tokenizer
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	283	if ($ext_tok) {
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	284
				285	# Tokenize and output
				286	$ext_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	287	$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	288	$text_id_esc
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	289	);
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	290
				291	if ($use_tokenizer_sentence_splits) {
				292	$ext_tok->sentencize_from_previous_input($structures);
				293	};
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	294	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	295
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	296	# Tokenize with internal tokenizer
				297	if ($tokenizer_intern) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	298
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	299	# Tokenize and output
				300	$cons_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	301	$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	302	$text_id_esc
Akron	cc27d79	2021-02-24 12:32:20 +0100	[diff] [blame]	303	)->reset;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	304
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	305	$aggr_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	306	$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	307	$text_id_esc
Akron	cc27d79	2021-02-24 12:32:20 +0100	[diff] [blame]	308	)->reset;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	309	};
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	310
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	311	# ~ write structures ~
				312	if (!$structures->empty) {
				313	$structures->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	314	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	315	$text_id_esc,
				316	2 # = structure serialization
Akron	cc27d79	2021-02-24 12:32:20 +0100	[diff] [blame]	317	)->reset;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	318	};
				319
				320	# ~ write tokens ~
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	321	unless ($skip_inline_tokens || $tokens->empty) {
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	322	$tokens->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	323	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	324	$text_id_esc,
				325	$_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
Akron	cc27d79	2021-02-24 12:32:20 +0100	[diff] [blame]	326	)->reset;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	327	};
				328
				329	# reinit.
				330	$dir = '';
				331
				332	# Maybe not necessary
				333	$data->reset;
				334
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	335	next MAIN;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	336	};
				337
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	338
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	339	# ~ whitespace handling ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	340
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	341	# Fix whitespaces (see notes on whitespace fixing)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	342
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	343	# TODO:
				344	# Maybe it's best, to keep the stripping of whitespace and
				345	# to just remove the if-clause and to insert a blank by default
				346	# (with possibly an option on how newlines in primary text should
				347	# be handled (stripped or replaced by a whitespace)).
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	348
				349	# Remove consecutive whitespace at beginning and end (mostly one newline)
				350	s/^\s+//; s/\s+$//;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	351
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	352	# NOTE:
				353	# this is only relevant, if a text consists of more than one line
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	354
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	355	# TODO:
				356	# find a better solution, or create a warning, if a text has more
				357	# than one line ($text_line > 1)
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	358
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	359	# TODO:
				360	# do testing with 2 different corpora
				361	# (one with only one-line texts, the other with several lines per text)
				362
				363	# line contains at least one tag with at least one character contents
				364	if (m/<[^>]+>[^<]/) {
				365
				366	# Increment counter for text lines
				367	$text_line++;
				368
				369	# insert blank before 1st character
Akron	6e2b125	2021-02-24 12:41:15 +0100	[diff] [blame^]	370	# (for 2nd line and consecutive lines)
				371	$_ = ' ' . $_ if $text_line > 1;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	372	}
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	373
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	374	# add line to buffer
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	375	$text_buffer .= $_;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	376	};
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	377
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	378	} elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	379
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	380	# ~ start of header ~
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	381	my $content = "$2\n";
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	382
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	383	if ($1 !~ /^\s*$/) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	384	die $log->fatal(
				385	"input line number $.: " .
				386	'line with opening header tag is not in expected format ... ' .
				387	"=> Aborting (line=$_)");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	388	};
				389
				390	# Parse header (the remaining header lines are read from $input_fh)
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	391	my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	392
				393	# Header was parseable
				394	if ($header) {
				395
				396	# Write header to zip
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	397	my $file = $header->dir . '/' . $header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	398
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	399	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	400
				401	$header->to_zip($zipper->new_stream($file));
				402
				403	# Header is for text level
				404	if ($header->type eq 'text') {
				405
				406	# Remember dir and sigles
				407	$dir = $header->dir;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	408	$text_id_esc = $header->id_esc;
				409
				410	# log output for seeing progression
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	411	$log->notice("$0: text_id=$text_id_esc");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	412
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	413	# Reset counter for text lines
				414	# (needed for whitespace handling)
				415	$text_line = 0;
				416	};
				417	};
				418	};
				419	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	420
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	421	$zipper->close; # finish the zip output
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	422
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	423	$ext_tok->close if $ext_tok; # only set if -tc or -tk was given
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	424
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	425	close $input_fh;
				426
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	427	exit(0); # the subroutine definitions follow below
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	428
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	429
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	430	# Recursively called function to handle XML tree data
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	431	sub descend {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	432
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	433	# recursion level
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	434	# (1 = topmost level inside descend() = should always be level of tag $_TEXT_BODY)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	435	my $depth = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	436
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	437	# Iteration through all array elements
				438	# ($_[0] is a reference to an array reference)
				439	# See notes on how 'XML::CompactTree::XS' works and
				440	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
Akron	3556c75	2021-02-24 09:53:24 +0100	[diff] [blame]	441	foreach $e (@{$_[0]}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	442
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	443	# Element node
				444	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	445
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	446	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	447	# from here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	448	#~~~~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	449
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	450	# Get the child index depending on the debug state.
				451	# This is likely to be optimized away by the compiler.
				452	my $children = $e->[DEBUG ? 5 : 4];
				453
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	454	# $e->[1] represents the tag name
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	455	# Skip sentences
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	456	if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	457	descend($depth+1, $children) if defined $children;
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	458	next;
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	459	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	460
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	461	my $anno = $structures->add_new_annotation($e->[1]);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	462
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	463	# Add element also to token list
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	464	if (!$skip_inline_tokens && $e->[1] eq $_TOKENS_TAG) {
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	465	$tokens->add_annotation($anno);
				466	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	467
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	468	# Handle attributes (if attributes exist)
				469	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	470
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	471	# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
				472	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				473	# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	474	for (local $_ = 0; $_ < @{$e->[3]}; $_ += 2) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	475
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	476	# '$_' references the 'key' and '$_+1' the 'value'
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	477	$anno->add_attribute(
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	478	@{$e->[3]}[$_, $_ + 1]
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	479	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	480	};
				481	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	482
				483	# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	484	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	485
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	486
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	487	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	488	# until here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	489	#~~~~
				490
				491
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	492	# Call function recursively
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	493	# do no recursion, if $children is not defined
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	494	# (because we have no array of child-nodes, e.g.: <back/>)
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	495	descend($depth+1, $children) if defined $children;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	496
				497
				498	#~~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	499	# from here: tag-node (closing)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	500	#~~~~~
				501
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	502	# NOTE: use $pos, because the offsets are _between_ the characters
				503	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	504	my $pos = $data->position;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	505
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	506	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	507
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	508	$from = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	509
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	510	# ~ whitespace related issue ~
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	511	if ($from > 0 && not exists $ws{$from - 1}) {
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	512
				513	# ~ previous node was a text-node ~
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	514	$anno->set_from($from - 1);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	515	};
				516
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	517	# in case this fails, check input
				518	if (($from - 1) > $pos) {
				519	die $log->fatal(
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	520	"text_id='$text_id_esc', " .
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	521	'processing of structures: ' .
				522	"from-value ($from) is 2 or more greater " .
				523	"than to-value ($pos) => please check. Aborting"
				524	);
				525	};
				526
				527	# TODO:
				528	# find example for which this case applies
				529	# maybe this is not necessary anymore, because the
				530	# above recorrection of the from-value suffices
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	531	#
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	532	# TODO:
				533	# check, if it's better to remove this line and
				534	# change above check to 'if ($from - 1) >= $pos;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	535	# do testing with bigger corpus excerpt (wikipedia?)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	536	$anno->set_from($pos) if $from == $pos + 1;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	537	$anno->set_to($pos);
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	538	$anno->set_level($depth);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	539
				540	# Clean up whitespace
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	541	delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	542
				543
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	544	#~~~~
				545	# until here: tag-node (closing)
				546	#~~~~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	547	}
				548
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	549	# Text node
				550	elsif ($e->[0] == XML_READER_TYPE_TEXT){
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	551
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	552	$add_one = 1;
				553	$data->append($e->[1]);
				554	}
				555
				556	# Whitespace node
				557	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				558	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				559
				560	# state, that this from-index belongs to a whitespace-node
				561	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				562	$ws{$data->position}++;
				563
				564	$add_one = 0;
				565	$data->append($e->[1]);
				566	}
				567
				568	# not yet handled type
				569	else {
				570
				571	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				572	};
				573	};
				574	};
				575
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	576
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	577	__END__
				578
				579	=pod
				580
				581	=encoding utf8
				582
				583	=head1 NAME
				584
				585	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				586
				587	=head1 SYNOPSIS
				588
				589	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				590
				591	=head1 DESCRIPTION
				592
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	593	C<tei2korapxml> is a script to convert TEI P5 and
				594	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				595	based documents to the
				596	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				597	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	598	read from C<STDIN>. If no specific output is defined, data is written
				599	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	600
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	601	This program is usually called from inside another script.
				602
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	603	=head1 FORMATS
				604
				605	=head2 Input restrictions
				606
				607	=over 2
				608
				609	=item
				610
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	611	TEI P5 formatted input with certain restrictions:
				612
				613	=over 4
				614
				615	=item
				616
				617	B<mandatory>: text-header with integrated textsigle, text-body
				618
				619	=item
				620
				621	B<optional>: corp-header with integrated corpsigle,
				622	doc-header with integrated docsigle
				623
				624	=back
				625
				626	=item
				627
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	628	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	629	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	630	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	631	into blanks between 2 tokens could lead to additional blanks,
				632	where there should be none (e.g.: punctuation characters like C<,> or
				633	C<.> should not be separated from their predecessor token).
				634	(see also code section C<~ whitespace handling ~>).
				635
				636	=back
				637
				638	=head2 Notes on the output
				639
				640	=over 2
				641
				642	=item
				643
				644	zip file output (default on C<stdout>) with utf8 encoded entries
				645	(which together form the KorAP-XML format)
				646
				647	=back
				648
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	649	=head1 INSTALLATION
				650
				651	C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
				652	these bindings are available, the preferred way to install the script is
				653	to use L<cpanm\|App::cpanminus>.
				654
				655	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				656
				657	In case everything went well, the C<tei2korapxml> tool will
				658	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	659
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	660	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				661
				662	=head1 OPTIONS
				663
				664	=over 2
				665
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	666	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	667
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	668	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	669
				670	=item B<--help\|-h>
				671
				672	Print help information.
				673
				674	=item B<--version\|-v>
				675
				676	Print version information.
				677
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	678	=item B<--tokenizer-call\|-tc>
				679
				680	Call an external tokenizer process, that will tokenize
				681	a single line from STDIN and output one token per line.
				682
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	683	=item B<--tokenizer-korap\|-tk>
				684
				685	Use the standard KorAP/DeReKo tokenizer.
				686
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	687	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	688
				689	Tokenize the data using two embedded tokenizers,
				690	that will take an I<aggressive> and a I<conservative>
				691	approach.
				692
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	693	=item B<--skip-inline-tokens>
				694
				695	Boolean flag indicating that inline tokens should not
				696	be processed. Defaults to false (meaning inline tokens will be processed).
				697
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	698	=item B<--inline-tokens> <foundry>#[<file>]
				699
				700	Define the foundry and file (without extension)
				701	to store inline token information in.
				702	If L</KORAPXMLTEI_INLINE> is set, this will contain
				703	annotations as well.
				704	Defaults to C<tokens> and C<morpho>.
				705
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	706	=item B<--inline-structures> <foundry>#[<file>]
				707
				708	Define the foundry and file (without extension)
				709	to store inline structure information in.
				710	Defaults to C<struct> and C<structures>.
				711
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	712	=item B<--base-foundry> <foundry>
				713
				714	Define the base foundry to store newly generated
				715	token information in.
				716	Defaults to C<base>.
				717
				718	=item B<--data-file> <file>
				719
				720	Define the file (without extension)
				721	to store primary data information in.
				722	Defaults to C<data>.
				723
				724	=item B<--header-file> <file>
				725
				726	Define the file name (without extension)
				727	to store header information on
				728	the corpus, document, and text level in.
				729	Defaults to C<header>.
				730
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	731	=item B<--use-tokenizer-sentence-splits\|-s>
				732
				733	Replace existing with, or add new, sentence boundary information
				734	provided by the KorAP tokenizer (currently the only tokenizer supporting this).
				735
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	736	=item B<--tokens-file> <file>
				737
				738	Define the file (without extension)
				739	to store generated token information in
				740	(either from the KorAP tokenizer or an externally called tokenizer).
				741	Defaults to C<tokens>.
				742
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	743	=item B<--log\|-l>
				744
				745	Loglevel for I<Log::Any>. Defaults to C<notice>.
				746
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	747	=back
				748
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	749	=head1 ENVIRONMENT VARIABLES
				750
				751	=over 2
				752
				753	=item B<KORAPXMLTEI_DEBUG>
				754
				755	Activate minimal debugging.
				756	Defaults to C<false>.
				757
				758	=item B<KORAPXMLTEI_INLINE>
				759
				760	Process inline annotations, if present.
				761	Defaults to C<false>.
				762
				763	=back
				764
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	765	=head1 COPYRIGHT AND LICENSE
				766
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	767	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	768
				769	Author: Peter Harders
				770
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	771	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	772
				773	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				774	Corpus Analysis Platform at the
				775	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				776	member of the
				777	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				778
				779	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	780	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	781
				782	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	783
				784	# NOTES
				785
				786	## Notes on how 'XML::CompactTree::XS' works
				787
				788	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				789
				790	Print out name of 'node2' for the above example:
				791
				792	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				793
				794	Exploring the structure of $data ( = reference to below array ):
				795
				796	[ 0: XML_READER_TYPE_DOCUMENT,
				797	1: ?
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	798	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	799	1: 'node'
				800	2: ?
				801	3: HASH (attributes)
				802	4: 1 (line number)
				803	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				804	1: 'node1'
				805	2: ?
				806	3: undefined (no attributes)
				807	4: 1 (line number)
				808	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				809	1: 'some '
				810	]
				811	1: [ 0: XML_READER_TYPE_ELEMENT
				812	1: 'n'
				813	2: ?
				814	3: undefined (no attributes)
				815	4: 1 (line number)
				816	5: undefined (no child-nodes)
				817	]
				818	2: [ 0: XML_READER_TYPE_TEXT
				819	1: ' text'
				820	]
				821	]
				822	]
				823	1: [ 0: XML_READER_TYPE_ELEMENT
				824	1: 'node2'
				825	2: ?
				826	3: undefined (no attributes)
				827	4: 1 (line number)
				828	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				829	1: 'more-text'
				830	]
				831	]
				832	]
				833	]
				834	]
				835	]
				836	]
				837
				838	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				839
				840	ref($data->[2]) == ARRAY (with 1 element for 'node')
				841	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				842
				843	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				844	$data->[2]->[0]->[1] == 'node'
				845	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				846	$data->[2]->[0]->[4] == 1 (line number)
				847	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	848	# child-nodes of actual node (see $children)
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	849
				850	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				851	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				852	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				853	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				854	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				855	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				856
				857	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				858	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				859	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				860
				861	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				862	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				863	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				864	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				865	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				866	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				867
				868	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				869	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				870	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				871
				872
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	873	descend() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	874	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				875	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				876
				877
				878	## Notes on whitespace handling
				879
				880	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	881	(see function 'descend()').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	882
				883	Definition of significant and insignificant whitespace
				884	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				885
				886	Significant whitespace is part of the document content and should be preserved.
				887	Insignificant whitespace is used when editing XML documents for readability.
				888	These whitespaces are typically not intended for inclusion in the delivery of the document.
				889
				890	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				891
				892	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				893	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				894
				895	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				896	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				897	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				898
				899	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				900
				901
				902	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				903
				904	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				905	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				906
				907	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				908	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				909
				910	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				911	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				912
				913	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				914	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				915	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				916	the last read 'non-tag'-node has to be corrected (see [1]).
				917
				918	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				919	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				920
				921	[1]
				922	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				923	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	924	(see above code fragment '... not exists $ws{ $from - 1 } ...').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	925
				926	[2]
				927	In both of the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled as a
				928	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				929
				930	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				931	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				932
				933	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
				934	and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
				935
				936
				937	## Notes on whitespace fixing
				938
				939	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				940	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				941
				942	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				943	example further down and notes on 'Input restrictions' in the manpage).
				944
				945	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				946
				947	Examples (how primary text with linebreaks would be converted by below code):
				948
				949	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				950	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
				951
				952	Blanks are inserted before the 1st character:
				953
				954	NOTE: not stringent ('...' stands for text):
				955
				956	beg1............................end1 => no blank before 'beg1'
				957	beg2....<pb/>...................end2 => no blank before 'beg2'
				958	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				959	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				960
				961	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				962	^
				963	\|_blank between 'end3' and 'beg4'
				964
				965
				966	## Notes on segfault prevention
				967
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	968	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	969	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				970	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				971	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				972	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.