Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: b0318f8afc6c61bb3b41649503ad6499c2a9ae14 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	31	eval {
				32	require KorAP::XML::TEI::Tokenizer::KorAP;
				33	1;
				34	};
Peter Harders	1c5ce15	2020-07-22 18:02:50 +0200	[diff] [blame]	35
# Version of this script; embedded in the message printed by the
# --help and --version handlers.
our $VERSION = '1.01';

our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

use constant {
  # Set to 1 for minimal more debug output (no need to be parametrized)
  DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0,

  # Flags passed to XML::CompactTree::XS::readSubtreeToPerl().
  # The individual flags are combined with bitwise OR.
  # XCT_LINE_NUMBERS is only needed for debugging
  # (see XML::CompactTree::XS)
  XCT_PARAM => (
    XCT_DOCUMENT_ROOT
      | XCT_IGNORE_COMMENTS
      | XCT_ATTRIBUTE_ARRAY
      | ($ENV{KORAPXMLTEI_DEBUG} ? XCT_LINE_NUMBERS : 0)
  )
};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	53
# Parse options from the command line.
# Every option is bound to a freshly declared lexical that carries its
# default value; "long|short" specs declare an alias for the same option.
# NOTE(review): the return value of GetOptions() is not checked, so the
# script continues after an unknown option - confirm this is intended.
GetOptions(
  'root|r=s'                        => \(my $root_dir = '.'),
  'input|i=s'                       => \(my $input_fname = ''),
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s'                 => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'             => \(my $inline_structures = 'struct#structure'),
  'skip-inline-tokens'              => \(my $skip_inline_tokens = 0),
  'base-foundry=s'                  => \(my $base_dir = 'base'),
  'data-file=s'                     => \(my $data_file = 'data'),
  'header-file=s'                   => \(my $header_file = 'header'),
  'tokens-file=s'                   => \(my $tokens_file = 'tokens'),
  'log|l=s'                         => \(my $log_level = 'notice'),

  # Print the selected POD sections together with the version message
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version message only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
				86
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	87
# Establish logger (messages go to STDERR, encoded as UTF-8)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';


# Define tokenizers
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
};

# External tokenization
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally (inside an eval),
  # so make sure it is really available before instantiating it.
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal(
      'KorAP tokenizer is not available ' .
      '(KorAP::XML::TEI::Tokenizer::KorAP could not be loaded)'
    );
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};


# Internal tokenization
# (both tokenizers are always instantiated; they are only applied
# if $tokenizer_intern is set)
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	127
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	128
# Name of the directory and the file containing all inline structure information
# except for $_TOKENS_TAG information.
# The option value has the form 'dir#file'; appending '#structure' supplies
# the default file name when only a directory was given.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';

# Name of the directory and the file containing all inline token information,
# i.e. tokens of the $_TOKENS_TAG (written unless $skip_inline_tokens is set).
# Same 'dir#file' convention as above, with 'morpho' as default file name.
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';

# Handling inline annotations (inside $_TOKENS_TAG)
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;

# Initialize Token- and Structure-Collector
my $tokens = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);


# text directory (below $root_dir)
my $dir = '';

# Escaped version of text id
my $text_id_esc;

# element from $tree_data
my $e;

# Keeping track of the current positions in the text
my $pos;

# Default encoding of the text
my $input_enc = 'UTF-8';

# variables for handling ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $from = 0;

# text line (needed for whitespace handling)
my $text_line = 0;

# hash for indices of whitespace-nodes
# (needed to recorrect from-values)
# IDEA:
# when closing an element, check if its from-index minus 1 refers to a
# whitespace-node (means: 'from-index - 1' is a key in %ws).
# if this is _not_ the case, then the from-value is one
# too high => correct it by subtracting 1
my %ws;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	182
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	183
# Input file handle (default: stdin)
my $input_fh = *STDIN;

if ($input_fname ne '') {

  # Open the input file on its own lexical handle instead of re-opening
  # the global STDIN glob, so STDIN itself is left untouched.
  open(my $file_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");

  $input_fh = $file_fh;
};

# Prevents segfaulting (see notes on segfault prevention).
# The handle is read as raw bytes; lines are decoded explicitly later on.
binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	195
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	196
# Reading input document line by line.
# Three kinds of lines are handled specially:
#  - the XML declaration (sets the input encoding),
#  - the opening tag of a header section (parsed and written to the zip),
#  - the opening tag of a text body (all lines up to the closing tag are
#    collected, parsed and written as data/structure/token files).
MAIN: while (<$input_fh>) {

  # remove HTML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding
  # (the value is everything up to the quote character that opened it)
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Save the captures before the next match overwrites them
    my ($prefix, $suffix) = ($1, $2);

    # Only whitespace is allowed around the opening tag
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    # NOTE(review): if the input ends before the closing tag is seen,
    # the collected buffer is dropped without any message - confirm.
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and possibly morpho.xml
        # and/or tokenization files

        # Only whitespace is allowed around the closing tag
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text-level header was seen before this text body
        if ($dir eq '') {
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
            'data=' . substr($data->data, 0, 200)
          );
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$text_buffer</text>",
          huge => 1
        );

        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, XCT_PARAM);

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # Recursively parse all children
        descend(1, $tree_data->[2]);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          )->reset;
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          )->reset;
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };
  }

  # Start of header section
  elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # Save the captures before the next match overwrites them
    my ($prefix, $content) = ($1, "$2\n");

    # Only whitespace is allowed in front of the opening header tag
    if ($prefix !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
        'line with opening header tag is not in expected format ... ' .
        "=> Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};

# Finish the zip output, shut down the external tokenizer (if any)
# and release the input handle
$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;

exit(0);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	423
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	424
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	425	# Recursively called function to handle XML tree data
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	426	sub descend {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	427
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	428	# recursion level
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	429	# (1 = topmost level inside descend() = should always be level of tag $_TEXT_BODY)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	430	my $depth = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	431
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	432	# Iteration through all array elements
				433	# ($_[0] is a reference to an array reference)
				434	# See notes on how 'XML::CompactTree::XS' works and
				435	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
Akron	3556c75	2021-02-24 09:53:24 +0100	[diff] [blame]	436	foreach $e (@{$_[0]}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	437
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	438	# $e->[1] represents the tag name of an element node
				439	# or the primary data of a text or ws node
				440	my $node_info = $e->[1];
				441
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	442	# Element node
				443	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	444
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	445	# Deal with opening tag
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	446
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	447	# Get the child index depending on the debug state.
				448	# This is likely to be optimized away by the compiler.
				449	my $children = $e->[DEBUG ? 5 : 4];
				450
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	451	# Skip sentences
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	452	if ($use_tokenizer_sentence_splits && $node_info eq 's') {
				453	descend($depth + 1, $children) if defined $children;
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	454	next;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	455	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	456
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	457	my $anno = $structures->add_new_annotation($node_info);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	458
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	459	# Add element also to token list
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	460	if (!$skip_inline_tokens && $node_info eq $_TOKENS_TAG) {
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	461	$tokens->add_annotation($anno);
				462	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	463
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	464	# Handle attributes (if attributes exist)
				465	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	466
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	467	# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	468	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				469	# NOTE:
				470	# arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	471	for (local $_ = 0; $_ < @{$e->[3]}; $_ += 2) {
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	472	$anno->add_attribute(
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	473	@{$e->[3]}[$_, $_ + 1]
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	474	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	475	};
				476	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	477
				478	# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	479	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	480
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	481
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	482	# Call function recursively
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	483	# do no recursion, if $children is not defined
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	484	# (because we have no array of child-nodes, e.g.: <back/>)
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	485	descend($depth+1, $children) if defined $children;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	486
				487
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	488	# Deal with closing tag
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	489
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	490	# NOTE:
				491	# use $pos, because the offsets are _between_ the characters
				492	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	493	my $pos = $data->position;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	494
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	495	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	496
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	497	$from = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	498
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	499	# ~ whitespace related issue ~
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	500	if ($from > 0 && not exists $ws{$from - 1}) {
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	501
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	502	# Previous node was a text-node
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	503	$anno->set_from($from - 1);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	504	};
				505
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	506	# in case this fails, check input
				507	if (($from - 1) > $pos) {
				508	die $log->fatal(
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	509	"text_id='$text_id_esc', " .
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	510	'processing of structures: ' .
				511	"from-value ($from) is 2 or more greater " .
				512	"than to-value ($pos) => please check. Aborting"
				513	);
				514	};
				515
				516	# TODO:
				517	# find example for which this case applies
				518	# maybe this is not necessary anymore, because the
				519	# above recorrection of the from-value suffices
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	520	#
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	521	# TODO:
				522	# check, if it's better to remove this line and
				523	# change above check to 'if ($from - 1) >= $pos;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	524	# do testing with bigger corpus excerpt (wikipedia?)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	525	$anno->set_from($pos) if $from == $pos + 1;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	526	$anno->set_to($pos);
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	527	$anno->set_level($depth);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	528
				529	# Clean up whitespace
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	530	delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	531	}
				532
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	533	# Text node
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	534	elsif ($e->[0] == XML_READER_TYPE_TEXT) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	535
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	536	$add_one = 1;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	537	$data->append($node_info);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	538	}
				539
				540	# Whitespace node
				541	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				542	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				543
				544	# state, that this from-index belongs to a whitespace-node
				545	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				546	$ws{$data->position}++;
				547
				548	$add_one = 0;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame^]	549	$data->append($node_info);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	550	}
				551
				552	# not yet handled type
				553	else {
				554
				555	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				556	};
				557	};
				558	};
				559
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	560
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	561	__END__
				562
				563	=pod
				564
				565	=encoding utf8
				566
				567	=head1 NAME
				568
				569	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				570
				571	=head1 SYNOPSIS
				572
				573	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				574
				575	=head1 DESCRIPTION
				576
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	577	C<tei2korapxml> is a script to convert TEI P5 and
				578	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				579	based documents to the
				580	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				581	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	582	read from C<STDIN>. If no specific output is defined, data is written
				583	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	584
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	585	This program is usually called from inside another script.
				586
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	587	=head1 FORMATS
				588
				589	=head2 Input restrictions
				590
				591	=over 2
				592
				593	=item
				594
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	595	TEI P5 formatted input with certain restrictions:
				596
				597	=over 4
				598
				599	=item
				600
				601	B<mandatory>: text-header with integrated textsigle, text-body
				602
				603	=item
				604
				605	B<optional>: corp-header with integrated corpsigle,
				606	doc-header with integrated docsigle
				607
				608	=back
				609
				610	=item
				611
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	612	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	613	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	614	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	615	into blanks between 2 tokens could lead to additional blanks,
				616	where there should be none (e.g.: punctuation characters like C<,> or
				617	C<.> should not be separated from their predecessor token).
				618	(see also code section C<~ whitespace handling ~>).
				619
				620	=back
				621
				622	=head2 Notes on the output
				623
				624	=over 2
				625
				626	=item
				627
				628	zip file output (default on C<stdout>) with utf8 encoded entries
				629	(which together form the KorAP-XML format)
				630
				631	=back
				632
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	633	=head1 INSTALLATION
				634
				635	C<tei2korapxml> requires C<libxml2-dev> bindings to build. When
				636	these bindings are available, the preferred way to install the script is
				637	to use L<cpanm\|App::cpanminus>.
				638
				639	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				640
				641	In case everything went well, the C<tei2korapxml> tool will
				642	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	643
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	644	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				645
				646	=head1 OPTIONS
				647
				648	=over 2
				649
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	650	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	651
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	652	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	653
				654	=item B<--help\|-h>
				655
				656	Print help information.
				657
				658	=item B<--version\|-v>
				659
				660	Print version information.
				661
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	662	=item B<--tokenizer-call\|-tc>
				663
				664	Call an external tokenizer process that tokenizes
				665	a single line from STDIN and outputs one token per line.
				666
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	667	=item B<--tokenizer-korap\|-tk>
				668
				669	Use the standard KorAP/DeReKo tokenizer.
				670
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	671	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	672
				673	Tokenize the data using two embedded tokenizers,
				674	that will take an I<aggressive> and a I<conservative>
				675	approach.
				676
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	677	=item B<--skip-inline-tokens>
				678
				679	Boolean flag indicating that inline tokens should not
				680	be processed. Defaults to false (meaning inline tokens will be processed).
				681
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	682	=item B<--inline-tokens> <foundry>#[<file>]
				683
				684	Define the foundry and file (without extension)
				685	to store inline token information in.
				686	If L</KORAPXMLTEI_INLINE> is set, this will contain
				687	annotations as well.
				688	Defaults to C<tokens> and C<morpho>.
				689
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	690	=item B<--inline-structures> <foundry>#[<file>]
				691
				692	Define the foundry and file (without extension)
				693	to store inline structure information in.
				694	Defaults to C<struct> and C<structures>.
				695
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	696	=item B<--base-foundry> <foundry>
				697
				698	Define the base foundry to store newly generated
				699	token information in.
				700	Defaults to C<base>.
				701
				702	=item B<--data-file> <file>
				703
				704	Define the file (without extension)
				705	to store primary data information in.
				706	Defaults to C<data>.
				707
				708	=item B<--header-file> <file>
				709
				710	Define the file name (without extension)
				711	to store header information on
				712	the corpus, document, and text level in.
				713	Defaults to C<header>.
				714
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	715	=item B<--use-tokenizer-sentence-splits\|-s>
				716
				717	Replace existing sentence boundary information with, or add new
				718	sentence boundary information from, the KorAP tokenizer (currently the only tokenizer supported for this).
				719
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	720	=item B<--tokens-file> <file>
				721
				722	Define the file (without extension)
				723	to store generated token information in
				724	(either from the KorAP tokenizer or an externally called tokenizer).
				725	Defaults to C<tokens>.
				726
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	727	=item B<--log\|-l>
				728
				729	Loglevel for I<Log::Any>. Defaults to C<notice>.
				730
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	731	=back
				732
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	733	=head1 ENVIRONMENT VARIABLES
				734
				735	=over 2
				736
				737	=item B<KORAPXMLTEI_DEBUG>
				738
				739	Activate minimal debugging.
				740	Defaults to C<false>.
				741
				742	=item B<KORAPXMLTEI_INLINE>
				743
				744	Process inline annotations, if present.
				745	Defaults to C<false>.
				746
				747	=back
				748
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	749	=head1 COPYRIGHT AND LICENSE
				750
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	751	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	752
				753	Author: Peter Harders
				754
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	755	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	756
				757	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				758	Corpus Analysis Platform at the
				759	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				760	member of the
				761	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				762
				763	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	764	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	765
				766	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	767
				768	# NOTES
				769
				770	## Notes on how 'XML::CompactTree::XS' works
				771
				772	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				773
				774	Print out name of 'node2' for the above example:
				775
				776	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				777
				778	Exploring the structure of $data ( = reference to below array ):
				779
				780	[ 0: XML_READER_TYPE_DOCUMENT,
				781	1: ?
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	782	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	783	1: 'node'
				784	2: ?
				785	3: HASH (attributes)
				786	4: 1 (line number)
				787	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				788	1: 'node1'
				789	2: ?
				790	3: undefined (no attributes)
				791	4: 1 (line number)
				792	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				793	1: 'some '
				794	]
				795	1: [ 0: XML_READER_TYPE_ELEMENT
				796	1: 'n'
				797	2: ?
				798	3: undefined (no attributes)
				799	4: 1 (line number)
				800	5: undefined (no child-nodes)
				801	]
				802	2: [ 0: XML_READER_TYPE_TEXT
				803	1: ' text'
				804	]
				805	]
				806	]
				807	1: [ 0: XML_READER_TYPE_ELEMENT
				808	1: 'node2'
				809	2: ?
				810	3: undefined (no attributes)
				811	4: 1 (line number)
				812	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				813	1: 'more-text'
				814	]
				815	]
				816	]
				817	]
				818	]
				819	]
				820	]
				821
				822	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				823
				824	ref($data->[2]) == ARRAY (with 1 element for 'node')
				825	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				826
				827	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				828	$data->[2]->[0]->[1] == 'node'
				829	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				830	$data->[2]->[0]->[4] == 1 (line number)
				831	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	832	# child-nodes of actual node (see $children)
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	833
				834	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				835	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				836	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				837	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				838	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				839	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				840
				841	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				842	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				843	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				844
				845	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				846	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				847	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				848	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				849	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				850	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				851
				852	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				853	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				854	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				855
				856
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	857	descend() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	858	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				859	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				860
				861
				862	## Notes on whitespace handling
				863
				864	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	865	(see function 'descend()').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	866
				867	Definition of significant and insignificant whitespace
				868	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				869
				870	Significant whitespace is part of the document content and should be preserved.
				871	Insignificant whitespace is used when editing XML documents for readability.
				872	These whitespaces are typically not intended for inclusion in the delivery of the document.
				873
				874	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				875
				876	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				877	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				878
				879	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				880	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				881	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				882
				883	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				884
				885
				886	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				887
				888	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				889	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				890
				891	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				892	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				893
				894	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				895	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				896
				897	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				898	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				899	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				900	the last read 'non-tag'-node has to be corrected (see [1]).
				901
				902	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				903	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				904
				905	[1]
				906	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				907	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	908	(see above code fragment '... not exists $ws{ $from - 1 } ...').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	909
				910	[2]
				911	Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled in both cases as a
				912	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				913
				914	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				915	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				916
				917	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
				918	and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
				919
				920
				921	## Notes on whitespace fixing
				922
				923	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				924	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				925
				926	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				927	example further down and notes on 'Input restrictions' in the manpage).
				928
				929	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				930
				931	Examples (how primary text with linebreaks would be converted by below code):
				932
				933	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				934	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
				935
				936	Blanks are inserted before the 1st character:
				937
				938	NOTE: not stringent ('...' stands for text):
				939
				940	beg1............................end1 => no blank before 'beg1'
				941	beg2....<pb/>...................end2 => no blank before 'beg2'
				942	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				943	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				944
				945	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				946	^
				947	\|_blank between 'end3' and 'beg4'
				948
				949
				950	## Notes on segfault prevention
				951
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	952	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	953	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				954	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				955	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				956	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.