Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: fb58d8e2794fb13ceed04d6699c7a5d77b7ed069 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	31	eval {
				32	require KorAP::XML::TEI::Tokenizer::KorAP;
				33	1;
				34	};
Peter Harders	1c5ce15	2020-07-22 18:02:50 +0200	[diff] [blame]	35
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	36	our $VERSION = '1.01';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	37
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	38	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				39
use constant {

  # Debug switch, taken from the environment variable KORAPXMLTEI_DEBUG
  # (defaults to 0; deliberately not exposed as a command line option)
  DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0,

  # Flags handed to XML::CompactTree::XS::readSubtreeToPerl().
  # The flags are combined with a plain bitwise OR.
  # XCT_LINE_NUMBERS is only requested while debugging
  # (see XML::CompactTree::XS); descend() reads the child list from
  # a different index depending on DEBUG.
  XCT_PARAM => (
    XCT_DOCUMENT_ROOT
      | XCT_IGNORE_COMMENTS
      | XCT_ATTRIBUTE_ARRAY
      | ($ENV{KORAPXMLTEI_DEBUG} ? XCT_LINE_NUMBERS : 0)
  )
};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	53
# Parse options from the command line.
# Each option is bound to a lexical declared in place, which also carries
# the default value. Long and short names are separated by a plain '|'
# (a backslash in front of it would become part of the option name).
GetOptions(
  'root|r=s'                        => \(my $root_dir = '.'),
  'input|i=s'                       => \(my $input_fname = ''),
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s'                 => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'             => \(my $inline_structures = 'struct#structure'),
  'skip-inline-tokens'              => \(my $skip_inline_tokens = 0),
  'base-foundry=s'                  => \(my $base_dir = 'base'),
  'data-file=s'                     => \(my $data_file = 'data'),
  'header-file=s'                   => \(my $header_file = 'header'),
  'tokens-file=s'                   => \(my $tokens_file = 'tokens'),
  'log|l=s'                         => \(my $log_level = 'notice'),

  # Print the selected POD sections, preceded by the version message
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version message
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
				86
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	87
# Set up logging: STDERR is switched to UTF-8 and becomes the target
# of the Log::Any adapter, filtered by the level given on the command line
binmode STDERR, ':encoding(UTF-8)';
Log::Any::Adapter->set('Stderr', log_level => $log_level);

if (DEBUG) {
  $log->notice('Debugging is activated');
}


# Name of the tag (without attributes) that wraps the primary text
my $_TEXT_BODY = 'text';

# Name of the header tag
# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# Name of the tag whose annotations are additionally collected as tokens
my $_TOKENS_TAG = 'w';
				103
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	104
				105	# Define tokenizers
# Sentence splits can only be requested together with the KorAP tokenizer
if ($use_tokenizer_sentence_splits and not $tokenizer_korap) {
  die $log->fatal(
    'Sentence splitting is currently only supported by KorAP tokenizer ' .
    '(use -tk to activate it)'
  );
}

# External tokenization:
# an explicit tokenizer call takes precedence over the KorAP tokenizer;
# without either option $ext_tok stays undefined
my $ext_tok =
    $tokenizer_call  ? KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call)
  : $tokenizer_korap ? KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits)
  :                    undef;


# Internal tokenization:
# both tokenizers are always created, they are only used with -ti
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new();
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new();
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	127
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	128
# Directory and file name for all inline structure information
# (given as 'dir#file'; the file part falls back to 'structure')
my ($_structure_dir, $_structure_file) =
  split(/#/, "${inline_structures}#structure");

# Directory and file name for all inline token information,
# i.e. the annotations of $_TOKENS_TAG
# (given as 'dir#file'; the file part falls back to 'morpho')
my ($_tokens_dir, $_tokens_file) =
  split(/#/, "${inline_tokens}#morpho");

# Serialization mode for tokens, switched on by the environment variable
# KORAPXMLTEI_INLINE (1 = with inline annotations, 0 = without)
my $_INLINE_ANNOT = 0;
$_INLINE_ANNOT = 1 if $ENV{KORAPXMLTEI_INLINE};

# Collectors for token and structure annotations
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new();
my $structures = KorAP::XML::TEI::Annotations::Collector->new();

# Collector for the primary data
my $data = KorAP::XML::TEI::Data->new();

# Zipper that hands out the output streams, created for $root_dir
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	149
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	150
# Directory of the current text (below $root_dir);
# empty as long as no text header has been seen
my $dir = '';

# Escaped version of the current text id
my $text_id_esc;

# Current node of the parsed tree (loop variable of descend())
my $e;

# Encoding of the input; may be overridden by the XML declaration
my $input_enc = 'UTF-8';

# ~ whitespace related issue ~
# Offset added to the from-value of a newly opened element
# (it is sometimes necessary to correct the from-values of some tags)
my $add_one;

# Counter for text lines (needed for whitespace handling)
my $text_line = 0;

# Indices of whitespace nodes (needed to correct from-values).
# IDEA:
#   When an element is closed, check whether its from-index minus 1
#   refers to a whitespace node (i.e. 'from-index - 1' is a key in %ws).
#   If this is _not_ the case, the from-value is one too high
#   and is corrected by subtracting 1.
my %ws;


# Input file handle (default: stdin)
my $input_fh = *STDIN;

if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# Read the input as raw bytes; decoding happens explicitly per line.
# According to the original note this prevents segfaulting
# (see notes on segfault prevention)
binmode($input_fh);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	191
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	192
# Reading input document.
#
# The input is scanned line by line. Three kinds of lines trigger work:
#   * the XML declaration, which sets the input encoding,
#   * the opening tag of the text body ($_TEXT_BODY): all following lines
#     up to the closing tag are buffered, parsed and written to the zip,
#   * the opening tag of a header ($_HEADER_TAG with a type attribute):
#     the header is parsed and written to the zip; a header of type 'text'
#     also sets $dir and $text_id_esc for the text body that follows.
# All other lines are ignored.
MAIN: while (<$input_fh>) {

  # Remove (possibly multi-line) XML comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding from the XML declaration.
  # NOTE: inside a character class '\1' is not a backreference, so the
  # value is matched lazily up to the quote that opened it instead.
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  # (the cheap index() check avoids running the regex on most lines)
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Save the captures before any other match can overwrite them
    my ($prefix, $suffix) = ($1, $2);

    # Only whitespace is accepted around the opening tag
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if ((my $pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # Only whitespace is accepted around the closing tag
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was seen before this body => nowhere to write to
        if ($dir eq '') {
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($data->data, 0, 200)
            );
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$text_buffer</text>",
          huge => 1
        );

        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, XCT_PARAM);

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # Recursively parse all children
        descend(1, $tree_data->[2]);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          # Let the tokenizer add its sentence boundaries to the structures
          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          )->reset;
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          )->reset;
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };
  }

  # Start of header section
  elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # Save the captures before any other match can overwrite them
    my $prefix  = $1;
    my $content = "$2\n";

    # Only whitespace is accepted in front of the opening tag
    if ($prefix !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };

    # Parse header (the parser continues reading from $input_fh)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	411
# Finish the zip output
$zipper->close;

# Shut down the external tokenizer, if one was started
if ($ext_tok) {
  $ext_tok->close;
}

close($input_fh);

exit 0;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	419
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	420
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	421	# Recursively called function to handle XML tree data
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	422	sub descend {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	423
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	424	# recursion level
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	425	# (1 = topmost level inside descend() = should always be level of tag $_TEXT_BODY)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	426	my $depth = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	427
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	428	# Iteration through all array elements
				429	# ($_[0] is a reference to an array reference)
				430	# See notes on how 'XML::CompactTree::XS' works and
				431	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
Akron	3556c75	2021-02-24 09:53:24 +0100	[diff] [blame]	432	foreach $e (@{$_[0]}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	433
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	434	# $e->[1] represents the tag name of an element node
				435	# or the primary data of a text or ws node
				436	my $node_info = $e->[1];
				437
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	438	# Element node
				439	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	440
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	441	# Deal with opening tag
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	442
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	443	# Get the child index depending on the debug state.
				444	# This is likely to be optimized away by the compiler.
				445	my $children = $e->[DEBUG ? 5 : 4];
				446
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	447	# Skip sentences
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	448	if ($use_tokenizer_sentence_splits && $node_info eq 's') {
				449	descend($depth + 1, $children) if defined $children;
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	450	next;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	451	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	452
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	453	my $anno = $structures->add_new_annotation($node_info);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	454
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	455	# Add element also to token list
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	456	if (!$skip_inline_tokens && $node_info eq $_TOKENS_TAG) {
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	457	$tokens->add_annotation($anno);
				458	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	459
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	460	# Handle attributes (if attributes exist)
				461	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	462
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	463	# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	464	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				465	# NOTE:
				466	# arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	467	for (local $_ = 0; $_ < @{$e->[3]}; $_ += 2) {
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	468	$anno->add_attribute(
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	469	@{$e->[3]}[$_, $_ + 1]
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	470	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	471	};
				472	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	473
				474	# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	475	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	476
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	477
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	478	# Call function recursively
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	479	# do no recursion, if $children is not defined
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	480	# (because we have no array of child-nodes, e.g.: <back/>)
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	481	descend($depth+1, $children) if defined $children;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	482
				483
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	484	# Deal with closing tag
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	485
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	486	# NOTE:
				487	# use $pos, because the offsets are _between_ the characters
				488	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	489	my $pos = $data->position;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	490
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	491	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	492
Akron	b43b491	2021-02-25 10:31:11 +0100	[diff] [blame^]	493	my $from = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	494
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	495	# ~ whitespace related issue ~
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	496	if ($from > 0 && not exists $ws{$from - 1}) {
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	497
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	498	# Previous node was a text-node
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	499	$anno->set_from($from - 1);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	500	};
				501
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	502	# in case this fails, check input
				503	if (($from - 1) > $pos) {
				504	die $log->fatal(
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	505	"text_id='$text_id_esc', " .
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	506	'processing of structures: ' .
				507	"from-value ($from) is 2 or more greater " .
				508	"than to-value ($pos) => please check. Aborting"
				509	);
				510	};
				511
				512	# TODO:
				513	# find example for which this case applies
				514	# maybe this is not necessary anymore, because the
				515	# above recorrection of the from-value suffices
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	516	#
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	517	# TODO:
				518	# check, if it's better to remove this line and
				519	# change above check to 'if ($from - 1) >= $pos;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	520	# do testing with bigger corpus excerpt (wikipedia?)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	521	$anno->set_from($pos) if $from == $pos + 1;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	522	$anno->set_to($pos);
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	523	$anno->set_level($depth);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	524
				525	# Clean up whitespace
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	526	delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	527	}
				528
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	529	# Text node
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	530	elsif ($e->[0] == XML_READER_TYPE_TEXT) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	531
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	532	$add_one = 1;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	533	$data->append($node_info);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	534	}
				535
				536	# Whitespace node
				537	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				538	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				539
				540	# state, that this from-index belongs to a whitespace-node
				541	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				542	$ws{$data->position}++;
				543
				544	$add_one = 0;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	545	$data->append($node_info);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	546	}
				547
				548	# not yet handled type
				549	else {
				550
				551	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				552	};
				553	};
				554	};
				555
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	556
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	557	__END__
				558
				559	=pod
				560
				561	=encoding utf8
				562
				563	=head1 NAME
				564
				565	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				566
				567	=head1 SYNOPSIS
				568
				569	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				570
				571	=head1 DESCRIPTION
				572
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	573	C<tei2korapxml> is a script to convert TEI P5 and
				574	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				575	based documents to the
				576	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				577	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	578	read from C<STDIN>. If no specific output is defined, data is written
				579	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	580
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	581	This program is usually called from inside another script.
				582
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	583	=head1 FORMATS
				584
				585	=head2 Input restrictions
				586
				587	=over 2
				588
				589	=item
				590
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	591	TEI P5 formatted input with certain restrictions:
				592
				593	=over 4
				594
				595	=item
				596
				597	B<mandatory>: text-header with integrated textsigle, text-body
				598
				599	=item
				600
				601	B<optional>: corp-header with integrated corpsigle,
				602	doc-header with integrated docsigle
				603
				604	=back
				605
				606	=item
				607
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	608	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	609	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	610	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	611	into blanks between 2 tokens could lead to additional blanks,
				612	where there should be none (e.g.: punctuation characters like C<,> or
				613	C<.> should not be separated from their predecessor token).
				614	(see also code section C<~ whitespace handling ~>).
				615
				616	=back
				617
				618	=head2 Notes on the output
				619
				620	=over 2
				621
				622	=item
				623
				624	zip file output (default on C<stdout>) with utf8 encoded entries
				625	(which together form the KorAP-XML format)
				626
				627	=back
				628
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	629	=head1 INSTALLATION
				630
				631	C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
				632	these bindings are available, the preferred way to install the script is
				633	to use L<cpanm\|App::cpanminus>.
				634
				635	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				636
				637	In case everything went well, the C<tei2korapxml> tool will
				638	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	639
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	640	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				641
				642	=head1 OPTIONS
				643
				644	=over 2
				645
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	646	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	647
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	648	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	649
				650	=item B<--help\|-h>
				651
				652	Print help information.
				653
				654	=item B<--version\|-v>
				655
				656	Print version information.
				657
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	658	=item B<--tokenizer-call\|-tc>
				659
				660	Call an external tokenizer process, that will tokenize
				661	a single line from STDIN and output one token per line.
				662
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	663	=item B<--tokenizer-korap\|-tk>
				664
				665	Use the standard KorAP/DeReKo tokenizer.
				666
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	667	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	668
				669	Tokenize the data using two embedded tokenizers,
				670	that will take an I<aggressive> and a I<conservative>
				671	approach.
				672
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	673	=item B<--skip-inline-tokens>
				674
				675	Boolean flag indicating that inline tokens should not
				676	be processed. Defaults to false (meaning inline tokens will be processed).
				677
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	678	=item B<--inline-tokens> <foundry>#[<file>]
				679
				680	Define the foundry and file (without extension)
				681	to store inline token information in.
				682	If L</KORAPXMLTEI_INLINE> is set, this will contain
				683	annotations as well.
				684	Defaults to C<tokens> and C<morpho>.
				685
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	686	=item B<--inline-structures> <foundry>#[<file>]
				687
				688	Define the foundry and file (without extension)
				689	to store inline structure information in.
				690	Defaults to C<struct> and C<structures>.
				691
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	692	=item B<--base-foundry> <foundry>
				693
				694	Define the base foundry to store newly generated
				695	token information in.
				696	Defaults to C<base>.
				697
				698	=item B<--data-file> <file>
				699
				700	Define the file (without extension)
				701	to store primary data information in.
				702	Defaults to C<data>.
				703
				704	=item B<--header-file> <file>
				705
				706	Define the file name (without extension)
				707	to store header information on
				708	the corpus, document, and text level in.
				709	Defaults to C<header>.
				710
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	711	=item B<--use-tokenizer-sentence-splits\|-s>
				712
				713	Replace existing with, or add new, sentence boundary information
				714	provided by the KorAP tokenizer (currently supported only).
				715
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	716	=item B<--tokens-file> <file>
				717
				718	Define the file (without extension)
				719	to store generated token information in
				720	(either from the KorAP tokenizer or an externally called tokenizer).
				721	Defaults to C<tokens>.
				722
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	723	=item B<--log\|-l>
				724
				725	Loglevel for I<Log::Any>. Defaults to C<notice>.
				726
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	727	=back
				728
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	729	=head1 ENVIRONMENT VARIABLES
				730
				731	=over 2
				732
				733	=item B<KORAPXMLTEI_DEBUG>
				734
				735	Activate minimal debugging.
				736	Defaults to C<false>.
				737
				738	=item B<KORAPXMLTEI_INLINE>
				739
				740	Process inline annotations, if present.
				741	Defaults to C<false>.
				742
				743	=back
				744
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	745	=head1 COPYRIGHT AND LICENSE
				746
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	747	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	748
				749	Author: Peter Harders
				750
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	751	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	752
				753	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				754	Corpus Analysis Platform at the
				755	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				756	member of the
				757	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				758
				759	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	760	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	761
				762	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	763
				764	# NOTES
				765
				766	## Notes on how 'XML::CompactTree::XS' works
				767
				768	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				769
				770	Print out name of 'node2' for the above example:
				771
				772	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				773
				774	Exploring the structure of $data ( = reference to below array ):
				775
				776	[ 0: XML_READER_TYPE_DOCUMENT,
				777	1: ?
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	778	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	779	1: 'node'
				780	2: ?
				781	3: HASH (attributes)
				782	4: 1 (line number)
				783	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				784	1: 'node1'
				785	2: ?
				786	3: undefined (no attributes)
				787	4: 1 (line number)
				788	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				789	1: 'some '
				790	]
				791	1: [ 0: XML_READER_TYPE_ELEMENT
				792	1: 'n'
				793	2: ?
				794	3: undefined (no attributes)
				795	4: 1 (line number)
				796	5: undefined (no child-nodes)
				797	]
				798	2: [ 0: XML_READER_TYPE_TEXT
				799	1: ' text'
				800	]
				801	]
				802	]
				803	1: [ 0: XML_READER_TYPE_ELEMENT
				804	1: 'node2'
				805	2: ?
				806	3: undefined (no attributes)
				807	4: 1 (line number)
				808	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				809	1: 'more-text'
				810	]
				811	]
				812	]
				813	]
				814	]
				815	]
				816	]
				817
				818	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				819
				820	ref($data->[2]) == ARRAY (with 1 element for 'node')
				821	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				822
				823	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				824	$data->[2]->[0]->[1] == 'node'
				825	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				826	$data->[2]->[0]->[4] == 1 (line number)
				827	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	828	# child-nodes of actual node (see $children)
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	829
				830	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				831	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				832	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				833	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				834	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				835	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				836
				837	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				838	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				839	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				840
				841	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				842	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				843	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				844	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				845	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				846	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				847
				848	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				849	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				850	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				851
				852
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	853	descend() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	854	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				855	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				856
				857
				858	## Notes on whitespace handling
				859
				860	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	861	(see function 'descend()').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	862
				863	Definition of significant and insignificant whitespace
				864	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				865
				866	Significant whitespace is part of the document content and should be preserved.
				867	Insignificant whitespace is used when editing XML documents for readability.
				868	These whitespaces are typically not intended for inclusion in the delivery of the document.
				869
				870	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				871
				872	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				873	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				874
				875	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				876	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				877	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				878
				879	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				880
				881
				882	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				883
				884	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				885	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				886
				887	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				888	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				889
				890	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				891	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				892
				893	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				894	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				895	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				896	the last read 'non-tag'-node has to be corrected (see [1]).
				897
				898	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				899	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				900
				901	[1]
				902	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				903	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	904	(see above code fragment '... not exists $ws{ $from - 1 } ...').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	905
				906	[2]
				907	Comparing the 2 examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', is ' ' in both cases handled as a
				908	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				909
				910	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				911	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				912
				913	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' are the from-
				914	and to-indices for the tags 'a' and 'b' both 12, which is the start-index of the token 'tok3'.
				915
				916
				917	## Notes on whitespace fixing
				918
				919	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				920	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				921
				922	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				923	example further down and notes on 'Input restrictions' in the manpage).
				924
				925	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				926
				927	Examples (how primary text with linebreaks would be converted by below code):
				928
				929	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				930	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
				931
				932	Blanks are inserted before the 1st character:
				933
				934	NOTE: not stringent ('...' stands for text):
				935
				936	beg1............................end1 => no blank before 'beg1'
				937	beg2....<pb/>...................end2 => no blank before 'beg2'
				938	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				939	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				940
				941	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				942	^
				943	\|_blank between 'end3' and 'beg4'
				944
				945
				946	## Notes on segfault prevention
				947
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	948	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	949	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				950	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				951	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
				952	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.