Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 561c824659adf681a200e025d14b00b69c3fc00e [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
# The KorAP tokenizer is loaded on a best-effort basis: a failing
# 'require' is swallowed here, so the script still starts without it.
eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 };

# Script version and the banner printed by the help/version options
our $VERSION = '1.00';

our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Compile-time debug switch, taken from the environment variable
# KORAPXMLTEI_DEBUG (falls back to 0 when the variable is undefined)
use constant {
  DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	42
# Parse options from the command line.
#
# Every option is bound to a freshly declared lexical that stays visible
# for the rest of the script; the value in the initializer is the default
# used when the option is not given. Getopt::Long is imported with
# 'no_auto_abbrev', so only the names and aliases listed here are accepted.
#
# NOTE: alternatives in an option spec are separated by a plain '|'.
# A backslash before the pipe is kept verbatim inside single quotes and
# turns the spec into an invalid one, so none is used here.
GetOptions(
  'root|r=s'                        => \(my $root_dir = '.'),
  'input|i=s'                       => \(my $input_fname = ''),
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s'                 => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'             => \(my $inline_structures = 'struct#structure'),
  'skip-inline-tokens'              => \(my $skip_inline_tokens = 0),
  'base-foundry=s'                  => \(my $base_dir = 'base'),
  'data-file=s'                     => \(my $data_file = 'data'),
  'header-file=s'                   => \(my $header_file = 'header'),
  'tokens-file=s'                   => \(my $tokens_file = 'tokens'),
  'log|l=s'                         => \(my $log_level = 'notice'),

  # Print the selected POD sections (preceded by the version banner)
  # to STDOUT; pod2usage terminates the script afterwards.
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version banner and the short usage to STDOUT
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    )
  }
);
				75
# Establish logger: STDERR is switched to UTF-8 and Log::Any is routed
# to it with the level requested on the command line.
binmode(STDERR, ":encoding(UTF-8)");
Log::Any::Adapter->set('Stderr', log_level => $log_level);

$log->notice('Debugging is activated') if DEBUG;

# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';


# Sentence splitting is only offered together with the KorAP tokenizer
if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
  die $log->fatal("Sentence splitting is currently only supported by KorAP tokenizer (use -tk to activate it)");
};

# External tokenizer (stays undefined if none was requested).
# An explicit tokenizer call takes precedence over the KorAP tokenizer.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is required inside an eval at startup and
  # a load failure is ignored there - report it here in a readable way
  # instead of failing on the missing constructor.
  unless ('KorAP::XML::TEI::Tokenizer::KorAP'->can('new')) {
    die $log->fatal(
      'KorAP tokenizer requested, but ' .
      'KorAP::XML::TEI::Tokenizer::KorAP could not be loaded'
    );
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	105	##
				106
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	107
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	108	#
				109	# ~~~ constants ~~~
				110	#
				111
				112
## Internal tokenizers (both are run when internal tokenization is requested)
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new();
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new();
##

# Directory and file name for all inline structure information
# (everything except the $_TOKENS_TAG information).
# The option value has the form 'dir#file'; appending '#structure'
# supplies the file name when only a directory was given.
my ($_structure_dir, $_structure_file) =
  split(/#/, "${inline_structures}#structure");

# Directory and file name for all inline token information
# (tokens of the $_TOKENS_TAG); 'morpho' is the fallback file name.
my ($_tokens_dir, $_tokens_file) =
  split(/#/, "${inline_tokens}#morpho");

# Handling of inline annotations (inside $_TOKENS_TAG):
# 1 if the environment variable KORAPXMLTEI_INLINE is true, 0 otherwise
my $_INLINE_ANNOT = 0;
$_INLINE_ANNOT = 1 if $ENV{KORAPXMLTEI_INLINE};

# Collectors for token and structure annotations
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new();
my $structures = KorAP::XML::TEI::Annotations::Collector->new();

# Collector for the primary text data
my $data = KorAP::XML::TEI::Data->new();

# Zipper writing all output streams, set up with the root directory
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	138
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	139
				140	#
				141	# ~~~ variables ~~~
				142	#
				143
# Text directory (below $root_dir); empty as long as no text header was seen
my $dir = '';

# Identifier of the current text ...
my $text_id;

# ... and its escaped version
my $text_id_esc;

# Index of the child-node array inside an element of $tree_data.
# Used by the recursive function 'retr_info'; it is one higher in
# debugging mode, where line numbers are included in the elements
# of $tree_data.
my $child_idx = DEBUG ? 5 : 4;

# Element from $tree_data (loop variable of 'retr_info')
my $e;

# Scratch position; the main loop keeps the offset of the closing
# text-body tag within the current line here
my $pos;

# Encoding of the input; replaced by the value from the XML declaration
my $input_enc = 'UTF-8';

# ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;
my $from = 0;

# Counter for text lines (needed for whitespace handling)
my $text_line = 0;

# Indices of whitespace nodes (needed to re-correct from-values).
# IDEA:
# when closing an element, check if its from-index minus 1 refers to a
# whitespace node (i.e. 'from-index - 1' is a key in %ws).
# If this is _not_ the case, the from-value is one too high
# => correct it by subtracting 1
my %ws;


#
# ~~~ main ~~~
#
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	187
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	188
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	189	# ~ read input and write output (text by text) ~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	190
# Input is read from STDIN unless an input file name was given
my $input_fh = *STDIN;

if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
};

# Read raw bytes (decoding is done explicitly per line later on).
# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	202
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	203
# Reading input document line by line.
#
# Three kinds of lines are recognized:
#   - the XML declaration, from which the input encoding is taken,
#   - a header start tag ($_HEADER_TAG), which is parsed and written
#     to the zip archive; a text-level header sets $dir and the text ids,
#   - the opening text-body tag ($_TEXT_BODY), after which all lines up
#     to the closing tag are collected, parsed and written out.
# Every other line is ignored.
MAIN: while ( <$input_fh> ) {

  # remove XML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments( $input_fh, $_ );

  # Set input encoding from the XML declaration.
  # (A backreference cannot be used inside a character class, so the
  # value is matched non-greedily up to the closing quote instead.)
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of Text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Save both captures before the next match resets them
    my ($prefix, $suffix) = ($1, $2);

    # The opening tag has to stand alone on its line
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments( $input_fh, $_ );
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if (($pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # The closing tag has to stand alone on its line as well
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text-level header was seen for this body => nowhere to write to
        if ($dir eq '') {
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($data->data, 0, 200)
            );
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$text_buffer</text>",
          huge => 1
        );

        # See notes on whitespace handling
        my $param = XCT_DOCUMENT_ROOT | XCT_IGNORE_COMMENTS | XCT_ATTRIBUTE_ARRAY;

        # XCT_LINE_NUMBERS is only needed for debugging
        # (see XML::CompactTree::XS)
        $param |= XCT_LINE_NUMBERS if DEBUG;
        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, $param);

        $structures->reset;

        $tokens->reset unless $skip_inline_tokens;

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # ~ recursion ~
        retr_info(1, $tree_data->[2]); # parse input data

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          );

          $aggr_tok->reset;
          $cons_tok->reset;
        };

        # Let the tokenizer add its sentence boundaries to the structures
        if ($use_tokenizer_sentence_splits) {
          $ext_tok->sentencize_from_previous_input($structures);
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          );
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        s/^(.)/ $1/ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };

  } elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # ~ start of header ~

    # Save both captures before the next match resets them
    my ($prefix, $content) = ($1, "$2\n");

    # Only whitespace may precede the opening header tag
    if ($prefix !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };

    # Parse header (the parser continues reading from $input_fh)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id     = $header->id;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	436
# All texts are processed: finish the zip output first, ...
$zipper->close;

# ... then shut down the external tokenizer (if one was started) ...
if ($ext_tok) {
  $ext_tok->close;
};

# ... and release the input
close($input_fh);

exit 0;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	444
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	445
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	446	# Recursively called function to handle XML tree data
				447	sub retr_info {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	448
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	449	# recursion level
				450	# (1 = topmost level inside retr_info() = should always be level of tag $_TEXT_BODY)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	451	my $depth = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	452
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	453	# Iteration through all array elements
				454	# ($_[0] is a reference to an array reference)
				455	# See notes on how 'XML::CompactTree::XS' works and
				456	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
Akron	3556c75	2021-02-24 09:53:24 +0100	[diff] [blame^]	457	foreach $e (@{$_[0]}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	458
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	459	# Element node
				460	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	461
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	462	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	463	# from here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	464	#~~~~
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	465
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	466	# $e->[1] represents the tag name
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	467	# Skip sentences
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	468	if ($use_tokenizer_sentence_splits && $e->[1] eq "s") {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	469	if (defined $e->[$child_idx]) {
Akron	3556c75	2021-02-24 09:53:24 +0100	[diff] [blame^]	470	retr_info($depth+1, $e->[$child_idx]);
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	471	}
				472	next;
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	473	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	474
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	475	my $anno = $structures->add_new_annotation($e->[1]);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	476
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	477	# Add element also to token list
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	478	if (!$skip_inline_tokens && $e->[1] eq $_TOKENS_TAG) {
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	479	$tokens->add_annotation($anno);
				480	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	481
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	482	# Handle attributes (if attributes exist)
				483	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	484
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	485	# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
				486	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				487	# note: arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	488	for (local $_ = 0; $_ < @{$e->[3]}; $_ += 2) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	489
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	490	# '$_' references the 'key' and '$_+1' the 'value'
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	491	$anno->add_attribute(
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	492	@{$e->[3]}[$_, $_ + 1]
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	493	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	494	};
				495	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	496
				497	# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	498	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	499
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	500
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	501	#~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	502	# until here: tag-node (opening)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	503	#~~~~
				504
				505
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	506	# Call function recursively
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	507	# do no recursion, if $e->[$child_idx] is not defined
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	508	# (because we have no array of child-nodes, e.g.: <back/>)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	509	if (defined $e->[$child_idx]) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	510
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	511	# Recursion with array of child-nodes
Akron	3556c75	2021-02-24 09:53:24 +0100	[diff] [blame^]	512	retr_info($depth+1, $e->[$child_idx]);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	513	}
				514
				515
				516	#~~~~~
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	517	# from here: tag-node (closing)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	518	#~~~~~
				519
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	520	# NOTE: use $pos, because the offsets are _between_ the characters
				521	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	522	my $pos = $data->position;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	523
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	524	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	525
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	526	$from = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	527
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	528	# ~ whitespace related issue ~
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	529	if ($from > 0 && not exists $ws{$from - 1}) {
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	530
				531	# ~ previous node was a text-node ~
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	532	$anno->set_from($from - 1);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	533	};
				534
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	535	# in case this fails, check input
				536	if (($from - 1) > $pos) {
				537	die $log->fatal(
				538	"text_id='$text_id', " .
				539	'processing of structures: ' .
				540	"from-value ($from) is 2 or more greater " .
				541	"than to-value ($pos) => please check. Aborting"
				542	);
				543	};
				544
				545	# TODO:
				546	# find example for which this case applies
				547	# maybe this is not necessary anymore, because the
				548	# above recorrection of the from-value suffices
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	549	#
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	550	# TODO:
				551	# check, if it's better to remove this line and
				552	# change above check to 'if ($from - 1) >= $pos;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	553	# do testing with bigger corpus excerpt (wikipedia?)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	554	$anno->set_from($pos) if $from == $pos + 1;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	555	$anno->set_to($pos);
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	556	$anno->set_level($depth);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	557
				558	# Clean up whitespace
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	559	delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	560
				561
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	562	#~~~~
				563	# until here: tag-node (closing)
				564	#~~~~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	565	}
				566
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	567	# Text node
				568	elsif ($e->[0] == XML_READER_TYPE_TEXT){
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	569
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	570	$add_one = 1;
				571	$data->append($e->[1]);
				572	}
				573
				574	# Whitespace node
				575	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				576	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				577
				578	# state, that this from-index belongs to a whitespace-node
				579	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				580	$ws{$data->position}++;
				581
				582	$add_one = 0;
				583	$data->append($e->[1]);
				584	}
				585
				586	# not yet handled type
				587	else {
				588
				589	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				590	};
				591	};
				592	};
				593
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	594
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	595	__END__
				596
				597	=pod
				598
				599	=encoding utf8
				600
				601	=head1 NAME
				602
				603	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				604
				605	=head1 SYNOPSIS
				606
				607	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				608
				609	=head1 DESCRIPTION
				610
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	611	C<tei2korapxml> is a script to convert TEI P5 and
				612	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				613	based documents to the
				614	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				615	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	616	read from C<STDIN>. If no specific output is defined, data is written
				617	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	618
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	619	This program is usually called from inside another script.
				620
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	621	=head1 FORMATS
				622
				623	=head2 Input restrictions
				624
				625	=over 2
				626
				627	=item
				628
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	629	TEI P5 formatted input with certain restrictions:
				630
				631	=over 4
				632
				633	=item
				634
				635	B<mandatory>: text-header with integrated textsigle, text-body
				636
				637	=item
				638
				639	B<optional>: corp-header with integrated corpsigle,
				640	doc-header with integrated docsigle
				641
				642	=back
				643
				644	=item
				645
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	646	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	647	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	648	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	649	into blanks between 2 tokens could lead to additional blanks,
				650	where there should be none (e.g.: punctuation characters like C<,> or
				651	C<.> should not be separated from their predecessor token).
				652	(see also code section C<~ whitespace handling ~>).
				653
				654	=back
				655
				656	=head2 Notes on the output
				657
				658	=over 2
				659
				660	=item
				661
				662	zip file output (default on C<stdout>) with utf8 encoded entries
				663	(which together form the KorAP-XML format)
				664
				665	=back
				666
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	667	=head1 INSTALLATION
				668
				669	C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
				670	these bindings are available, the preferred way to install the script is
				671	to use L<cpanm\|App::cpanminus>.
				672
				673	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				674
				675	In case everything went well, the C<tei2korapxml> tool will
				676	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	677
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	678	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				679
				680	=head1 OPTIONS
				681
				682	=over 2
				683
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	684	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	685
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	686	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	687
				688	=item B<--help\|-h>
				689
				690	Print help information.
				691
				692	=item B<--version\|-v>
				693
				694	Print version information.
				695
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	696	=item B<--tokenizer-call\|-tc>
				697
				698	Call an external tokenizer process, that will tokenize
				699	a single line from STDIN and outputs one token per line.
				700
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	701	=item B<--tokenizer-korap\|-tk>
				702
				703	Use the standard KorAP/DeReKo tokenizer.
				704
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	705	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	706
				707	Tokenize the data using two embedded tokenizers,
				708	that will take an I<aggressive> and a I<conservative>
				709	approach.
				710
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	711	=item B<--skip-inline-tokens>
				712
				713	Boolean flag indicating that inline tokens should not
				714	be processed. Defaults to false (meaning inline tokens will be processed).
				715
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	716	=item B<--inline-tokens> <foundry>#[<file>]
				717
				718	Define the foundry and file (without extension)
				719	to store inline token information in.
				720	If L</KORAPXMLTEI_INLINE> is set, this will contain
				721	annotations as well.
				722	Defaults to C<tokens> and C<morpho>.
				723
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	724	=item B<--inline-structures> <foundry>#[<file>]
				725
				726	Define the foundry and file (without extension)
				727	to store inline structure information in.
				728	Defaults to C<struct> and C<structures>.
				729
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	730	=item B<--base-foundry> <foundry>
				731
				732	Define the base foundry to store newly generated
				733	token information in.
				734	Defaults to C<base>.
				735
				736	=item B<--data-file> <file>
				737
				738	Define the file (without extension)
				739	to store primary data information in.
				740	Defaults to C<data>.
				741
				742	=item B<--header-file> <file>
				743
				744	Define the file name (without extension)
				745	to store header information on
				746	the corpus, document, and text level in.
				747	Defaults to C<header>.
				748
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	749	=item B<--use-tokenizer-sentence-splits\|-s>
				750
				751	Replace existing with, or add new, sentence boundary information
				752	provided by the KorAP tokenizer (currently the only tokenizer for which this is supported).
				753
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	754	=item B<--tokens-file> <file>
				755
				756	Define the file (without extension)
				757	to store generated token information in
				758	(either from the KorAP tokenizer or an externally called tokenizer).
				759	Defaults to C<tokens>.
				760
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	761	=item B<--log\|-l>
				762
				763	Loglevel for I<Log::Any>. Defaults to C<notice>.
				764
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	765	=back
				766
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	767	=head1 ENVIRONMENT VARIABLES
				768
				769	=over 2
				770
				771	=item B<KORAPXMLTEI_DEBUG>
				772
				773	Activate minimal debugging.
				774	Defaults to C<false>.
				775
				776	=item B<KORAPXMLTEI_INLINE>
				777
				778	Process inline annotations, if present.
				779	Defaults to C<false>.
				780
				781	=back
				782
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	783	=head1 COPYRIGHT AND LICENSE
				784
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	785	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	786
				787	Author: Peter Harders
				788
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	789	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	790
				791	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				792	Corpus Analysis Platform at the
				793	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				794	member of the
				795	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				796
				797	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	798	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	799
				800	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	801
				802	# NOTES
				803
				804	## Notes on how 'XML::CompactTree::XS' works
				805
				806	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				807
				808	Print out name of 'node2' for the above example:
				809
				810	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				811
				812	Exploring the structure of $data ( = reference to below array ):
				813
				814	[ 0: XML_READER_TYPE_DOCUMENT,
				815	1: ?
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	816	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see retr_info( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	817	1: 'node'
				818	2: ?
				819	3: HASH (attributes)
				820	4: 1 (line number)
				821	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				822	1: 'node1'
				823	2: ?
				824	3: undefined (no attributes)
				825	4: 1 (line number)
				826	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				827	1: 'some '
				828	]
				829	1: [ 0: XML_READER_TYPE_ELEMENT
				830	1: 'n'
				831	2: ?
				832	3: undefined (no attributes)
				833	4: 1 (line number)
				834	5: undefined (no child-nodes)
				835	]
				836	2: [ 0: XML_READER_TYPE_TEXT
				837	1: ' text'
				838	]
				839	]
				840	]
				841	1: [ 0: XML_READER_TYPE_ELEMENT
				842	1: 'node2'
				843	2: ?
				844	3: undefined (no attributes)
				845	4: 1 (line number)
				846	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				847	1: 'more-text'
				848	]
				849	]
				850	]
				851	]
				852	]
				853	]
				854	]
				855
				856	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				857
				858	ref($data->[2]) == ARRAY (with 1 element for 'node')
				859	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				860
				861	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				862	$data->[2]->[0]->[1] == 'node'
				863	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				864	$data->[2]->[0]->[4] == 1 (line number)
				865	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	866	# child-nodes of actual node (see $child_idx)
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	867
				868	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				869	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				870	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				871	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				872	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				873	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				874
				875	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				876	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				877	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				878
				879	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				880	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				881	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				882	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				883	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				884	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				885
				886	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				887	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				888	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				889
				890
				891	retr_info() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
				892	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				893	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				894
				895
				896	## Notes on whitespace handling
				897
				898	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
				899	(see function 'retr_info()').
				900
				901	Definition of significant and insignificant whitespace
				902	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				903
				904	Significant whitespace is part of the document content and should be preserved.
				905	Insignificant whitespace is used when editing XML documents for readability.
				906	These whitespaces are typically not intended for inclusion in the delivery of the document.
				907
				908	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				909
				910	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				911	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				912
				913	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				914	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				915	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				916
				917	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				918
				919
				920	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				921
				922	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				923	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				924
				925	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				926	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				927
				928	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				929	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				930
				931	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				932	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				933	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				934	the last read 'non-tag'-node has to be corrected (see [1]).
				935
				936	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				937	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				938
				939	[1]
				940	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				941	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	942	(see above code fragment '... not exists $ws{ $from - 1 } ...').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	943
				944	[2]
				945	In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', ' ' is handled as a
				946	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				947
				948	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				949	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				950
				951	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
				952	and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
				953
				954
				955	## Notes on whitespace fixing
				956
				957	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				958	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				959
				960	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				961	example further down and notes on 'Input restrictions' in the manpage).
				962
				963	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				964
				965	Examples (how primary text with linebreaks would be converted by below code):
				966
				967	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				968	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '...<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>...'.
				969
				970	Blanks are inserted before the 1st character:
				971
				972	NOTE: not stringent ('...' stands for text):
				973
				974	beg1............................end1 => no blank before 'beg1'
				975	beg2....<pb/>...................end2 => no blank before 'beg2'
				976	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				977	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				978
				979	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				980	^
				981	\|_blank between 'end3' and 'beg4'
				982
				983
				984	## Notes on segfault prevention
				985
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	986	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	987	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				988	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				989	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				990	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.