Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: af1bc059111cbdaee2c511a2d351d2eb75462fa7 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14	use XML::CompactTree::XS;
				15	use XML::LibXML::Reader;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	16
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	17	use FindBin;
				18	BEGIN {
				19	unshift @INC, "$FindBin::Bin/../lib";
				20	};
				21
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	22	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Tokenizer::Conservative;
				25	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	26	use KorAP::XML::TEI::Annotations::Collector;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	27	use KorAP::XML::TEI::Data;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	28	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	29	use KorAP::XML::TEI::Header;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	30
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	31	eval {
				32	require KorAP::XML::TEI::Tokenizer::KorAP;
				33	1;
				34	};
Peter Harders	1c5ce15	2020-07-22 18:02:50 +0200	[diff] [blame]	35
# Version of this script (shown by --help and --version)
our $VERSION = '1.01';

# Banner that is printed in front of the usage text
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set the environment variable KORAPXMLTEI_DEBUG to a true value for
# minimal more debug output (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# Flags passed to XML::CompactTree::XS when reading the text body.
# XCT_LINE_NUMBERS is only needed for debugging
# (see XML::CompactTree::XS)
use constant XCT_PARAM => (
  XCT_DOCUMENT_ROOT
    | XCT_IGNORE_COMMENTS
    | XCT_ATTRIBUTE_ARRAY
    | ($ENV{KORAPXMLTEI_DEBUG} ? XCT_LINE_NUMBERS : 0)
);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	53
# Parse options from the command line.
# Each option stores its value in a lexical that is declared (together
# with its default) right next to the option specification.
# If the command line cannot be parsed (e.g. unknown option or missing
# value), GetOptions has already printed a warning and returns false;
# in that case print the usage and exit with status 2 instead of
# silently converting with default settings.
GetOptions(
  'root|r=s'                        => \(my $root_dir = '.'),
  'input|i=s'                       => \(my $input_fname = ''),
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s'                 => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'             => \(my $inline_structures = 'struct#structure'),
  'skip-inline-tokens'              => \(my $skip_inline_tokens = 0),
  'skip-inline-tags=s'              => \(my $skip_inline_tags_str = ''),
  'base-foundry=s'                  => \(my $base_dir = 'base'),
  'data-file=s'                     => \(my $data_file = 'data'),
  'header-file=s'                   => \(my $header_file = 'header'),
  'tokens-file=s'                   => \(my $tokens_file = 'tokens'),
  'log|l=s'                         => \(my $log_level = 'notice'),

  # Print the full documentation
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version banner
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
) or pod2usage(
  -verbose => 0,
  -exitval => 2
);
				87
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	88
# Establish logger (messages go to STDERR, encoded as UTF-8)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);

if (DEBUG) {
  $log->notice('Debugging is activated');
};


# Name of the tag (without attributes) which contains the primary text
my $_TEXT_BODY = 'text';

# Name of the header tag
# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# Name of the tag containing all information stored in $_tokens_file
my $_TOKENS_TAG = 'w';
				104
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	105
				106	# Define tokenizers
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	107	if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	108	die $log->fatal(
				109	'Sentence splitting is currently only supported by KorAP tokenizer ' .
				110	'(use -tk to activate it)'
				111	);
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	112	};
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	113
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame^]	114	# Remember to skip certain inline tags
				115	my %skip_inline_tags = ();
				116	if ($skip_inline_tags_str) {
				117	foreach (split /\s,\s/, $skip_inline_tags_str) {
				118	$skip_inline_tags{$_} = 1;
				119	};
				120	};
				121
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	122	# External tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	123	my $ext_tok;
				124	if ($tokenizer_call) {
				125	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
				126	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	127
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	128	elsif ($tokenizer_korap) {
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	129	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame^]	130	if ($use_tokenizer_sentence_splits) {
				131	$skip_inline_tags{s} = 1;
				132	};
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	133	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	134
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	135
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;


# Name of the directory and the file (given as 'dir#file') containing all
# inline structure information except for the $_TOKENS_TAG information.
# The appended '#structure' supplies the file name if the option has none.
my ($_structure_dir, $_structure_file) = split /#/, "${inline_structures}#structure";

# Name of the directory and the file (given as 'dir#file') containing all
# inline token information, i.e. the tokens of the $_TOKENS_TAG.
# The appended '#morpho' supplies the file name if the option has none.
my ($_tokens_dir, $_tokens_file) = split /#/, "${inline_tokens}#morpho";

# Handling inline annotations (inside $_TOKENS_TAG),
# switched on by the environment variable KORAPXMLTEI_INLINE
my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;

# Initialize Token- and Structure-Collector
my $tokens     = KorAP::XML::TEI::Annotations::Collector->new;
my $structures = KorAP::XML::TEI::Annotations::Collector->new;

# Initialize Data-Collector
my $data = KorAP::XML::TEI::Data->new;

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);


# text directory (below $root_dir)
my $dir = '';

# Escaped version of text id
my $text_id_esc;

# element from $tree_data
my $e;

# Default encoding of the text
my $input_enc = 'UTF-8';

# variable for handling ~ whitespace related issue ~
# (it is sometimes necessary to correct the from-values for some tags)
my $add_one;

# text line (needed for whitespace handling)
my $text_line = 0;

# hash for indices of whitespace-nodes
# (needed to recorrect from-values)
# IDEA:
#   when closing an element, check if its from-index minus 1 refers to
#   a whitespace-node (means: 'from-index - 1' is a key in %ws).
#   If this is _not_ the case, then the from-value is one
#   too high => correct it by subtracting 1
my %ws;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	190
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	191
# Input file handle (default: stdin)
my $input_fh = *STDIN;

# Read from the file given on the command line instead, if there is one
if ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
};

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	203
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	204
# Reading input document.
# The input is processed line by line. Three kinds of lines are handled:
#   - the XML declaration (to pick up the input encoding),
#   - the opening tag of a text body (everything up to the closing tag
#     is collected, parsed and written to the zip archive),
#   - the opening tag of a header (parsed and written to the zip archive).
MAIN: while (<$input_fh>) {

  # remove HTML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding
  # NOTE:
  #   Inside a bracketed character class '\1' is not a backreference
  #   (it denotes the character with code 1). The value ends at the
  #   matching quote because of the non-greedy quantifier followed by
  #   the real backreference \1.
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  # ($1 = everything in front of the opening tag, $2 = everything behind it)
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Save $2, because the following matches may reset the capture variables
    my $suffix = $2;

    # Only whitespace is accepted in front of and behind the opening tag
    if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if ((my $pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and possibly morpho.xml
        # and/or tokenization files

        # Only whitespace is accepted around the closing tag
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header was seen before this text body
        if ($dir eq '') {
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($data->data, 0, 200)
          );
          next MAIN;
        };

        my $reader = XML::LibXML::Reader->new(
          string => "<text>$text_buffer</text>",
          huge => 1
        );

        my $tree_data = XML::CompactTree::XS::readSubtreeToPerl($reader, XCT_PARAM);

        # ~ whitespace related issue ~
        $add_one = 0;
        %ws = ();

        # Recursively parse all children
        descend(1, $tree_data->[2]);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        if (!$structures->empty) {
          $structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          )->reset;
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $tokens->empty) {
          $tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            $_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
          )->reset;
        };

        # reinit.
        $dir = '';

        # Maybe not necessary
        $data->reset;

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };
  }

  # Start of header section
  # ($1 = everything in front of the header tag,
  #  $2 = the header tag up to the end of the line)
  elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    my $content = "$2\n";

    # Only whitespace is accepted in front of the opening header tag
    if ($1 !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };

    # Parse header
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	423
# Close the zipper
$zipper->close;

# Close the external tokenizer, if one is in use
if ($ext_tok) {
  $ext_tok->close;
};

# Close the input
close $input_fh;

exit(0);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	431
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	432
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	433	# Recursively called function to handle XML tree data
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	434	sub descend {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	435
Akron	1c4f220	2020-07-30 09:28:22 +0200	[diff] [blame]	436	# recursion level
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	437	# (1 = topmost level inside descend() = should always be level of tag $_TEXT_BODY)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	438	my $depth = shift;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	439
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	440	# Iteration through all array elements
				441	# ($_[0] is a reference to an array reference)
				442	# See notes on how 'XML::CompactTree::XS' works and
				443	# see 'NODE TYPES' in manpage of XML::LibXML::Reader
Akron	3556c75	2021-02-24 09:53:24 +0100	[diff] [blame]	444	foreach $e (@{$_[0]}) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	445
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	446	# $e->[1] represents the tag name of an element node
				447	# or the primary data of a text or ws node
				448	my $node_info = $e->[1];
				449
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	450	# Element node
				451	if ($e->[0] == XML_READER_TYPE_ELEMENT) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	452
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	453	# Deal with opening tag
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	454
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	455	# Get the child index depending on the debug state.
				456	# This is likely to be optimized away by the compiler.
				457	my $children = $e->[DEBUG ? 5 : 4];
				458
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame^]	459	# Skip certain tags
				460	if ($skip_inline_tags{$node_info}) {
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	461	descend($depth + 1, $children) if defined $children;
Akron	ace1277	2021-02-19 13:16:26 +0100	[diff] [blame]	462	next;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	463	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	464
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	465	my $anno = $structures->add_new_annotation($node_info);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	466
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	467	# Add element also to token list
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	468	if (!$skip_inline_tokens && $node_info eq $_TOKENS_TAG) {
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	469	$tokens->add_annotation($anno);
				470	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	471
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	472	# Handle attributes (if attributes exist)
				473	if (defined $e->[3]) {
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	474
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	475	# with 'XCT_ATTRIBUTE_ARRAY', $node->[3] is an array reference of the form
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	476	# [ name1, value1, name2, value2, ....] of attribute names and corresponding values.
				477	# NOTE:
				478	# arrays are faster (see: http://makepp.sourceforge.net/2.0/perl_performance.html)
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	479	for (local $_ = 0; $_ < @{$e->[3]}; $_ += 2) {
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	480	$anno->add_attribute(
Akron	dac5d93	2021-02-23 21:12:02 +0100	[diff] [blame]	481	@{$e->[3]}[$_, $_ + 1]
Akron	7501ca0	2020-08-01 21:05:25 +0200	[diff] [blame]	482	);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	483	};
				484	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	485
				486	# this is, where a normal tag or tokens-tag ($_TOKENS_TAG) starts
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	487	$anno->set_from($data->position + $add_one);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	488
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	489
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	490	# Call function recursively
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	491	# do no recursion, if $children is not defined
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	492	# (because we have no array of child-nodes, e.g.: <back/>)
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	493	descend($depth+1, $children) if defined $children;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	494
				495
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	496	# Deal with closing tag
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	497
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	498	# NOTE:
				499	# use $pos, because the offsets are _between_ the characters
				500	# (e.g.: word = 'Hello' => from = 0 (before 'H'), to = 5 (after 'o'))
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	501	my $pos = $data->position;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	502
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	503	# Handle structures and tokens
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	504
Akron	b43b491	2021-02-25 10:31:11 +0100	[diff] [blame]	505	my $from = $anno->from;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	506
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	507	# ~ whitespace related issue ~
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	508	if ($from > 0 && not exists $ws{$from - 1}) {
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	509
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	510	# Previous node was a text-node
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	511	$anno->set_from($from - 1);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	512	};
				513
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	514	# in case this fails, check input
				515	if (($from - 1) > $pos) {
				516	die $log->fatal(
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	517	"text_id='$text_id_esc', " .
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	518	'processing of structures: ' .
				519	"from-value ($from) is 2 or more greater " .
				520	"than to-value ($pos) => please check. Aborting"
				521	);
				522	};
				523
				524	# TODO:
				525	# find example for which this case applies
				526	# maybe this is not necessary anymore, because the
				527	# above recorrection of the from-value suffices
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	528	#
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	529	# TODO:
				530	# check, if it's better to remove this line and
				531	# change above check to 'if ($from - 1) >= $pos;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	532	# do testing with bigger corpus excerpt (wikipedia?)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	533	$anno->set_from($pos) if $from == $pos + 1;
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	534	$anno->set_to($pos);
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	535	$anno->set_level($depth);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	536
				537	# Clean up whitespace
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	538	delete $ws{$from - 1} if $from > 0 && exists $ws{$from - 1};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	539	}
				540
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	541	# Text node
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	542	elsif ($e->[0] == XML_READER_TYPE_TEXT) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	543
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	544	$add_one = 1;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	545	$data->append($node_info);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	546	}
				547
				548	# Whitespace node
				549	# (See notes on whitespace handling - regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE)
				550	elsif ($e->[0] == XML_READER_TYPE_SIGNIFICANT_WHITESPACE) {
				551
				552	# state, that this from-index belongs to a whitespace-node
				553	# ('++' doesn't mean a thing here - maybe it could be used for a consistency check)
				554	$ws{$data->position}++;
				555
				556	$add_one = 0;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	557	$data->append($node_info);
Akron	d658df7	2021-02-18 18:58:56 +0100	[diff] [blame]	558	}
				559
				560	# not yet handled type
				561	else {
				562
				563	die $log->fatal('Not yet handled type ($e->[0]=' . $e->[0] . ') ... => Aborting');
				564	};
				565	};
				566	};
				567
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	568
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	569	__END__
				570
				571	=pod
				572
				573	=encoding utf8
				574
				575	=head1 NAME
				576
				577	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				578
				579	=head1 SYNOPSIS
				580
				581	cat corpus.i5.xml \| tei2korapxml > corpus.korapxml.zip
				582
				583	=head1 DESCRIPTION
				584
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	585	C<tei2korapxml> is a script to convert TEI P5 and
				586	L<I5\|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				587	based documents to the
				588	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				589	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	590	read from C<STDIN>. If no specific output is defined, data is written
				591	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	592
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	593	This program is usually called from inside another script.
				594
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	595	=head1 FORMATS
				596
				597	=head2 Input restrictions
				598
				599	=over 2
				600
				601	=item
				602
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	603	TEI P5 formatted input with certain restrictions:
				604
				605	=over 4
				606
				607	=item
				608
				609	B<mandatory>: text-header with integrated textsigle, text-body
				610
				611	=item
				612
				613	B<optional>: corp-header with integrated corpsigle,
				614	doc-header with integrated docsigle
				615
				616	=back
				617
				618	=item
				619
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	620	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	621	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	622	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	623	into blanks between 2 tokens could lead to additional blanks,
				624	where there should be none (e.g.: punctuation characters like C<,> or
				625	C<.> should not be separated from their predecessor token).
				626	(see also code section C<~ whitespace handling ~>).
				627
				628	=back
				629
				630	=head2 Notes on the output
				631
				632	=over 2
				633
				634	=item
				635
				636	zip file output (default on C<stdout>) with utf8 encoded entries
				637	(which together form the KorAP-XML format)
				638
				639	=back
				640
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	641	=head1 INSTALLATION
				642
				643	C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
				644	these bindings are available, the preferred way to install the script is
				645	to use L<cpanm\|App::cpanminus>.
				646
				647	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				648
				649	In case everything went well, the C<tei2korapxml> tool will
				650	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	651
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	652	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				653
				654	=head1 OPTIONS
				655
				656	=over 2
				657
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	658	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	659
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	660	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	661
				662	=item B<--help\|-h>
				663
				664	Print help information.
				665
				666	=item B<--version\|-v>
				667
				668	Print version information.
				669
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	670	=item B<--tokenizer-call\|-tc>
				671
				672	Call an external tokenizer process, that will tokenize
				673	a single line from STDIN and outputs one token per line.
				674
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	675	=item B<--tokenizer-korap\|-tk>
				676
				677	Use the standard KorAP/DeReKo tokenizer.
				678
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	679	=item B<--tokenizer-internal\|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	680
				681	Tokenize the data using two embedded tokenizers,
				682	that will take an I<Aggressive> and a I<conservative>
				683	approach.
				684
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	685	=item B<--skip-inline-tokens>
				686
				687	Boolean flag indicating that inline tokens should not
				688	be processed. Defaults to false (meaning inline tokens will be processed).
				689
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame^]	690	=item B<--skip-inline-tags>
				691
				692	Expects a comma-separated list of tags to be ignored when the structure
				693	is parsed. Content of these tags however will be processed.
				694
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	695	=item B<--inline-tokens> <foundry>#[<file>]
				696
				697	Define the foundry and file (without extension)
				698	to store inline token information in.
				699	If L</KORAPXMLTEI_INLINE> is set, this will contain
				700	annotations as well.
				701	Defaults to C<tokens> and C<morpho>.
				702
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	703	=item B<--inline-structures> <foundry>#[<file>]
				704
				705	Define the foundry and file (without extension)
				706	to store inline structure information in.
				707	Defaults to C<struct> and C<structures>.
				708
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	709	=item B<--base-foundry> <foundry>
				710
				711	Define the base foundry to store newly generated
				712	token information in.
				713	Defaults to C<base>.
				714
				715	=item B<--data-file> <file>
				716
				717	Define the file (without extension)
				718	to store primary data information in.
				719	Defaults to C<data>.
				720
				721	=item B<--header-file> <file>
				722
				723	Define the file name (without extension)
				724	to store header information on
				725	the corpus, document, and text level in.
				726	Defaults to C<header>.
				727
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	728	=item B<--use-tokenizer-sentence-splits\|-s>
				729
				730	Replace existing with, or add new, sentence boundary information
				731	provided by the KorAP tokenizer (currently the only tokenizer that supports this).
				732
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	733	=item B<--tokens-file> <file>
				734
				735	Define the file (without extension)
				736	to store generated token information in
				737	(either from the KorAP tokenizer or an externally called tokenizer).
				738	Defaults to C<tokens>.
				739
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	740	=item B<--log\|-l>
				741
				742	Loglevel for I<Log::Any>. Defaults to C<notice>.
				743
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	744	=back
				745
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	746	=head1 ENVIRONMENT VARIABLES
				747
				748	=over 2
				749
				750	=item B<KORAPXMLTEI_DEBUG>
				751
				752	Activate minimal debugging.
				753	Defaults to C<false>.
				754
				755	=item B<KORAPXMLTEI_INLINE>
				756
				757	Process inline annotations, if present.
				758	Defaults to C<false>.
				759
				760	=back
				761
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	762	=head1 COPYRIGHT AND LICENSE
				763
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	764	Copyright (C) 2021, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	765
				766	Author: Peter Harders
				767
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	768	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	769
				770	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				771	Corpus Analysis Platform at the
				772	L<Leibniz Institute for the German Language (IDS)\|http://ids-mannheim.de/>,
				773	member of the
				774	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				775
				776	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	777	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	778
				779	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	780
				781	# NOTES
				782
				783	## Notes on how 'XML::CompactTree::XS' works
				784
				785	Example: <node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>
				786
				787	Print out name of 'node2' for the above example:
				788
				789	echo '<node a="v"><node1>some <n/> text</node1><node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27\n"'
				790
				791	Exploring the structure of $data ( = reference to below array ):
				792
				793	[ 0: XML_READER_TYPE_DOCUMENT,
				794	1: ?
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	795	2: [ 0: [ 0: XML_READER_TYPE_ELEMENT <- start recursion with array '$data->[2]' (see descend( \$tree_data->[2] ))
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	796	1: 'node'
				797	2: ?
				798	3: HASH (attributes)
				799	4: 1 (line number)
				800	5: [ 0: [ 0: XML_READER_TYPE_ELEMENT
				801	1: 'node1'
				802	2: ?
				803	3: undefined (no attributes)
				804	4: 1 (line number)
				805	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				806	1: 'some '
				807	]
				808	1: [ 0: XML_READER_TYPE_ELEMENT
				809	1: 'n'
				810	2: ?
				811	3: undefined (no attributes)
				812	4: 1 (line number)
				813	5: undefined (no child-nodes)
				814	]
				815	2: [ 0: XML_READER_TYPE_TEXT
				816	1: ' text'
				817	]
				818	]
				819	]
				820	1: [ 0: XML_READER_TYPE_ELEMENT
				821	1: 'node2'
				822	2: ?
				823	3: undefined (no attributes)
				824	4: 1 (line number)
				825	5: [ 0: [ 0: XML_READER_TYPE_TEXT
				826	1: 'more-text'
				827	]
				828	]
				829	]
				830	]
				831	]
				832	]
				833	]
				834
				835	$data->[0] = 9 (=> type == XML_READER_TYPE_DOCUMENT)
				836
				837	ref($data->[2]) == ARRAY (with 1 element for 'node')
				838	ref($data->[2]->[0]) == ARRAY (with 6 elements)
				839
				840	$data->[2]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				841	$data->[2]->[0]->[1] == 'node'
				842	ref($data->[2]->[0]->[3]) == HASH (=> ${$data->[2]->[0]->[3]}{a} == 'v')
				843	$data->[2]->[0]->[4] == 1 (line number)
				844	ref($data->[2]->[0]->[5]) == ARRAY (with 2 elements for 'node1' and 'node2')
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	845	# child-nodes of actual node (see $children)
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	846
				847	ref($data->[2]->[0]->[5]->[0]) == ARRAY (with 6 elements)
				848	$data->[2]->[0]->[5]->[0]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				849	$data->[2]->[0]->[5]->[0]->[1] == 'node1'
				850	$data->[2]->[0]->[5]->[0]->[3] == undefined (=> no attribute)
				851	$data->[2]->[0]->[5]->[0]->[4] == 1 (line number)
				852	ref($data->[2]->[0]->[5]->[0]->[5]) == ARRAY (with 3 elements for 'some ', '<n/>' and ' text')
				853
				854	ref($data->[2]->[0]->[5]->[0]->[5]->[0]) == ARRAY (with 2 elements)
				855	$data->[2]->[0]->[5]->[0]->[5]->[0]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				856	$data->[2]->[0]->[5]->[0]->[5]->[0]->[1] == 'some '
				857
				858	ref($data->[2]->[0]->[5]->[0]->[5]->[1]) == ARRAY (with 5 elements)
				859	$data->[2]->[0]->[5]->[0]->[5]->[1]->[0] == 1 (=> type == XML_READER_TYPE_ELEMENT)
				860	$data->[2]->[0]->[5]->[0]->[5]->[1]->[1] == 'n'
				861	$data->[2]->[0]->[5]->[0]->[5]->[1]->[3] == undefined (=> no attribute)
				862	$data->[2]->[0]->[5]->[0]->[5]->[1]->[4] == 1 (line number)
				863	$data->[2]->[0]->[5]->[0]->[5]->[1]->[5] == undefined (=> no child-nodes)
				864
				865	ref($data->[2]->[0]->[5]->[0]->[5]->[2]) == ARRAY (with 2 elements)
				866	$data->[2]->[0]->[5]->[0]->[5]->[2]->[0] == 3 (=> type == XML_READER_TYPE_TEXT)
				867	$data->[2]->[0]->[5]->[0]->[5]->[2]->[1] == ' text'
				868
				869
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	870	descend() starts with the array reference ${$_[0]} (= \$tree_data->[2]), which corresponds to ${\$data->[2]} in the above example.
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	871	Hence, the expression @{${$_[0]}} corresponds to @{${\$data->[2]}}, $e to ${${\$data->[2]}}[0] (= $data->[2]->[0]) and $e->[0] to
				872	${${\$data->[2]}}[0]->[0] (= $data->[2]->[0]->[0]).
				873
				874
				875	## Notes on whitespace handling
				876
				877	Every whitespace inside the processed text is 'significant' and recognized as a node of type 'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'
Akron	5aca0d2	2021-02-24 12:09:53 +0100	[diff] [blame]	878	(see function 'descend()').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	879
				880	Definition of significant and insignificant whitespace
				881	(source: https://www.oracle.com/technical-resources/articles/wang-whitespace.html):
				882
				883	Significant whitespace is part of the document content and should be preserved.
				884	Insignificant whitespace is used when editing XML documents for readability.
				885	These whitespaces are typically not intended for inclusion in the delivery of the document.
				886
				887	### Regarding XML_READER_TYPE_SIGNIFICANT_WHITESPACE
				888
				889	The 3rd form of nodes, besides text- (XML_READER_TYPE_TEXT) and tag-nodes (XML_READER_TYPE_ELEMENT) are nodes of the type
				890	'XML_READER_TYPE_SIGNIFICANT_WHITESPACE'.
				891
				892	When modifying the previous example (see: Notes on how 'XML::CompactTree::XS' works) by inserting an additional blank between
				893	'</node1>' and '<node2>', the output for '$data->[2]->[0]->[5]->[1]->[1]' is a blank (' ') and its type is '14'
				894	(XML_READER_TYPE_SIGNIFICANT_WHITESPACE, see 'man XML::LibXML::Reader'):
				895
				896	echo '<node a="v"><node1>some <n/> text</node1> <node2>more-text</node2></node>' \| perl -e 'use XML::CompactTree::XS; use XML::LibXML::Reader; $reader = XML::LibXML::Reader->new(IO => STDIN); $data = XML::CompactTree::XS::readSubtreeToPerl( $reader, XCT_DOCUMENT_ROOT \| XCT_IGNORE_COMMENTS \| XCT_LINE_NUMBERS ); print "node=\x27".$data->[2]->[0]->[5]->[1]->[1]."\x27, type=".$data->[2]->[0]->[5]->[1]->[0]."\n"'
				897
				898
				899	Example: '... <head type="main"><s>Campagne in Frankreich</s></head><head type="sub"> <s>1792</s> ...'
				900
				901	Two text-nodes should normally be separated by a blank. In the above example, that would be the 2 text-nodes
				902	'Campagne in Frankreich' and '1792', which are separated by the whitespace-node ' ' (see [2]).
				903
				904	The text-node 'Campagne in Frankreich' leads to the setting of '$add_one' to 1, so that when opening the 2nd 'head'-tag,
				905	its from-index gets set to the correct start-index of '1792' (and not to the start-index of the whitespace-node ' ').
				906
				907	The assumption here is, that in most cases there _is_ a whitespace node between 2 text-nodes. The below code fragment
				908	enables a way, to check, if this really _was_ the case for the last 2 'non-tag'-nodes, when closing a tag:
				909
				910	When a whitespace-node is read, its from-index is stored as a hash-key (in %ws), to state that it belongs to a ws-node.
				911	So when closing a tag, it can be checked, if the previous 'non-tag'-node (text or whitespace), which is the one before
				912	the last read 'non-tag'-node, was actually _not_ a ws-node, but instead a text-node. In that case, the from-value of
				913	the last read 'non-tag'-node has to be corrected (see [1]).
				914
				915	For whitespace-nodes $add_one is set to 0, so when opening the next tag (in the above example the 2nd 's'-tag), no
				916	additional 1 is added (because this was already done by the whitespace-node itself when incrementing the variable $pos).
				917
				918	[1]
				919	Now, what happens, when 2 text-nodes are _not_ separated by a whitespace-node (e.g.: <w>Augen<c>,</c></w>)?
				920	In this case, the falsely increased from-value has to be decreased again by 1 when closing the enclosing tag
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	921	(see above code fragment '... not exists $ws{ $from - 1 } ...').
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	922
				923	[2]
				924	In both of the examples '<w>fu</w> <w>bar</w>' and '<w>fu</w><w> </w><w>bar</w>', the ' ' is handled as a
				925	whitespace-node (XML_READER_TYPE_SIGNIFICANT_WHITESPACE).
				926
				927	The from-index of the 2nd w-tag in the second example refers to 'bar', which may not have been the intention
				928	(even though '<w> </w>' doesn't make a lot of sense). TODO: could this be a bug?
				929
				930	Empty tags also cling to the next text-token - e.g. in '<w>tok1</w> <w>tok2</w><a><b/></a> <w>tok3</w>' the from-
				931	and to-indices for the tags 'a' and 'b' are both 12, which is the start-index of the token 'tok3'.
				932
				933
				934	## Notes on whitespace fixing
				935
				936	The idea for the below code fragment was to fix (recreate) missing whitespace in a poorly created corpus, in which linebreaks were inserted
				937	into the text with the addition that maybe (or not) whitespace before those linebreaks was unintentionally stripped.
				938
				939	It soon turned out, that it was best to suggest considering just avoiding linebreaks and putting all primary text tokens into one line (see
				940	example further down and notes on 'Input restrictions' in the manpage).
				941
				942	Somehow an old first very poor approach remained, which is not stringent, but also doesn't affect one-line text.
				943
				944	Examples (how primary text with linebreaks would be converted by below code):
				945
				946	'...<w>end</w>\n<w>.</w>...' -> '...<w>end</w> <w>.</w>...'
				947	'...<w>,</w>\n<w>this</w>\n<w>is</w>\n<w>it</w>\n<w>!</w>...' -> '<w>,</w> <w>this</w> <w>is</w> <w>it</w> <w>!</w>'.
				948
				949	Blanks are inserted before the 1st character:
				950
				951	NOTE: not stringent ('...' stands for text):
				952
				953	beg1............................end1 => no blank before 'beg1'
				954	beg2....<pb/>...................end2 => no blank before 'beg2'
				955	beg3....<info attr1="val1"/>....end3 => no blank before 'beg3'
				956	beg4....<test>ok</test>.........end4 => blank before 'beg4'
				957
				958	=> beg1....end1beg2...<pb/>...end2beg3....<info attr1="val1"/>....end3 beg4...<test>ok</test>....end4
				959	^
				960	\|_blank between 'end3' and 'beg4'
				961
				962
				963	## Notes on segfault prevention
				964
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	965	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	966	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				967	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				968	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
				969	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.