Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 574040756323ea211a7417d9fac4781a694a8162 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	14	use FindBin;
				15	BEGIN {
				16	unshift @INC, "$FindBin::Bin/../lib";
				17	};
				18
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	19	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	20	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	21	use KorAP::XML::TEI::Tokenizer::Conservative;
				22	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Header;
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	25	use KorAP::XML::TEI::Inline;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	26
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	27	eval {
				28	require KorAP::XML::TEI::Tokenizer::KorAP;
				29	1;
				30	};
Peter Harders	1c5ce15	2020-07-22 18:02:50 +0200	[diff] [blame]	31
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	32	our $VERSION = '1.01';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	33
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	34	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				35
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	36	use constant {
				37	# Set to 1 for minimal more debug output (no need to be parametrized)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	38	DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	39	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	40
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	41	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	42	GetOptions(
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	43	'root|r=s' => \(my $root_dir = '.'),
				44	'input|i=s' => \(my $input_fname = ''),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	45	'tokenizer-call|tc=s' => \(my $tokenizer_call),
				46	'tokenizer-korap|tk' => \(my $tokenizer_korap),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	47	'tokenizer-internal|ti' => \(my $tokenizer_intern),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	48	'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
				49	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
				50	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
				51	'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	52	'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	53	'base-foundry=s' => \(my $base_dir = 'base'),
				54	'data-file=s' => \(my $data_file = 'data'),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	55	'header-file=s' => \(my $header_file = 'header'),
				56	'tokens-file=s' => \(my $tokens_file = 'tokens'),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	57	'log|l=s' => \(my $log_level = 'notice'),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	58	'help|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	59	pod2usage(
				60	-verbose => 99,
				61	-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
				62	-msg => $VERSION_MSG,
				63	-output => '-'
				64	)
				65	},
				66	'version|v' => sub {
				67	pod2usage(
				68	-verbose => 0,
				69	-msg => $VERSION_MSG,
				70	-output => '-'
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	71	);
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	72	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	73	);
				74
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	75
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	76	# Establish logger
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	77	binmode(STDERR, ':encoding(UTF-8)');
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	78	Log::Any::Adapter->set('Stderr', log_level => $log_level);
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	79	$log->notice('Debugging is activated') if DEBUG;
				80
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	81
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	82	# tag (without attributes), which contains the primary text
				83	my $_TEXT_BODY = 'text';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	84	# optional
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	85
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	86	# TODO: IDS-specific (and redundant)
				87	my $_HEADER_TAG = 'idsHeader';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	88
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	89
				90	# Define tokenizers
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	91	if ($use_tokenizer_sentence_splits && !$tokenizer_korap) {
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	92	die $log->fatal(
				93	'Sentence splitting is currently only supported by KorAP tokenizer ' .
				94	'(use -tk to activate it)'
				95	);
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	96	};
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	97
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	98	# Remember to skip certain inline tags
				99	my %skip_inline_tags = ();
				100	if ($skip_inline_tags_str) {
				101	foreach (split /\s*,\s*/, $skip_inline_tags_str) {
				102	$skip_inline_tags{$_} = 1;
				103	};
				104	};
				105
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	106	# External tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	107	my $ext_tok;
				108	if ($tokenizer_call) {
				109	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
				110	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	111
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	112	elsif ($tokenizer_korap) {
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	113	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	114	if ($use_tokenizer_sentence_splits) {
				115	$skip_inline_tags{s} = 1;
				116	};
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	117	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	118
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	119
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	120	# Internal tokenization
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	121	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				122	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	123
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	124
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	125	# Name of the directory and the file containing all inline structure informations
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	126	# except for $_TOKENS_TAG information
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	127	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	128
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	129	# Name of the directory and the file containing all inline token informations
				130	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				131	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	132
Akron	4e3c7e3	2021-02-18 15:19:53 +0100	[diff] [blame]	133	# Handling inline annotations (inside $_TOKENS_TAG)
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	134	my $_INLINE_ANNOT = $ENV{KORAPXMLTEI_INLINE} ? 1 : 0;
				135
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	136	# Initialize zipper
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	137	my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	138
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	139	# text directory (below $root_dir)
				140	my $dir = '';
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	141
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	142	# Escaped version of text id
				143	my $text_id_esc;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	144
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	145	# Default encoding of the text
				146	my $input_enc = 'UTF-8';
				147
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	148	# text line (needed for whitespace handling)
				149	my $text_line = 0;
				150
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	151
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	152	# Input file handle (default: stdin)
				153	my $input_fh = *STDIN;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	154
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	155	if ($input_fname ne '') {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	156	unless (open($input_fh, '<', $input_fname)) {
				157	die $log->fatal("File '$input_fname' could not be opened.");
				158	};
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	159	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	160
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	161	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	162	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	163
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	164
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	165	# Create inline parser object
				166	my $inline = KorAP::XML::TEI::Inline->new(
				167	$skip_inline_tokens,
				168	\%skip_inline_tags
				169	);
				170
				171
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	172	# Reading input document
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	173	MAIN: while (<$input_fh>) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	174
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	175	# remove HTML (multi-line) comments (<!--...-->)
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	176	$_ = remove_xml_comments($input_fh, $_);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	177
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	178	# Set input encoding
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	179	if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	180	$input_enc = $2;
				181	next;
				182	};
				183
				184	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	185	$_ = replace_entities($_);
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	186
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	187	# Start of text body
				188	if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	189	my $suffix = $2;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	190
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	191	if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	192	die $log->fatal("input line number $.: " .
				193	"line with opening text-body tag '${_TEXT_BODY}' " .
				194	"contains additional information ... => Aborting (line=$_)");
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	195	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	196
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	197	# Text body data extracted from input document ($input_fh),
				198	# further processed by XML::LibXML::Reader
				199	my $text_buffer = '';
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	200
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	201	# Iterate over all lines in the text body
				202	while (<$input_fh>) {
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	203
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	204	$_ = remove_xml_comments($input_fh, $_);
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	205	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	206	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	207
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	208	# End of text body
Akron	b43b491	2021-02-25 10:31:11 +0100	[diff] [blame]	209	if ((my $pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	210
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	211	# write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	212
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	213	if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	214	die $log->fatal("input line number $.: " .
				215	"line with closing text-body tag '${_TEXT_BODY}'".
				216	" contains additional information ... => Aborting (line=$_)");
				217	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	218
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	219	if ($dir eq '') {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	220	$log->warn(
				221	"Maybe empty textSigle => skipping this text ...\n" .
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	222	'data=' . substr($inline->data->data, 0, 200)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	223	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	224	next MAIN;
				225	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	226
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	227	# Parse inline structure
				228	$inline->parse($text_id_esc, \$text_buffer);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	229
				230	if (DEBUG) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	231	$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	232	};
				233
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	234	my $data = $inline->data;
				235
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	236	# Write data.xml
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	237	$data->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	238	$zipper->new_stream("$dir/${data_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	239	$text_id_esc
				240	);
				241
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	242	# Tokenize with external tokenizer
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	243	if ($ext_tok) {
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	244
				245	# Tokenize and output
				246	$ext_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	247	$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	248	$text_id_esc
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	249	);
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	250
				251	if ($use_tokenizer_sentence_splits) {
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	252	$ext_tok->sentencize_from_previous_input($inline->structures);
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	253	};
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	254	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	255
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	256	# Tokenize with internal tokenizer
				257	if ($tokenizer_intern) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	258
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	259	# Tokenize and output
				260	$cons_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	261	$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	262	$text_id_esc
Akron	cc27d79	2021-02-24 12:32:20 +0100	[diff] [blame]	263	)->reset;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	264
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	265	$aggr_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	266	$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	267	$text_id_esc
Akron	cc27d79	2021-02-24 12:32:20 +0100	[diff] [blame]	268	)->reset;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	269	};
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	270
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	271	# ~ write structures ~
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	272	if (!$inline->structures->empty) {
				273	$inline->structures->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	274	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	275	$text_id_esc,
				276	2 # = structure serialization
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	277	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	278	};
				279
				280	# ~ write tokens ~
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	281	unless ($skip_inline_tokens || $inline->tokens->empty) {
				282	$inline->tokens->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	283	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	284	$text_id_esc,
				285	$_INLINE_ANNOT # Either 0 = tokens without inline or 1 = tokens with inline
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame^]	286	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	287	};
				288
				289	# reinit.
				290	$dir = '';
				291
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	292	next MAIN;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	293	};
				294
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	295
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	296	# ~ whitespace handling ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	297
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	298	# Fix whitespaces (see notes on whitespace fixing)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	299
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	300	# TODO:
				301	# Maybe it's best, to keep the stripping of whitespace and
				302	# to just remove the if-clause and to insert a blank by default
				303	# (with possibly an option on how newlines in primary text should
				304	# be handled (stripped or replaced by a whitespace)).
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	305
				306	# Remove consecutive whitespace at beginning and end (mostly one newline)
				307	s/^\s+//; s/\s+$//;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	308
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	309	# NOTE:
				310	# this is only relevant, if a text consists of more than one line
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	311
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	312	# TODO:
				313	# find a better solution, or create a warning, if a text has more
				314	# than one line ($text_line > 1)
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	315
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	316	# TODO:
				317	# do testing with 2 different corpora
				318	# (one with only one-line texts, the other with several lines per text)
				319
				320	# line contains at least one tag with at least one character contents
				321	if (m/<[^>]+>[^<]/) {
				322
				323	# Increment counter for text lines
				324	$text_line++;
				325
				326	# insert blank before 1st character
Akron	6e2b125	2021-02-24 12:41:15 +0100	[diff] [blame]	327	# (for 2nd line and consecutive lines)
				328	$_ = ' ' . $_ if $text_line > 1;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	329	}
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	330
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	331	# add line to buffer
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	332	$text_buffer .= $_;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	333	};
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	334	}
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	335
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	336	# Start of header section
				337	elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	338
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	339	my $content = "$2\n";
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	340
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	341	if ($1 !~ /^\s*$/) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	342	die $log->fatal(
				343	"input line number $.: " .
				344	'line with opening header tag is not in expected format ... ' .
				345	"=> Aborting (line=$_)");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	346	};
				347
				348	# Parse header
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	349	my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	350
				351	# Header was parseable
				352	if ($header) {
				353
				354	# Write header to zip
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	355	my $file = $header->dir . '/' . $header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	356
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	357	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	358
				359	$header->to_zip($zipper->new_stream($file));
				360
				361	# Header is for text level
				362	if ($header->type eq 'text') {
				363
				364	# Remember dir and sigles
				365	$dir = $header->dir;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	366	$text_id_esc = $header->id_esc;
				367
				368	# log output for seeing progression
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	369	$log->notice("$0: text_id=$text_id_esc");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	370
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	371	# Reset counter for text lines
				372	# (needed for whitespace handling)
				373	$text_line = 0;
				374	};
				375	};
				376	};
				377	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	378
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	379	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	380
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	381	$ext_tok->close if $ext_tok;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	382
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	383	close $input_fh;
				384
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	385
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	386	__END__
				387
				388	=pod
				389
				390	=encoding utf8
				391
				392	=head1 NAME
				393
				394	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				395
				396	=head1 SYNOPSIS
				397
				398	cat corpus.i5.xml | tei2korapxml > corpus.korapxml.zip
				399
				400	=head1 DESCRIPTION
				401
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	402	C<tei2korapxml> is a script to convert TEI P5 and
				403	L<I5|https://www1.ids-mannheim.de/kl/projekte/korpora/textmodell.html>
				404	based documents to the
				405	L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
				406	If no specific input is defined, data is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	407	read from C<STDIN>. If no specific output is defined, data is written
				408	to C<STDOUT>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	409
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	410	This program is usually called from inside another script.
				411
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	412	=head1 FORMATS
				413
				414	=head2 Input restrictions
				415
				416	=over 2
				417
				418	=item
				419
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	420	TEI P5 formatted input with certain restrictions:
				421
				422	=over 4
				423
				424	=item
				425
				426	B<mandatory>: text-header with integrated textsigle, text-body
				427
				428	=item
				429
				430	B<optional>: corp-header with integrated corpsigle,
				431	doc-header with integrated docsigle
				432
				433	=back
				434
				435	=item
				436
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	437	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	438	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	439	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	440	into blanks between 2 tokens could lead to additional blanks,
				441	where there should be none (e.g.: punctuation characters like C<,> or
				442	C<.> should not be separated from their predecessor token).
				443	(see also code section C<~ whitespace handling ~>).
				444
				445	=back
				446
				447	=head2 Notes on the output
				448
				449	=over 2
				450
				451	=item
				452
				453	zip file output (default on C<stdout>) with utf8 encoded entries
				454	(which together form the KorAP-XML format)
				455
				456	=back
				457
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	458	=head1 INSTALLATION
				459
				460	C<tei2korapxml> requires L<libxml2-dev> bindings to build. When
				461	these bindings are available, the preferred way to install the script is
				462	to use L<cpanm|App::cpanminus>.
				463
				464	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				465
				466	In case everything went well, the C<tei2korapxml> tool will
				467	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	468
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	469	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				470
				471	=head1 OPTIONS
				472
				473	=over 2
				474
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	475	=item B<--root|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	476
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	477	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	478
				479	=item B<--help|-h>
				480
				481	Print help information.
				482
				483	=item B<--version|-v>
				484
				485	Print version information.
				486
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	487	=item B<--tokenizer-call|-tc>
				488
				489	Call an external tokenizer process, that will tokenize
				490	a single line from STDIN and outputs one token per line.
				491
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	492	=item B<--tokenizer-korap|-tk>
				493
				494	Use the standard KorAP/DeReKo tokenizer.
				495
Akron	6d7b8e4	2020-09-29 07:37:41 +0200	[diff] [blame]	496	=item B<--tokenizer-internal|-ti>
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	497
				498	Tokenize the data using two embedded tokenizers,
				499	that will take an I<Aggressive> and a I<conservative>
				500	approach.
				501
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	502	=item B<--skip-inline-tokens>
				503
				504	Boolean flag indicating that inline tokens should not
				505	be processed. Defaults to false (meaning inline tokens will be processed).
				506
Akron	ca70a1d	2021-02-25 16:21:31 +0100	[diff] [blame]	507	=item B<--skip-inline-tags> <tags>
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	508
				509	Expects a comma-separated list of tags to be ignored when the structure
				510	is parsed. Content of these tags however will be processed.
				511
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	512	=item B<--inline-tokens> <foundry>#[<file>]
				513
				514	Define the foundry and file (without extension)
				515	to store inline token information in.
				516	If L</KORAPXMLTEI_INLINE> is set, this will contain
				517	annotations as well.
				518	Defaults to C<tokens> and C<morpho>.
				519
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	520	=item B<--inline-structures> <foundry>#[<file>]
				521
				522	Define the foundry and file (without extension)
				523	to store inline structure information in.
				524	Defaults to C<struct> and C<structure>.
				525
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	526	=item B<--base-foundry> <foundry>
				527
				528	Define the base foundry to store newly generated
				529	token information in.
				530	Defaults to C<base>.
				531
				532	=item B<--data-file> <file>
				533
				534	Define the file (without extension)
				535	to store primary data information in.
				536	Defaults to C<data>.
				537
				538	=item B<--header-file> <file>
				539
				540	Define the file name (without extension)
				541	to store header information on
				542	the corpus, document, and text level in.
				543	Defaults to C<header>.
				544
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	545	=item B<--use-tokenizer-sentence-splits\|-s>
				546
				547	Replace existing with, or add new, sentence boundary information
				548	provided by the KorAP tokenizer (currently supported only).
				549
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	550	=item B<--tokens-file> <file>
				551
				552	Define the file (without extension)
				553	to store generated token information in
				554	(either from the KorAP tokenizer or an externally called tokenizer).
				555	Defaults to C<tokens>.
				556
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	557	=item B<--log|-l>
				558
				559	Loglevel for I<Log::Any>. Defaults to C<notice>.
				560
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	561	=back
				562
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	563	=head1 ENVIRONMENT VARIABLES
				564
				565	=over 2
				566
				567	=item B<KORAPXMLTEI_DEBUG>
				568
				569	Activate minimal debugging.
				570	Defaults to C<false>.
				571
				572	=item B<KORAPXMLTEI_INLINE>
				573
				574	Process inline annotations, if present.
				575	Defaults to C<false>.
				576
				577	=back
				578
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	579	=head1 COPYRIGHT AND LICENSE
				580
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	581	Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	582
				583	Author: Peter Harders
				584
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	585	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	586
				587	L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
				588	Corpus Analysis Platform at the
				589	L<Leibniz Institute for the German Language (IDS)|http://ids-mannheim.de/>,
				590	member of the
				591	L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
				592
				593	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	594	L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	595
				596	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	597
				598	# NOTES
				599
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	600	## Notes on segfault prevention
				601
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	602	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	603	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				604	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				605	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
				606	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.