Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 418408e2b8253ea0badd5192da8c1527bce9fd55 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	14	use FindBin;
				15	BEGIN {
				16	unshift @INC, "$FindBin::Bin/../lib";
				17	};
				18
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	19	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	20	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	21	use KorAP::XML::TEI::Tokenizer::Conservative;
				22	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Header;
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	25	use KorAP::XML::TEI::Inline;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	26
# Version of this script. It is compared against --required-version
# and against the version of the KorAP tokenizer wrapper below.
our $VERSION = '2.6.0';

# Banner shown by --help, --version and the usage output
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set KORAPXMLTEI_DEBUG=1 for minimal more debug output
# (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# The environment switch still works (it flips the default of
# --skip-inline-token-annotations), but users are asked to migrate.
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};

# Inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;

# Inline dependencies won't be stored in the tokens file
my $inline_deps_exclusive = 0;
				45
# Parse options from the command line.
#
# Every option is bound to a freshly declared lexical that carries its
# default value. In a Getopt::Long specification the long name and its
# aliases are separated by a plain '|'.
GetOptions(
  'root|r=s'                        => \(my $root_dir = '.'),
  'input|i=s'                       => \(my $input_fname = ''),
  'output|o=s'                      => \(my $output_fname = ''),
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'no-tokenizer'                    => \(my $no_tokenizer),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s'                 => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'             => \(my $inline_structures = 'struct#structure'),
  'inline-dependencies=s'           => \(my $inline_dependencies),
  'skip-inline-tokens'              => \(my $skip_inline_tokens = 0),

  # Negatable flag; the deprecated KORAPXMLTEI_INLINE environment
  # variable only changes its default.
  'skip-inline-token-annotations!'  => \(
    my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
  'skip-inline-tags=s'              => \(my $skip_inline_tags_str = ''),
  'base-foundry=s'                  => \(my $base_dir = 'base'),
  'data-file=s'                     => \(my $data_file = 'data'),
  'header-file=s'                   => \(my $header_file = 'header'),
  'tokens-file=s'                   => \(my $tokens_file = 'tokens'),
  'xmlid-to-textsigle|x=s'          => \(my $xmlid_to_textsigle = ''),
  'log|l=s'                         => \(my $log_level = 'notice'),
  'required-version|rv=s'           => \(my $required_version),

  # A single dash as argument means: read from STDIN
  ''                                => \(my $stdio),

  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
				87
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	88
# Establish logger (log output goes to STDERR, encoded as UTF-8)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# Abort unless the version requested with --required-version is exactly
# the version of this script (surrounding whitespace is tolerated).
if ($required_version) {

  # Capture in list context instead of reading $1 afterwards: after a
  # failed match $1 is not reset and may hold a stale value.
  my ($wanted) = $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;

  if (!defined $wanted || $wanted ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};


# Optional rewrite rule for xml:id attributes, passed as
# 'PATTERN@REPLACEMENT' via --xmlid-to-textsigle.
# $what becomes the compiled pattern, $with the replacement text.
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
  ($what, $with) = split('@', $xmlid_to_textsigle);

  # No '@' given: replace the match with nothing instead of
  # interpolating an undefined value later on.
  $with //= '';
  $what = qr!$what!;
};
				109
# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# Remember to skip certain inline tags.
# --skip-inline-tags takes a comma separated list of tag names;
# whitespace around the commas is ignored.
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
  $skip_inline_tags{$_} = 1 for split /\s*,\s*/, $skip_inline_tags_str;
};
				121
# Choose the tokenizer that produces the base tokenization.
# $ext_tok stays undefined when only the internal tokenizers or no
# tokenizer at all are requested.

# External tokenization
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The wrapper is only loaded on demand. A load failure is reported
  # with its reason instead of surfacing as a version mismatch with an
  # undefined version.
  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
    $log->error("Unable to load KorAP-Tokenizer: $@");
    exit(1);
  };

  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // 'undefined';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};

# Sentence boundaries come from the tokenizer, so inline <s> tags
# are skipped.
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};

# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	158
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	159
# Every --inline-* option has the form 'DIRECTORY#FILE'. The default
# file name is appended before splitting, so that it is used whenever
# the option value carries no '#FILE' part.

# Directory and file for all inline structure information
# (everything except the inline tokens)
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';

# Directory and file for all inline token information
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';

# A leading '!' marks the tokens as exclusive: it is stripped from the
# directory name and remembered in the flag.
if ($_tokens_dir =~ s/^!//) {
  $inline_tokens_exclusive = 1;
};


# Directory and file for inline dependency information
# (only if --inline-dependencies was passed)
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';

  # From here on the option only serves as a boolean switch
  $inline_dependencies = 1;

  # Same '!' convention as for the tokens
  if ($_dep_dir && $_dep_dir =~ s/^!//) {
    $inline_deps_exclusive = 1;
  };
};
				184
				185
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);

# text directory (below $root_dir)
my $dir = '';

# Escaped version of text id
my $text_id_esc;

# Default encoding of the text
my $input_enc = 'UTF-8';

# text line (needed for whitespace handling)
my $text_line = 0;


# Input file handle (default: stdin)
my $input_fh;

# Single dash was set
if ($stdio) {
  $input_fh = *STDIN;
}

# Input flag was passed
elsif ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# No input to process: print a short usage and leave
else {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
  exit;
};

# Read raw bytes; every line is decoded explicitly with the encoding
# announced in the XML declaration.
# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;


# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
  $skip_inline_tokens,
  \%skip_inline_tags,
  $inline_tokens_exclusive,
  $inline_dependencies
);
				239
				240
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	241	# Reading input document
# Reading input document
#
# The input is processed line by line. Besides the XML declaration
# (which sets the input encoding), three kinds of lines trigger work:
#
#   - the opening tag of the text body: all following lines up to the
#     closing tag are collected, parsed and written to the zip archive
#   - a <TEI> tag with an xml:id: the id is converted into the text sigle
#   - the opening tag of an <idsHeader> or <teiHeader>: the header is
#     parsed and written to the zip archive
MAIN: while (<$input_fh>) {

  # remove HTML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding.
  # The quote character is captured, so the value ends at the matching
  # quote. (A backreference does not work inside a character class.)
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
    my ($prefix, $suffix) = ($1, $2);

    # The opening tag has to stand on a line of its own
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # The closing tag has to stand on a line of its own as well
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # Without a text directory there is no place to store the text
        if ($dir eq '') {
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($inline->data->data, 0, 200)
            );
          next MAIN;
        };

        # Parse inline structure
        $inline->parse($text_id_esc, \$text_buffer);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        my $data = $inline->data;

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($inline->structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        unless ($inline->structures->empty) {
          $inline->structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $inline->tokens->empty) {
          $inline->tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            # 0 = tokens without inline annotations,
            # 1 = tokens with inline annotations,
            # 4 = passed instead of 1 when dependencies are exclusive
            ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
          );
        };

        # ~ write dependencies ~
        # The target directory and file are only defined if
        # --inline-dependencies was passed, so nothing is written otherwise.
        if ($inline_dependencies && !$inline->dependencies->empty) {
          $inline->dependencies->to_zip(
            $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
            $text_id_esc,
            3 # = dependency serialization
          );
        };


        # reinit.
        $dir = '';

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one non-tag character
      if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };
  }

  # <TEI> tag with an xml:id: derive text directory and text sigle
  elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
    my $leadin = $1;
    my $id     = $3;
    my $sigle  = $id;

    if ($what) {
      my $replacement = $with // '';

      # The replacement may refer to capture groups ($1, $2, ...), hence
      # the string eval, e.g.
      #   s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
      # The substitution works on $sigle, so $_ keeps the input line for
      # the error messages below.
      # NOTE(review): pattern and replacement come unescaped from the
      # command line (--xmlid-to-textsigle) - only pass trusted values.
      eval "\$sigle =~ s|$what|$replacement|;";
      $log->warn("Unable to convert text id `${id}': $@") if $@;

      $log->debug("Converted text id `$id' to sigle `$sigle'");
    };
    $sigle =~ s/\./-/g;

    # A sigle consists of corpus, document and text part
    my @parts = split(/[\/_]/, $sigle);
    if (@parts != 3) {
      die $log->fatal(
        "input line number $.: " .
          "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
          "=> Aborting (line=$_)");
    };

    $dir = join("/", @parts);
    $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
    $log->notice("$0: text_id=$text_id_esc");

    # Only whitespace may precede the tag
    if ($leadin !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };
  }

  # Start of header section
  elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
    my ($leadin, $content) = ($1, "$2\n");

    # Only whitespace may precede the tag
    if ($leadin !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };

    # Parse header (the remaining header lines are read from $input_fh)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};

# Finish the archive and release tokenizer and input
$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;
				498
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	499
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	500	__END__
				501
				502	=pod
				503
				504	=encoding utf8
				505
				506	=head1 NAME
				507
				508	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				509
				510	=head1 SYNOPSIS
				511
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	512	cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	513
				514	=head1 DESCRIPTION
				515
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	516	C<tei2korapxml> is a script to convert TEI P5 and
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	517	L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	518	based documents to the
				519	L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	520
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	521	This program is usually called from inside another script.
				522
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	523	=head1 FORMATS
				524
				525	=head2 Input restrictions
				526
				527	=over 2
				528
				529	=item
				530
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	531	TEI P5 formatted input with certain restrictions:
				532
				533	=over 4
				534
				535	=item
				536
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	537	B<mandatory>: text-header with integrated textsigle
				538	(or convertible identifier), text-body
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	539
				540	=item
				541
				542	B<optional>: corp-header with integrated corpsigle,
				543	doc-header with integrated docsigle
				544
				545	=back
				546
				547	=item
				548
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	549	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	550	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	551	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	552	into blanks between 2 tokens could lead to additional blanks,
				553	where there should be none (e.g.: punctuation characters like C<,> or
				554	C<.> should not be separated from their predecessor token).
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	555	(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	556
Akron	940ca6f	2021-10-11 12:38:39 +0200	[diff] [blame]	557	=item
				558
				559	Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
				560	need to be defined in the same line as the header tag.
				561
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	562	=back
				563
				564	=head2 Notes on the output
				565
				566	=over 2
				567
				568	=item
				569
				570	zip file output (default on C<stdout>) with utf8 encoded entries
				571	(which together form the KorAP-XML format)
				572
				573	=back
				574
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	575	=head1 INSTALLATION
				576
Akron	d26319b	2023-01-12 15:34:41 +0100	[diff] [blame]	577	C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietz	e83a4e9	2021-03-16 20:51:26 +0100	[diff] [blame]	578	When these requirements are met, the preferred way to install the script is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	579	to use L<cpanm\|App::cpanminus>.
				580
				581	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				582
				583	In case everything went well, the C<tei2korapxml> tool will
				584	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	585
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	586	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				587
				588	=head1 OPTIONS
				589
				590	=over 2
				591
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	592	=item B<--input\|-i>
				593
				594	The input file to process. If no specific input is defined and a single
				595	dash C<-> is passed as an argument, data is read from C<STDIN>.
				596
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	597	=item B<--output\|-o>
				598
				599	The output zip file to be created. If no specific output is defined,
				600	data is written to C<STDOUT>.
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	601
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	602	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	603
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	604	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	605
				606	=item B<--help\|-h>
				607
				608	Print help information.
				609
				610	=item B<--version\|-v>
				611
				612	Print version information.
				613
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	614	=item B<--tokenizer-korap\|-tk>
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	615
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	616	Use the standard KorAP/DeReKo tokenizer.
				617
				618	=item B<--tokenizer-internal\|-ti>
				619
				620	Tokenize the data using two embedded tokenizers,
				621	that will take an I<aggressive> and a I<conservative>
				622	approach.
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	623
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	624	=item B<--tokenizer-call\|-tc>
				625
				626	Call an external tokenizer process, that will tokenize
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	627	from STDIN and output the offsets of all tokens.
				628
				629	Texts are separated using C<\x04\n>. The external process
				630	should add a new line per text.
				631
				632	If the L</--use-tokenizer-sentence-splits> option is activated,
				633	sentences are marked by offset as well in new lines.
				634
				635	To use L<Datok\|https://github.com/KorAP/Datok> including sentence
				636	splitting, call C<tei2korapxml> as follows:
				637
				638	$ cat corpus.i5.xml \| tei2korapxml -s \
				639	$ -tc 'datok tokenize \
				640	$ -t ./tokenizer.matok \
				641	$ -p --newline-after-eot --no-sentences \
				642	$ --no-tokens --sentence-positions -' - \
				643	$ > corpus.korapxml.zip
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	644
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	645	=item B<--no-tokenizer>
				646
				647	Boolean flag indicating that no tokenizer should be used.
				648	This is meant to ensure that by default a final token layer always
				649	exists.
				650	If a separate tokenizer is chosen, this flag is ignored.
				651
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	652	=item B<--skip-inline-tokens>
				653
				654	Boolean flag indicating that inline tokens should not
				655	be processed. Defaults to false (meaning inline tokens will be processed).
				656
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	657	=item B<--skip-inline-token-annotations>
				658
				659	Boolean flag indicating that inline token annotations should not
				660	be processed. Defaults to true (meaning inline token annotations
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame^]	661	won't be processed). Can be negated with
				662	C<--no-skip-inline-token-annotations>.
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	663
Akron	ca70a1d	2021-02-25 16:21:31 +0100	[diff] [blame]	664	=item B<--skip-inline-tags> <tags>
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	665
				666	Expects a comma-separated list of tags to be ignored when the structure
				667	is parsed. Content of these tags however will be processed.
				668
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	669	=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
				670
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	671	Expects a regular replacement expression (separated by B<@> between the
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	672	search and the replacement) to convert text id attributes to text sigles
				673	with three parts (separated by B</>).
				674
				675	Example:
				676
				677	tei2korapxml \
				678	--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
				679	-tk - < t/data/icc_german_sample.p5.xml
				680
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	681	Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
				682	sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	683
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	684	=item B<--inline-tokens> <foundry>#[<file>]
				685
				686	Define the foundry and file (without extension)
				687	to store inline token information in.
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	688	Unless C<--skip-inline-token-annotations> is set,
				689	this will contain annotations as well.
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	690	Defaults to C<tokens> and C<morpho>.
				691
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	692	The inline token data will also be stored in the
				693	inline structures file (see I<--inline-structures>),
				694	unless the inline token foundry is prepended
				695	by an B<!> exclamation mark, indicating that inline
				696	tokens are stored exclusively in the inline tokens
				697	file.
				698
				699	Example:
				700
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame^]	701	tei2korapxml --no-tokenizer --inline-tokens \
				702	'!gingko#morpho' < data.i5.xml > korapxml.zip
				703
				704	=item B<--inline-dependencies> <foundry>#[<file>]
				705
				706	Define the foundry and file (without extension)
				707	to store inline dependency information in.
				708	Defaults to the layer of C<dependency> and
				709	will be ignored if not set (which means, dependency
				710	attributes will be stored in the inline tokens file,
				711	if not skipped).
				712
				713	The dependency data will also be stored in the
				714	inline token file (see I<--inline-tokens>),
				715	unless the inline dependencies foundry is prepended
				716	by an B<!> exclamation mark, indicating that inline
				717	dependency data is stored exclusively in the inline
				718	dependencies file.
				719
				720	Example:
				721
				722	tei2korapxml --no-tokenizer --inline-dependencies \
				723	'gingko#dependency' < data.i5.xml > korapxml.zip
				724
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	725
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	726	=item B<--inline-structures> <foundry>#[<file>]
				727
				728	Define the foundry and file (without extension)
				729	to store inline structure information in.
				730	Defaults to C<struct> and C<structures>.
				731
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	732	=item B<--base-foundry> <foundry>
				733
				734	Define the base foundry to store newly generated
				735	token information in.
				736	Defaults to C<base>.
				737
				738	=item B<--data-file> <file>
				739
				740	Define the file (without extension)
				741	to store primary data information in.
				742	Defaults to C<data>.
				743
				744	=item B<--header-file> <file>
				745
				746	Define the file name (without extension)
				747	to store header information on
				748	the corpus, document, and text level in.
				749	Defaults to C<header>.
				750
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	751	=item B<--use-tokenizer-sentence-splits\|-s>
				752
				753	Replace existing sentence boundary information with, or add new, sentence boundary information
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	754	provided by the tokenizer.
				755	Currently KorAP-tokenizer and certain external tokenizers support
				756	these boundaries.
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	757
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	758	=item B<--tokens-file> <file>
				759
				760	Define the file (without extension)
				761	to store generated token information in
				762	(either from the KorAP tokenizer or an externally called tokenizer).
				763	Defaults to C<tokens>.
				764
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	765	=item B<--log\|-l>
				766
				767	Loglevel for I<Log::Any>. Defaults to C<notice>.
				768
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	769	=back
				770
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	771	=head1 ENVIRONMENT VARIABLES
				772
				773	=over 2
				774
				775	=item B<KORAPXMLTEI_DEBUG>
				776
				777	Activate minimal debugging.
				778	Defaults to C<false>.
				779
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	780	=back
				781
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	782	=head1 COPYRIGHT AND LICENSE
				783
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame^]	784	Copyright (C) 2021-2024, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	785
				786	Author: Peter Harders
				787
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	788	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	789
				790	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				791	Corpus Analysis Platform at the
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	792	L<Leibniz Institute for the German Language (IDS)\|https://www.ids-mannheim.de/>,
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	793	member of the
				794	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				795
				796	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	797	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	798
				799	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	800
				801	# NOTES
				802
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	803	## Notes on segfault prevention
				804
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	805	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	806	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				807	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				808	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				809	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.