Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: f6cbe5a531918b76796f55c91549415a27b03b67 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	9	use KorAP::XML::TEI qw(increase_auto_textsigle);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	10
				11	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	12
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	13	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	15	use FindBin;
				16	BEGIN {
				17	unshift @INC, "$FindBin::Bin/../lib";
				18	};
				19
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	20	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	21	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	22	use KorAP::XML::TEI::Tokenizer::Conservative;
				23	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	25	use KorAP::XML::TEI::Header;
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	26	use KorAP::XML::TEI::Inline;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	27
Marc Kupietz	ef5dfd3	2026-03-05 10:02:47 +0100	[diff] [blame^]	28	our $VERSION = '2.7.1';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	29
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	30	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				31
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	32	use constant {
				33	# Set to 1 for minimal more debug output (no need to be parametrized)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	34	DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	35	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	36
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	37	if ($ENV{KORAPXMLTEI_INLINE}) {
				38	warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
				39	};
				40
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	41	# Inline tokens won't be stored in the structure file
				42	my $inline_tokens_exclusive = 0;
				43
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	44	# Inline dependencies won't be stored in the tokens file
				45	my $inline_deps_exclusive = 0;
				46
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	47	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	48	GetOptions(
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	49	'auto-textsigle\|A=s' => \(my $auto_textsigle = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	50	'root\|r=s' => \(my $root_dir = '.'),
				51	'input\|i=s' => \(my $input_fname = ''),
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	52	'output\|o=s' => \(my $output_fname = ''),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	53	'tokenizer-call\|tc=s' => \(my $tokenizer_call),
				54	'tokenizer-korap\|tk' => \(my $tokenizer_korap),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	55	'tokenizer-internal\|ti' => \(my $tokenizer_intern),
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	56	'no-tokenizer' => \(my $no_tokenizer),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	57	'use-tokenizer-sentence-splits\|s' => \(my $use_tokenizer_sentence_splits),
				58	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
				59	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	60	'inline-dependencies=s' => \(my $inline_dependencies),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	61	'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	62	'skip-inline-token-annotations!' => \(
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	63	my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	64	'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	65	'base-foundry=s' => \(my $base_dir = 'base'),
				66	'data-file=s' => \(my $data_file = 'data'),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	67	'header-file=s' => \(my $header_file = 'header'),
				68	'tokens-file=s' => \(my $tokens_file = 'tokens'),
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	69	'xmlid-to-textsigle\|x=s'=> \(my $xmlid_to_textsigle = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	70	'log\|l=s' => \(my $log_level = 'notice'),
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	71	'required-version\|rv=s' => \(my $required_version),
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	72	'progress\|p' => \(my $progress),
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	73	'' => \(my $stdio),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	74	'help\|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	75	pod2usage(
				76	-verbose => 99,
				77	-sections => 'NAME\|DESCRIPTION\|SYNOPSIS\|ARGUMENTS\|OPTIONS',
				78	-msg => $VERSION_MSG,
				79	-output => '-'
				80	)
				81	},
				82	'version\|v' => sub {
				83	pod2usage(
				84	-verbose => 0,
				85	-msg => $VERSION_MSG,
				86	-output => '-'
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	87	);
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	88	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	89	);
				90
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	91
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	92	# Establish logger
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	93	binmode(STDERR, ':encoding(UTF-8)');
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	94	Log::Any::Adapter->set('Stderr', log_level => $log_level);
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	95	$log->notice('Debugging is activated') if DEBUG;
				96
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	97
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	98	if ($required_version) {
Marc Kupietz	2475c95	2024-01-09 10:40:04 +0100	[diff] [blame]	99	$required_version =~ /^\s*(\d+\.\d+\.\d+(-TRIAL)?)\s*$/;
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	100	if (!$1 \|\| $1 ne $VERSION) {
				101	$log->error("Required version $required_version mismatches version $VERSION");
				102	exit(1);
				103	};
				104	};
				105
				106
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	107	my ($what, $with);
				108	if ($xmlid_to_textsigle ne '') {
				109	($what, $with) = split('@', $xmlid_to_textsigle);
				110	$what = qr!$what!;
				111	};
				112
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	113	if ($progress) {
				114	eval {
				115	require Time::Progress;
				116	1;
				117	} or do {
				118	$log->warn('Time::Progress not installed. Progress bar disabled.');
				119	$progress = 0;
				120	}
				121	};
				122
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	123	# tag (without attributes), which contains the primary text
				124	my $_TEXT_BODY = 'text';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	125	# optional
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	126
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	127	# Remember to skip certain inline tags
				128	my %skip_inline_tags = ();
				129	if ($skip_inline_tags_str) {
				130	foreach (split /\s*,\s*/, $skip_inline_tags_str) {
				131	$skip_inline_tags{$_} = 1;
				132	};
				133	};
				134
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	135	# External tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	136	my $ext_tok;
				137	if ($tokenizer_call) {
				138	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	139	$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	140	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	141
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	142	# KorAP tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	143	elsif ($tokenizer_korap) {
Akron	bd4281e	2022-03-28 08:31:40 +0200	[diff] [blame]	144	eval {
				145	require KorAP::XML::TEI::Tokenizer::KorAP;
				146	1;
				147	};
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	148
				149	my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;
				150	if ($korap_tok_ver ne $VERSION) {
				151	$log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
				152	exit(1);
				153	};
				154
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	155	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	156	}
				157
				158	# No internal tokenizer chosen
				159	elsif (!$tokenizer_intern && !$no_tokenizer) {
				160	$log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
				161	exit(1);
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	162	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	163
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	164	if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	165	$skip_inline_tags{s} = 1;
				166	};
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	167
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	168	# Internal tokenization
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	169	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				170	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	171
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	172
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	173	# Name of the directory and the file containing all inline structure information
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	174	# except for $_TOKENS_TAG information
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	175	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	176
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	177	# Name of the directory and the file containing all inline token information
				178	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				179	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	180
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	181	if (index($_tokens_dir, '!') == 0) {
				182	$_tokens_dir = substr($_tokens_dir, 1);
				183	$inline_tokens_exclusive = 1;
				184	};
				185
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	186
				187	my ($_dep_dir, $_dep_file);
				188	if ($inline_dependencies) {
				189	($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';
				190	$inline_dependencies = 1;
				191
				192	if ($_dep_dir && index($_dep_dir, '!') == 0) {
				193	$_dep_dir = substr($_dep_dir, 1);
				194	$inline_deps_exclusive = 1;
				195	};
				196	};
				197
				198
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	199	# Initialize zipper
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	200	my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	201
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	202	# text directory (below $root_dir)
				203	my $dir = '';
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	204
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	205	# Escaped version of text id
				206	my $text_id_esc;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	207
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	208	# Default encoding of the text
				209	my $input_enc = 'UTF-8';
				210
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	211	# text line (needed for whitespace handling)
				212	my $text_line = 0;
				213
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	214
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	215	# Input file handle (default: stdin)
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	216	my $input_fh;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	217
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	218	# Single dash was set
				219	if ($stdio) {
				220	$input_fh = *STDIN;
				221	}
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	222	# Input flag was passed
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	223	elsif (@ARGV \|\| $input_fname ne '') {
				224	unless ($input_fname ne '') {
				225	$input_fname = shift @ARGV;
				226	};
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	227	unless (open($input_fh, '<', $input_fname)) {
				228	die $log->fatal("File '$input_fname' could not be opened.");
				229	};
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	230	}
				231
				232	# No input to process
				233	else {
				234	pod2usage(
				235	-verbose => 99,
				236	-sections => 'NAME\|SYNOPSIS',
				237	-msg => $VERSION_MSG,
				238	-output => '-'
				239	);
				240	exit;
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	241	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	242
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	243	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	244	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	245
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	246
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	247	# Create inline parser object
				248	my $inline = KorAP::XML::TEI::Inline->new(
				249	$skip_inline_tokens,
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	250	\%skip_inline_tags,
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	251	$inline_tokens_exclusive,
				252	$inline_dependencies
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	253	);
				254
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	255	do {
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	256	my $p;
				257	if ($progress && $input_fname ne '') {
				258	my $file_size = -s $input_fname;
				259	if ($file_size) {
				260	$p = Time::Progress->new(min => 0, max => $file_size);
				261	$log->notice("Reading input document $input_fname (Size: $file_size bytes)");
				262	}
				263	} elsif ($input_fname ne '') {
				264	$log->notice("Reading input document $input_fname");
				265	};
				266
				267	my $i = 0;
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	268	MAIN:
				269	while (<$input_fh>) {
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	270
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	271	if ($p && ($i++ % 500 == 0)) {
				272	print STDERR $p->report("\r%20b %p ETA: %E", tell($input_fh));
				273	};
				274
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	275	# remove XML (multi-line) comments (<!--...-->)
				276	$_ = remove_xml_comments($input_fh, $_);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	277
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	278	# Set input encoding
				279	if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
				280	$input_enc = $2;
				281	next;
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	282	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	283
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	284	$_ = decode($input_enc, $_);
				285	$_ = replace_entities($_);
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	286
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	287	# Start of text body
				288	if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
				289	my $suffix = $2;
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	290
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	291	if ($1 !~ /^\s*$/ \|\| $suffix !~ /^\s*$/) {
				292	die $log->fatal("input line number $.: " .
				293	"line with opening text-body tag '${_TEXT_BODY}' " .
				294	"contains additional information ... => Aborting (line=$_)");
				295	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	296
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	297	# Text body data extracted from input document ($input_fh),
				298	# further processed by XML::LibXML::Reader
				299	my $text_buffer = '';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	300
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	301	# Iterate over all lines in the text body
				302	while (<$input_fh>) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	303
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	304	$_ = remove_xml_comments($input_fh, $_);
				305	$_ = decode($input_enc, $_);
				306	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	307
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	308	# End of text body
				309	if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {
				310
				311	# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
				312
Marc Kupietz	ff061ef	2026-03-05 09:59:35 +0100	[diff] [blame]	313	my $before = substr($_, 0, $pos);
				314	my $after = substr($_, length("</$_TEXT_BODY>") + $pos);
				315	my $before_check = $before;
				316	$before_check =~ s/<[^>]+>//g; # strip XML tags like </body>
				317	if (($before_check . $after) !~ /^\s*$/) {
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	318	die $log->fatal("input line number $.: " .
				319	"line with closing text-body tag '${_TEXT_BODY}'" .
				320	" contains additional information ... => Aborting (line=$_)");
				321	};
				322
Marc Kupietz	ff061ef	2026-03-05 09:59:35 +0100	[diff] [blame]	323	# Add any remaining content before </text> (e.g. </body>) to the buffer
				324	$before =~ s/^\s+//;
				325	$before =~ s/\s+$//;
				326	$text_buffer .= $before if $before ne '';
				327
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	328	if ($dir eq '') {
				329	$log->warn(
				330	"Maybe empty textSigle => skipping this text ...\n" .
				331	'data=' . substr($inline->data->data, 0, 200)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	332	);
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	333	next MAIN;
				334	};
				335
				336	# Parse inline structure
				337	$inline->parse($text_id_esc, \$text_buffer);
				338
				339	if (DEBUG) {
				340	$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
				341	};
				342
				343	my $data = $inline->data;
				344
				345	# Write data.xml
				346	$data->to_zip(
				347	$zipper->new_stream("$dir/${data_file}.xml"),
				348	$text_id_esc
				349	);
				350
				351	# Tokenize with external tokenizer
				352	if ($ext_tok) {
				353
				354	# Tokenize and output
				355	$ext_tok->tokenize($data->data)->to_zip(
				356	$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
				357	$text_id_esc
				358	);
				359
				360	if ($use_tokenizer_sentence_splits) {
				361	$ext_tok->sentencize_from_previous_input($inline->structures);
				362	};
				363	};
				364
				365	# Tokenize with internal tokenizer
				366	if ($tokenizer_intern) {
				367
				368	# Tokenize and output
				369	$cons_tok->tokenize($data->data)->to_zip(
				370	$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
				371	$text_id_esc
				372	)->reset;
				373
				374	$aggr_tok->tokenize($data->data)->to_zip(
				375	$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
				376	$text_id_esc
				377	)->reset;
				378	};
				379
				380	# ~ write structures ~
				381	unless ($inline->structures->empty) {
				382	$inline->structures->to_zip(
				383	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
				384	$text_id_esc,
				385	2 # = structure serialization
				386	);
				387	};
				388
				389	# ~ write tokens ~
				390	unless ($skip_inline_tokens \|\| $inline->tokens->empty) {
				391	$inline->tokens->to_zip(
				392	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
				393	$text_id_esc,
				394	# Either 0 = tokens without inline or
				395	# 1 = tokens with inline
				396	# !$skip_inline_token_annotations
				397	($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
				398	);
				399	};
				400
				401	# ~ write dependencies ~
				402	unless ($inline->dependencies->empty) {
				403	$inline->dependencies->to_zip(
				404	$zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
				405	$text_id_esc,
				406	3 # = dependency serialization
				407	);
				408	};
				409
				410
				411	# reinit.
				412	$dir = '';
				413
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	414	next MAIN;
				415	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	416
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	417
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	418	# ~ whitespace handling ~
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	419
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	420	# Fix whitespaces (see notes on whitespace fixing)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	421
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	422	# TODO:
				423	# Maybe it's best, to keep the stripping of whitespace and
				424	# to just remove the if-clause and to insert a blank by default
				425	# (with possibly an option on how newlines in primary text should
				426	# be handled (stripped or replaced by a whitespace)).
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	427
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	428	# Remove consecutive whitespace at beginning and end (mostly one newline)
				429	s/^\s+//;
				430	s/\s+$//;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	431
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	432	# NOTE:
				433	# this is only relevant, if a text consists of more than one line
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	434
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	435	# TODO:
				436	# find a better solution, or create a warning, if a text has more
				437	# than one line ($text_line > 1)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	438
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	439	# TODO:
				440	# do testing with 2 different corpora
				441	# (one with only one-line texts, the other with several lines per text)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	442
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	443	# line contains at least one non-tag character
				444	if (m/^[^<]*$/ \|\| m/(?:<[^>]+>[^<])\|(?:[^<]<[^>]+>)/) {
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	445
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	446	# Increment counter for text lines
				447	$text_line++;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	448
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	449	# insert blank before 1st character
				450	# (for 2nd line and consecutive lines)
				451	$_ = ' ' . $_ if $text_line > 1;
				452	}
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	453
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	454	# add line to buffer
				455	$text_buffer .= $_;
				456	};
				457	}
				458	elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
				459	my $leadin = $1;
				460	my $id = $3;
				461	my $sigle = $3;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	462
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	463	if ($what) {
				464	$_ = $id;
				465	eval "s\|$what\|$with\|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
				466	$sigle = $_;
				467	$log->debug("Converted text id `$id' to sigle `$sigle'");
				468	};
				469	$sigle =~ s/\./-/g;
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	470
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	471	my @parts = split(/[\/_]/, $sigle);
				472	if (@parts != 3) {
				473	die $log->fatal(
				474	"input line number $.: " .
				475	"ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
				476	"=> Aborting (line=$_)");
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	477	};
				478
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	479	$dir = join("/", @parts);
				480	$text_id_esc = "$parts[0]/$parts[1].$parts[2]";
				481	$log->notice("$0: text_id=$text_id_esc");
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	482
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	483	if ($leadin !~ /^\s*$/) {
				484	die $log->fatal(
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	485	"input line number $.: " .
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	486	'line with opening header tag is not in expected format ... ' .
				487	"=> Aborting (line=$_)");
				488	};
				489	}
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	490
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	491	# Start of header section
				492	elsif (m#^(.*)(\<(?:ids\|tei)Header.*)$#) {
				493	my $content = "$2\n";
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	494
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	495	if ($1 !~ /^\s*$/) {
				496	die $log->fatal(
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	497	"input line number $.: " .
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	498	'line with opening header tag is not in expected format ... ' .
				499	"=> Aborting (line=$_)");
				500	};
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	501
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	502	# Parse header
				503	my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
				504	if ($auto_textsigle) {
				505	$auto_textsigle = increase_auto_textsigle($auto_textsigle);
				506	$log->debug("Auto-incremented text sigle to $auto_textsigle");
				507	};
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	508
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	509	# Header was parseable
				510	if ($header) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	511
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	512	# Write header to zip
				513	my $file = $header->dir . '/' . $header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	514
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	515	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	516
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	517	$header->to_zip($zipper->new_stream($file));
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	518
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	519	# Header is for text level
				520	if ($header->type eq 'text') {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	521
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	522	# Remember dir and sigles
				523	$dir = $header->dir;
				524	$text_id_esc = $header->id_esc;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	525
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	526	# log output for seeing progression
				527	$log->notice("$0: text_id=$text_id_esc");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	528
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	529	# Reset counter for text lines
				530	# (needed for whitespace handling)
				531	$text_line = 0;
				532	};
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	533	};
				534	};
				535	};
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	536	$text_id_esc = $auto_textsigle if ($auto_textsigle);
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	537
				538	if ($p) {
				539	print STDERR $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
				540	};
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	541	} while (($input_fname = shift(@ARGV)) && open($input_fh, '<', $input_fname));
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	542	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	543
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	544	$ext_tok->close if $ext_tok;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	545
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	546	close $input_fh;
				547
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	548
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	549	__END__
				550
				551	=pod
				552
				553	=encoding utf8
				554
				555	=head1 NAME
				556
				557	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				558
				559	=head1 SYNOPSIS
				560
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	561	cat corpus.i5.xml \| tei2korapxml -tk - > corpus.korapxml.zip
				562	tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	563
				564	=head1 DESCRIPTION
				565
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	566	C<tei2korapxml> is a script to convert TEI P5 and
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	567	L<I5\|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	568	based documents to the
				569	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	570
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	571	This program is usually called from inside another script.
				572
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	573	=head1 FORMATS
				574
				575	=head2 Input restrictions
				576
				577	=over 2
				578
				579	=item
				580
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	581	TEI P5 formatted input with certain restrictions:
				582
				583	=over 4
				584
				585	=item
				586
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	587	B<mandatory>: text-header with integrated textsigle
				588	(or convertible identifier), text-body
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	589
				590	=item
				591
				592	B<optional>: corp-header with integrated corpsigle,
				593	doc-header with integrated docsigle
				594
				595	=back
				596
				597	=item
				598
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	599	All tokens inside the primary text must not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	600	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	601	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	602	into blanks between 2 tokens could lead to additional blanks,
				603	where there should be none (e.g.: punctuation characters like C<,> or
				604	C<.> should not be separated from their predecessor token).
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	605	(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	606
Akron	940ca6f	2021-10-11 12:38:39 +0200	[diff] [blame]	607	=item
				608
				609	Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
				610	need to be defined in the same line as the header tag.
				611
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	612	=back
				613
				614	=head2 Notes on the output
				615
				616	=over 2
				617
				618	=item
				619
				620	zip file output (default on C<stdout>) with utf8 encoded entries
				621	(which together form the KorAP-XML format)
				622
				623	=back
				624
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	625	=head1 INSTALLATION
				626
Akron	d26319b	2023-01-12 15:34:41 +0100	[diff] [blame]	627	C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietz	e83a4e9	2021-03-16 20:51:26 +0100	[diff] [blame]	628	When these requirements are met, the preferred way to install the script is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	629	to use L<cpanm\|App::cpanminus>.
				630
				631	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				632
				633	In case everything went well, the C<tei2korapxml> tool will
				634	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	635
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	636	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				637
				638	=head1 OPTIONS
				639
				640	=over 2
				641
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	642	=item B<--input\|-i>
				643
				644	The input file to process. If no specific input is defined and a single
				645	dash C<-> is passed as an argument, data is read from C<STDIN>.
				646
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	647	Instead of using C<-i> input files can also be defined as trailing arguments
				648	to the command:
				649
				650	tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
				651
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	652	=item B<--output\|-o>
				653
				654	The output zip file to be created. If no specific output is defined,
				655	data is written to C<STDOUT>.
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	656
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	657	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	658
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	659	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	660
				661	=item B<--help\|-h>
				662
				663	Print help information.
				664
				665	=item B<--version\|-v>
				666
				667	Print version information.
				668
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	669	=item B<--tokenizer-korap\|-tk>
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	670
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	671	Use the standard KorAP/DeReKo tokenizer.
				672
				673	=item B<--tokenizer-internal\|-ti>
				674
				675	Tokenize the data using two embedded tokenizers,
				676	that will take an I<aggressive> and a I<conservative>
				677	approach.
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	678
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	679	=item B<--tokenizer-call\|-tc>
				680
				681	Call an external tokenizer process, that will tokenize
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	682	from STDIN and output the offsets of all tokens.
				683
				684	Texts are separated using C<\x04\n>. The external process
				685	should add a new line per text.
				686
				687	If the L</--use-tokenizer-sentence-splits> option is activated,
				688	sentences are marked by offset as well in new lines.
				689
				690	To use L<Datok\|https://github.com/KorAP/Datok> including sentence
				691	splitting, call C<tei2korapxml> as follows:
				692
				693	$ cat corpus.i5.xml | tei2korapxml -s \
				694	$ -tc 'datok tokenize \
				695	$ -t ./tokenizer.matok \
				696	$ -p --newline-after-eot --no-sentences \
				697	$ --no-tokens --sentence-positions -' - \
				698	$ > corpus.korapxml.zip
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	699
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	700	=item B<--no-tokenizer>
				701
				702	Boolean flag indicating that no tokenizer should be used.
				703	This is meant to ensure that by default a final token layer always
				704	exists.
				705	If a separate tokenizer is chosen, this flag is ignored.
				706
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	707	=item B<--skip-inline-tokens>
				708
				709	Boolean flag indicating that inline tokens should not
				710	be processed. Defaults to false (meaning inline tokens will be processed).
				711
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	712	=item B<--skip-inline-token-annotations>
				713
				714	Boolean flag indicating that inline token annotations should not
				715	be processed. Defaults to true (meaning inline token annotations
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	716	won't be processed). Can be negated with
				717	C<--no-skip-inline-token-annotations>.
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	718
Akron	ca70a1d	2021-02-25 16:21:31 +0100	[diff] [blame]	719	=item B<--skip-inline-tags> <tags>
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	720
				721	Expects a comma-separated list of tags to be ignored when the structure
				722	is parsed. Content of these tags however will be processed.
				723
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	724	=item B<--auto-textsigle> <textsigle>
				725
				726	Expects a text sigle that serves as fallback if no text sigles
				727	are given in the input data.
				728	The auto text sigle will be incremented for each text processed.
				729
				730	Example:
				731
				732	tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
				733	< data.i5.xml > korapxml.zip
				734
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	735	=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
				736
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	737	Expects a regular replacement expression (separated by B<@> between the
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	738	search and the replacement) to convert text id attributes to text sigles
				739	with three parts (separated by B</>).
				740
				741	Example:
				742
				743	tei2korapxml \
				744	--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
				745	-tk - < t/data/icc_german_sample.p5.xml
				746
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	747	Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
				748	sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	749
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	750	=item B<--inline-tokens> <foundry>#[<file>]
				751
				752	Define the foundry and file (without extension)
				753	to store inline token information in.
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	754	Unless C<--skip-inline-token-annotations> is set,
				755	this will contain annotations as well.
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	756	Defaults to C<tokens> and C<morpho>.
				757
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	758	The inline token data will also be stored in the
				759	inline structures file (see I<--inline-structures>),
				760	unless the inline token foundry is prepended
				761	by an B<!> exclamation mark, indicating that inline
				762	tokens are stored exclusively in the inline tokens
				763	file.
				764
				765	Example:
				766
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	767	tei2korapxml --no-tokenizer --inline-tokens \
				768	'!gingko#morpho' < data.i5.xml > korapxml.zip
				769
				770	=item B<--inline-dependencies> <foundry>#[<file>]
				771
				772	Define the foundry and file (without extension)
				773	to store inline dependency information in.
				774	Defaults to the layer of C<dependency> and
				775	will be ignored if not set (which means, dependency
				776	attributes will be stored in the inline tokens file,
				777	if not skipped).
				778
				779	The dependency data will also be stored in the
				780	inline token file (see I<--inline-tokens>),
				781	unless the inline dependencies foundry is prepended
				782	by an B<!> exclamation mark, indicating that inline
				783	dependency data is stored exclusively in the inline
				784	dependencies file.
				785
				786	Example:
				787
				788	tei2korapxml --no-tokenizer --inline-dependencies \
				789	'gingko#dependency' < data.i5.xml > korapxml.zip
				790
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	791
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	792	=item B<--inline-structures> <foundry>#[<file>]
				793
				794	Define the foundry and file (without extension)
				795	to store inline structure information in.
				796	Defaults to C<struct> and C<structures>.
				797
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	798	=item B<--base-foundry> <foundry>
				799
				800	Define the base foundry to store newly generated
				801	token information in.
				802	Defaults to C<base>.
				803
				804	=item B<--data-file> <file>
				805
				806	Define the file (without extension)
				807	to store primary data information in.
				808	Defaults to C<data>.
				809
				810	=item B<--header-file> <file>
				811
				812	Define the file name (without extension)
				813	to store header information on
				814	the corpus, document, and text level in.
				815	Defaults to C<header>.
				816
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	817	=item B<--use-tokenizer-sentence-splits\|-s>
				818
				819	Replace existing with, or add new, sentence boundary information
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	820	provided by the tokenizer.
				821	Currently KorAP-tokenizer and certain external tokenizers support
				822	these boundaries.
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	823
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	824	=item B<--tokens-file> <file>
				825
				826	Define the file (without extension)
				827	to store generated token information in
				828	(either from the KorAP tokenizer or an externally called tokenizer).
				829	Defaults to C<tokens>.
				830
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	831	=item B<--log\|-l>
				832
				833	Loglevel for I<Log::Any>. Defaults to C<notice>.
				834
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	835	=back
				836
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	837	=head1 ENVIRONMENT VARIABLES
				838
				839	=over 2
				840
				841	=item B<KORAPXMLTEI_DEBUG>
				842
				843	Activate minimal debugging.
				844	Defaults to C<false>.
				845
Marc Kupietz	d254f5c	2025-04-16 10:37:08 +0200	[diff] [blame]	846	=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
				847
				848	Set the heap size for the tokenizer process.
				849	Defaults to C<512m>.
				850
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	851	=back
				852
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	853	=head1 COPYRIGHT AND LICENSE
				854
Marc Kupietz	b6fd6bc	2025-04-16 12:47:26 +0200	[diff] [blame]	855	Copyright (C) 2021-2025, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	856
				857	Author: Peter Harders
				858
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	859	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	860
				861	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				862	Corpus Analysis Platform at the
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	863	L<Leibniz Institute for the German Language (IDS)\|https://www.ids-mannheim.de/>,
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	864	member of the
				865	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				866
				867	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	868	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	869
				870	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	871
				872	# NOTES
				873
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	874	## Notes on segfault prevention
				875
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	876	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	877	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				878	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				879	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				880	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.