Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: f8a26c2afd1ce6c5b65634fd64247caa3421521b [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	9	use KorAP::XML::TEI qw(increase_auto_textsigle);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	10
				11	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	12
Marc Kupietz	8ab6832	2026-03-18 18:04:14 +0100	[diff] [blame^]	13	use Encode qw(decode encode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	15	use FindBin;
				16	BEGIN {
				17	unshift @INC, "$FindBin::Bin/../lib";
				18	};
				19
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	20	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	21	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	22	use KorAP::XML::TEI::Tokenizer::Conservative;
				23	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	25	use KorAP::XML::TEI::Header;
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	26	use KorAP::XML::TEI::Inline;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	27
Marc Kupietz	32781e9	2026-03-05 18:32:43 +0100	[diff] [blame]	28	our $VERSION = '2.7.2';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	29
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	30	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				31
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	32	use constant {
				33	# Set to 1 for slightly more debug output (no need to be parametrized)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	34	DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	35	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	36
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	37	if ($ENV{KORAPXMLTEI_INLINE}) {
				38	warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
				39	};
				40
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	41	# Inline tokens won't be stored in the structure file
				42	my $inline_tokens_exclusive = 0;
				43
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	44	# Inline dependencies won't be stored in the tokens file
				45	my $inline_deps_exclusive = 0;
				46
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	47	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	48	GetOptions(
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	49	'auto-textsigle|A=s' => \(my $auto_textsigle = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	50	'root|r=s' => \(my $root_dir = '.'),
				51	'input|i=s' => \(my $input_fname = ''),
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	52	'output|o=s' => \(my $output_fname = ''),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	53	'tokenizer-call|tc=s' => \(my $tokenizer_call),
				54	'tokenizer-korap|tk' => \(my $tokenizer_korap),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	55	'tokenizer-internal|ti' => \(my $tokenizer_intern),
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	56	'no-tokenizer' => \(my $no_tokenizer),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	57	'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
				58	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
				59	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	60	'inline-dependencies=s' => \(my $inline_dependencies),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	61	'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	62	'skip-inline-token-annotations!' => \(
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	63	my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	64	'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	65	'base-foundry=s' => \(my $base_dir = 'base'),
				66	'data-file=s' => \(my $data_file = 'data'),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	67	'header-file=s' => \(my $header_file = 'header'),
				68	'tokens-file=s' => \(my $tokens_file = 'tokens'),
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	69	'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	70	'log|l=s' => \(my $log_level = 'notice'),
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	71	'required-version|rv=s' => \(my $required_version),
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	72	'progress|p' => \(my $progress),
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	73	'' => \(my $stdio),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	74	'help|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	75	pod2usage(
				76	-verbose => 99,
				77	-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
				78	-msg => $VERSION_MSG,
				79	-output => '-'
				80	)
				81	},
				82	'version|v' => sub {
				83	pod2usage(
				84	-verbose => 0,
				85	-msg => $VERSION_MSG,
				86	-output => '-'
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	87	);
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	88	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	89	);
				90
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	91
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	92	# Establish logger
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	93	binmode(STDERR, ':encoding(UTF-8)');
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	94	Log::Any::Adapter->set('Stderr', log_level => $log_level);
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	95	$log->notice('Debugging is activated') if DEBUG;
				96
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	97
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	98	if ($required_version) {
Marc Kupietz	2475c95	2024-01-09 10:40:04 +0100	[diff] [blame]	99	$required_version =~ /^\s*(\d+\.\d+\.\d+(-TRIAL)?)\s*$/;
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	100	if (!$1 || $1 ne $VERSION) {
				101	$log->error("Required version $required_version mismatches version $VERSION");
				102	exit(1);
				103	};
				104	};
				105
				106
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	107	my ($what, $with);
				108	if ($xmlid_to_textsigle ne '') {
				109	($what, $with) = split('@', $xmlid_to_textsigle);
				110	$what = qr!$what!;
				111	};
				112
Marc Kupietz	3c16cb9	2026-03-05 18:29:59 +0100	[diff] [blame]	113	my $progress_fh;
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	114	if ($progress) {
				115	eval {
				116	require Time::Progress;
Marc Kupietz	3c16cb9	2026-03-05 18:29:59 +0100	[diff] [blame]	117	my $tty = $^O eq 'MSWin32' ? 'CON' : '/dev/tty';
				118	open($progress_fh, '>', $tty)
				119	or die "Cannot open $tty";
				120	$progress_fh->autoflush(1);
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	121	1;
				122	} or do {
Marc Kupietz	3c16cb9	2026-03-05 18:29:59 +0100	[diff] [blame]	123	$log->warn('Progress bar disabled: ' . ($@ =~ s/ at .*//sr));
				124	$progress = 0;
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	125	}
				126	};
				127
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	128	# tag (without attributes), which contains the primary text
				129	my $_TEXT_BODY = 'text';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	130	# optional
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	131
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	132	# Remember to skip certain inline tags
				133	my %skip_inline_tags = ();
				134	if ($skip_inline_tags_str) {
				135	foreach (split /\s*,\s*/, $skip_inline_tags_str) {
				136	$skip_inline_tags{$_} = 1;
				137	};
				138	};
				139
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	140	# External tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	141	my $ext_tok;
				142	if ($tokenizer_call) {
				143	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	144	$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	145	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	146
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	147	# KorAP tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	148	elsif ($tokenizer_korap) {
Akron	bd4281e	2022-03-28 08:31:40 +0200	[diff] [blame]	149	eval {
				150	require KorAP::XML::TEI::Tokenizer::KorAP;
				151	1;
				152	};
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	153
				154	my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;
				155	if ($korap_tok_ver ne $VERSION) {
				156	$log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
				157	exit(1);
				158	};
				159
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	160	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	161	}
				162
				163	# No internal tokenizer chosen
				164	elsif (!$tokenizer_intern && !$no_tokenizer) {
				165	$log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
				166	exit(1);
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	167	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	168
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	169	if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	170	$skip_inline_tags{s} = 1;
				171	};
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	172
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	173	# Internal tokenization
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	174	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				175	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	176
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	177
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	178	# Name of the directory and the file containing all inline structure information
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	179	# except for $_TOKENS_TAG information
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	180	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	181
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	182	# Name of the directory and the file containing all inline token information
				183	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				184	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	185
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	186	if (index($_tokens_dir, '!') == 0) {
				187	$_tokens_dir = substr($_tokens_dir, 1);
				188	$inline_tokens_exclusive = 1;
				189	};
				190
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	191
				192	my ($_dep_dir, $_dep_file);
				193	if ($inline_dependencies) {
				194	($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';
				195	$inline_dependencies = 1;
				196
				197	if ($_dep_dir && index($_dep_dir, '!') == 0) {
				198	$_dep_dir = substr($_dep_dir, 1);
				199	$inline_deps_exclusive = 1;
				200	};
				201	};
				202
				203
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	204	# Initialize zipper
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	205	my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	206
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	207	# text directory (below $root_dir)
				208	my $dir = '';
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	209
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	210	# Escaped version of text id
				211	my $text_id_esc;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	212
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	213	# Default encoding of the text
				214	my $input_enc = 'UTF-8';
				215
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	216	# text line (needed for whitespace handling)
				217	my $text_line = 0;
				218
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	219
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	220	# Input file handle (default: stdin)
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	221	my $input_fh;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	222
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	223	# Single dash was set
				224	if ($stdio) {
				225	$input_fh = *STDIN;
				226	}
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	227	# Input flag was passed
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	228	elsif (@ARGV || $input_fname ne '') {
				229	unless ($input_fname ne '') {
				230	$input_fname = shift @ARGV;
				231	};
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	232	unless (open($input_fh, '<', $input_fname)) {
				233	die $log->fatal("File '$input_fname' could not be opened.");
				234	};
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	235	}
				236
				237	# No input to process
				238	else {
				239	pod2usage(
				240	-verbose => 99,
				241	-sections => 'NAME|SYNOPSIS',
				242	-msg => $VERSION_MSG,
				243	-output => '-'
				244	);
				245	exit;
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	246	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	247
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	248	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	249	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	250
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	251
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	252	# Create inline parser object
				253	my $inline = KorAP::XML::TEI::Inline->new(
				254	$skip_inline_tokens,
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	255	\%skip_inline_tags,
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	256	$inline_tokens_exclusive,
				257	$inline_dependencies
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	258	);
				259
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	260	do {
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	261	my $p;
				262	if ($progress && $input_fname ne '') {
				263	my $file_size = -s $input_fname;
				264	if ($file_size) {
				265	$p = Time::Progress->new(min => 0, max => $file_size);
				266	$log->notice("Reading input document $input_fname (Size: $file_size bytes)");
				267	}
				268	} elsif ($input_fname ne '') {
				269	$log->notice("Reading input document $input_fname");
				270	};
				271
				272	my $i = 0;
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	273	MAIN:
				274	while (<$input_fh>) {
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	275
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	276	if ($p && ($i++ % 500 == 0)) {
Marc Kupietz	3c16cb9	2026-03-05 18:29:59 +0100	[diff] [blame]	277	print $progress_fh $p->report("\r%20b %p ETA: %E", tell($input_fh));
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	278	};
				279
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	280	# remove HTML (multi-line) comments (<!--...-->)
				281	$_ = remove_xml_comments($input_fh, $_);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	282
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	283	# Set input encoding
				284	if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
				285	$input_enc = $2;
				286	next;
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	287	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	288
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	289	$_ = decode($input_enc, $_);
				290	$_ = replace_entities($_);
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	291
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	292	# Start of text body
				293	if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
				294	my $suffix = $2;
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	295
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	296	if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
				297	die $log->fatal("input line number $.: " .
				298	"line with opening text-body tag '${_TEXT_BODY}' " .
				299	"contains additional information ... => Aborting (line=$_)");
				300	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	301
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	302	# Text body data extracted from input document ($input_fh),
				303	# further processed by XML::LibXML::Reader
				304	my $text_buffer = '';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	305
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	306	# Iterate over all lines in the text body
				307	while (<$input_fh>) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	308
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	309	$_ = remove_xml_comments($input_fh, $_);
				310	$_ = decode($input_enc, $_);
				311	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	312
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	313	# End of text body
				314	if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {
				315
				316	# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
				317
Marc Kupietz	ff061ef	2026-03-05 09:59:35 +0100	[diff] [blame]	318	my $before = substr($_, 0, $pos);
				319	my $after = substr($_, length("</$_TEXT_BODY>") + $pos);
				320	my $before_check = $before;
				321	$before_check =~ s/<[^>]+>//g; # strip XML tags like </body>
				322	if (($before_check . $after) !~ /^\s*$/) {
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	323	die $log->fatal("input line number $.: " .
				324	"line with closing text-body tag '${_TEXT_BODY}'" .
				325	" contains additional information ... => Aborting (line=$_)");
				326	};
				327
Marc Kupietz	ff061ef	2026-03-05 09:59:35 +0100	[diff] [blame]	328	# Add any remaining content before </text> (e.g. </body>) to the buffer
				329	$before =~ s/^\s+//;
				330	$before =~ s/\s+$//;
				331	$text_buffer .= $before if $before ne '';
				332
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	333	if ($dir eq '') {
				334	$log->warn(
				335	"Maybe empty textSigle => skipping this text ...\n" .
				336	'data=' . substr($inline->data->data, 0, 200)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	337	);
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	338	next MAIN;
				339	};
				340
				341	# Parse inline structure
				342	$inline->parse($text_id_esc, \$text_buffer);
				343
				344	if (DEBUG) {
				345	$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
				346	};
				347
				348	my $data = $inline->data;
				349
				350	# Write data.xml
				351	$data->to_zip(
				352	$zipper->new_stream("$dir/${data_file}.xml"),
				353	$text_id_esc
				354	);
				355
				356	# Tokenize with external tokenizer
				357	if ($ext_tok) {
				358
Marc Kupietz	8ab6832	2026-03-18 18:04:14 +0100	[diff] [blame^]	359	my $tokens_output = eval {
				360	$ext_tok->tokenize($data->data)->to_string($text_id_esc);
				361	};
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	362
Marc Kupietz	8ab6832	2026-03-18 18:04:14 +0100	[diff] [blame^]	363	if (my $err = $@) {
				364	$err =~ s/\s+$//;
				365	$log->error("Skipping external tokenization for '$text_id_esc': $err");
				366	$ext_tok->reset;
				367	}
				368	elsif (defined $tokens_output) {
				369	$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml")
				370	->print(encode('UTF-8', $tokens_output));
				371
				372	if ($use_tokenizer_sentence_splits) {
				373	$ext_tok->sentencize_from_previous_input($inline->structures);
				374	};
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	375	};
				376	};
				377
				378	# Tokenize with internal tokenizer
				379	if ($tokenizer_intern) {
				380
				381	# Tokenize and output
				382	$cons_tok->tokenize($data->data)->to_zip(
				383	$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
				384	$text_id_esc
				385	)->reset;
				386
				387	$aggr_tok->tokenize($data->data)->to_zip(
				388	$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
				389	$text_id_esc
				390	)->reset;
				391	};
				392
				393	# ~ write structures ~
				394	unless ($inline->structures->empty) {
				395	$inline->structures->to_zip(
				396	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
				397	$text_id_esc,
				398	2 # = structure serialization
				399	);
				400	};
				401
				402	# ~ write tokens ~
				403	unless ($skip_inline_tokens || $inline->tokens->empty) {
				404	$inline->tokens->to_zip(
				405	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
				406	$text_id_esc,
				407	# Either 0 = tokens without inline or
				408	# 1 = tokens with inline
				409	# !$skip_inline_token_annotations
				410	($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
				411	);
				412	};
				413
				414	# ~ write dependencies ~
				415	unless ($inline->dependencies->empty) {
				416	$inline->dependencies->to_zip(
				417	$zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
				418	$text_id_esc,
				419	3 # = dependency serialization
				420	);
				421	};
				422
				423
				424	# reinit.
				425	$dir = '';
				426
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	427	next MAIN;
				428	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	429
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	430
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	431	# ~ whitespace handling ~
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	432
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	433	# Fix whitespaces (see notes on whitespace fixing)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	434
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	435	# TODO:
				436	# Maybe it's best, to keep the stripping of whitespace and
				437	# to just remove the if-clause and to insert a blank by default
				438	# (with possibly an option on how newlines in primary text should
				439	# be handled (stripped or replaced by a whitespace)).
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	440
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	441	# Remove consecutive whitespace at beginning and end (mostly one newline)
				442	s/^\s+//;
				443	s/\s+$//;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	444
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	445	# NOTE:
				446	# this is only relevant, if a text consists of more than one line
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	447
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	448	# TODO:
				449	# find a better solution, or create a warning, if a text has more
				450	# than one line ($text_line > 1)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	451
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	452	# TODO:
				453	# do testing with 2 different corpora
				454	# (one with only one-line texts, the other with several lines per text)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	455
Marc Kupietz	a84fcb5	2026-03-05 17:22:43 +0100	[diff] [blame]	456	# Check if the buffer currently ends inside an open XML tag
				457	# (last '<' is after last '>'), meaning this line is a continuation of
				458	# a multi-line element (e.g. attributes split across lines like <ref>).
				459	# A space must be prepended to avoid "attributes construct error" in the
				460	# XML parser when two attribute tokens are concatenated without separator.
				461	my $in_open_tag = ($text_buffer ne '' &&
				462	rindex($text_buffer, '<') > rindex($text_buffer, '>'));
				463
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	464	# line contains at least one non-tag character
				465	if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	466
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	467	# Increment counter for text lines
				468	$text_line++;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	469
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	470	# insert blank before 1st character
Marc Kupietz	a84fcb5	2026-03-05 17:22:43 +0100	[diff] [blame]	471	# (for 2nd line and consecutive lines, or when continuing an open tag)
				472	$_ = ' ' . $_ if $text_line > 1 || $in_open_tag;
				473	}
				474
				475	# Line is purely within an open tag (attribute continuation):
				476	# prepend a space so attributes are properly separated.
				477	elsif ($in_open_tag) {
				478	$_ = ' ' . $_;
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	479	}
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	480
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	481	# add line to buffer
				482	$text_buffer .= $_;
				483	};
				484	}
				485	elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
				486	my $leadin = $1;
				487	my $id = $3;
				488	my $sigle = $3;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	489
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	490	if ($what) {
				491	$_ = $id;
				492	eval "s|$what|$with|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
				493	$sigle = $_;
				494	$log->debug("Converted text id `$id' to sigle `$sigle'");
				495	};
				496	$sigle =~ s/\./-/g;
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	497
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	498	my @parts = split(/[\/_]/, $sigle);
				499	if (@parts != 3) {
				500	die $log->fatal(
				501	"input line number $.: " .
				502	"ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
				503	"=> Aborting (line=$_)");
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	504	};
				505
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	506	$dir = join("/", @parts);
				507	$text_id_esc = "$parts[0]/$parts[1].$parts[2]";
				508	$log->notice("$0: text_id=$text_id_esc");
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	509
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	510	if ($leadin !~ /^\s*$/) {
				511	die $log->fatal(
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	512	"input line number $.: " .
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	513	'line with opening header tag is not in expected format ... ' .
				514	"=> Aborting (line=$_)");
				515	};
				516	}
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	517
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	518	# Start of header section
				519	elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
				520	my $content = "$2\n";
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	521
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	522	if ($1 !~ /^\s*$/) {
				523	die $log->fatal(
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	524	"input line number $.: " .
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	525	'line with opening header tag is not in expected format ... ' .
				526	"=> Aborting (line=$_)");
				527	};
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	528
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	529	# Parse header
				530	my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
				531	if ($auto_textsigle) {
				532	$auto_textsigle = increase_auto_textsigle($auto_textsigle);
				533	$log->debug("Auto-incremented text sigle to $auto_textsigle");
				534	};
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	535
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	536	# Header was parseable
				537	if ($header) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	538
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	539	# Write header to zip
				540	my $file = $header->dir . '/' . $header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	541
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	542	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	543
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	544	$header->to_zip($zipper->new_stream($file));
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	545
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	546	# Header is for text level
				547	if ($header->type eq 'text') {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	548
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	549	# Remember dir and sigles
				550	$dir = $header->dir;
				551	$text_id_esc = $header->id_esc;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	552
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	553	# log output for seeing progression
				554	$log->notice("$0: text_id=$text_id_esc");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	555
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	556	# Reset counter for text lines
				557	# (needed for whitespace handling)
				558	$text_line = 0;
				559	};
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	560	};
				561	};
				562	};
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	563	$text_id_esc = $auto_textsigle if ($auto_textsigle);
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	564
				565	if ($p) {
Marc Kupietz	3c16cb9	2026-03-05 18:29:59 +0100	[diff] [blame]	566	print $progress_fh $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	567	};
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	568	} while (($input_fname = shift(@ARGV)) && open($input_fh, '<', $input_fname));
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	569	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	570
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	571	$ext_tok->close if $ext_tok;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	572
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	573	close $input_fh;
				574
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	575
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	576	__END__
				577
				578	=pod
				579
				580	=encoding utf8
				581
				582	=head1 NAME
				583
				584	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				585
				586	=head1 SYNOPSIS
				587
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	588	cat corpus.i5.xml \| tei2korapxml -tk - > corpus.korapxml.zip
				589	tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	590
				591	=head1 DESCRIPTION
				592
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	593	C<tei2korapxml> is a script to convert TEI P5 and
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	594	L<I5\|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	595	based documents to the
				596	L<KorAP-XML format\|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	597
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	598	This program is usually called from inside another script.
				599
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	600	=head1 FORMATS
				601
				602	=head2 Input restrictions
				603
				604	=over 2
				605
				606	=item
				607
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	608	TEI P5 formatted input with certain restrictions:
				609
				610	=over 4
				611
				612	=item
				613
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	614	B<mandatory>: text-header with integrated textsigle
				615	(or convertible identifier), text-body
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	616
				617	=item
				618
				619	B<optional>: corp-header with integrated corpsigle,
				620	doc-header with integrated docsigle
				621
				622	=back
				623
				624	=item
				625
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	626	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	627	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	628	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	629	into blanks between 2 tokens could lead to additional blanks,
				630	where there should be none (e.g.: punctuation characters like C<,> or
				631	C<.> should not be separated from their predecessor token).
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	632	(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	633
Akron	940ca6f	2021-10-11 12:38:39 +0200	[diff] [blame]	634	=item
				635
				636	Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
				637	need to be defined in the same line as the header tag.
				638
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	639	=back
				640
				641	=head2 Notes on the output
				642
				643	=over 2
				644
				645	=item
				646
				647	zip file output (default on C<stdout>) with utf8 encoded entries
				648	(which together form the KorAP-XML format)
				649
				650	=back
				651
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	652	=head1 INSTALLATION
				653
Akron	d26319b	2023-01-12 15:34:41 +0100	[diff] [blame]	654	C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietz	e83a4e9	2021-03-16 20:51:26 +0100	[diff] [blame]	655	When these requirements are met, the preferred way to install the script is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	656	to use L<cpanm\|App::cpanminus>.
				657
				658	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				659
				660	In case everything went well, the C<tei2korapxml> tool will
				661	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	662
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	663	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				664
				665	=head1 OPTIONS
				666
				667	=over 2
				668
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	669	=item B<--input\|-i>
				670
				671	The input file to process. If no specific input is defined and a single
				672	dash C<-> is passed as an argument, data is read from C<STDIN>.
				673
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	674	Instead of using C<-i> input files can also be defined as trailing arguments
				675	to the command:
				676
				677	tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
				678
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	679	=item B<--output\|-o>
				680
				681	The output zip file to be created. If no specific output is defined,
				682	data is written to C<STDOUT>.
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	683
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	684	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	685
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	686	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	687
				688	=item B<--help\|-h>
				689
				690	Print help information.
				691
				692	=item B<--version\|-v>
				693
				694	Print version information.
				695
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	696	=item B<--tokenizer-korap\|-tk>
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	697
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	698	Use the standard KorAP/DeReKo tokenizer.
				699
				700	=item B<--tokenizer-internal\|-ti>
				701
				702	Tokenize the data using two embedded tokenizers,
				703	that will take an I<aggressive> and a I<conservative>
				704	approach.
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	705
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	706	=item B<--tokenizer-call\|-tc>
				707
				708	Call an external tokenizer process, that will tokenize
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	709	from STDIN and output the offsets of all tokens.
				710
				711	Texts are separated using C<\x04\n>. The external process
				712	should add a new line per text.
				713
				714	If the L</--use-tokenizer-sentence-splits> option is activated,
				715	sentences are marked by offset as well in new lines.
				716
				717	To use L<Datok\|https://github.com/KorAP/Datok> including sentence
				718	splitting, call C<tei2korapxml> as follows:
				719
				720	$ cat corpus.i5.xml \| tei2korapxml -s \
				721	$ -tc 'datok tokenize \
				722	$ -t ./tokenizer.matok \
				723	$ -p --newline-after-eot --no-sentences \
				724	$ --no-tokens --sentence-positions -' - \
				725	$ > corpus.korapxml.zip
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	726
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	727	=item B<--no-tokenizer>
				728
				729	Boolean flag indicating that no tokenizer should be used.
				730	This is meant to ensure that by default a final token layer always
				731	exists.
				732	If a separate tokenizer is chosen, this flag is ignored.
				733
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	734	=item B<--skip-inline-tokens>
				735
				736	Boolean flag indicating that inline tokens should not
				737	be processed. Defaults to false (meaning inline tokens will be processed).
				738
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	739	=item B<--skip-inline-token-annotations>
				740
				741	Boolean flag indicating that inline token annotations should not
				742	be processed. Defaults to true (meaning inline token annotations
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	743	won't be processed). Can be negated with
				744	C<--no-skip-inline-token-annotations>.
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	745
Akron	ca70a1d	2021-02-25 16:21:31 +0100	[diff] [blame]	746	=item B<--skip-inline-tags> <tags>
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	747
				748	Expects a comma-separated list of tags to be ignored when the structure
				749	is parsed. Content of these tags however will be processed.
				750
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	751	=item B<--auto-textsigle> <textsigle>
				752
				753	Expects a text sigle that serves as a fallback if no text sigles
				754	are given in the input data.
				755	The auto text sigle will be incremented for each text processed.
				756
				757	Example:
				758
				759	tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
				760	< data.i5.xml > korapxml.zip
				761
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	762	=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
				763
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	764	Expects a regular replacement expression (separated by B<@> between the
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	765	search and the replacement) to convert text id attributes to text sigles
				766	with three parts (separated by B</>).
				767
				768	Example:
				769
				770	tei2korapxml \
				771	--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
				772	-tk - < t/data/icc_german_sample.p5.xml
				773
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	774	Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
				775	sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	776
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	777	=item B<--inline-tokens> <foundry>#[<file>]
				778
				779	Define the foundry and file (without extension)
				780	to store inline token information in.
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	781	Unless C<--skip-inline-token-annotations> is set,
				782	this will contain annotations as well.
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	783	Defaults to C<tokens> and C<morpho>.
				784
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	785	The inline token data will also be stored in the
				786	inline structures file (see I<--inline-structures>),
				787	unless the inline token foundry is prepended
				788	by an B<!> exclamation mark, indicating that inline
				789	tokens are stored exclusively in the inline tokens
				790	file.
				791
				792	Example:
				793
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	794	tei2korapxml --no-tokenizer --inline-tokens \
				795	'!gingko#morpho' < data.i5.xml > korapxml.zip
				796
				797	=item B<--inline-dependencies> <foundry>#[<file>]
				798
				799	Define the foundry and file (without extension)
				800	to store inline dependency information in.
				801	Defaults to the layer of C<dependency> and
				802	will be ignored if not set (which means, dependency
				803	attributes will be stored in the inline tokens file,
				804	if not skipped).
				805
				806	The dependency data will also be stored in the
				807	inline token file (see I<--inline-tokens>),
				808	unless the inline dependencies foundry is prepended
				809	by an B<!> exclamation mark, indicating that inline
				810	dependency data is stored exclusively in the inline
				811	dependencies file.
				812
				813	Example:
				814
				815	tei2korapxml --no-tokenizer --inline-dependencies \
				816	'gingko#dependency' < data.i5.xml > korapxml.zip
				817
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	818
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	819	=item B<--inline-structures> <foundry>#[<file>]
				820
				821	Define the foundry and file (without extension)
				822	to store inline structure information in.
				823	Defaults to C<struct> and C<structures>.
				824
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	825	=item B<--base-foundry> <foundry>
				826
				827	Define the base foundry to store newly generated
				828	token information in.
				829	Defaults to C<base>.
				830
				831	=item B<--data-file> <file>
				832
				833	Define the file (without extension)
				834	to store primary data information in.
				835	Defaults to C<data>.
				836
				837	=item B<--header-file> <file>
				838
				839	Define the file name (without extension)
				840	to store header information on
				841	the corpus, document, and text level in.
				842	Defaults to C<header>.
				843
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	844	=item B<--use-tokenizer-sentence-splits\|-s>
				845
				846	Replace existing with, or add new, sentence boundary information
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	847	provided by the tokenizer.
				848	Currently KorAP-tokenizer and certain external tokenizers support
				849	these boundaries.
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	850
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	851	=item B<--tokens-file> <file>
				852
				853	Define the file (without extension)
				854	to store generated token information in
				855	(either from the KorAP tokenizer or an externally called tokenizer).
				856	Defaults to C<tokens>.
				857
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	858	=item B<--log\|-l>
				859
				860	Loglevel for I<Log::Any>. Defaults to C<notice>.
				861
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	862	=back
				863
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	864	=head1 ENVIRONMENT VARIABLES
				865
				866	=over 2
				867
				868	=item B<KORAPXMLTEI_DEBUG>
				869
				870	Activate minimal debugging.
				871	Defaults to C<false>.
				872
Marc Kupietz	d254f5c	2025-04-16 10:37:08 +0200	[diff] [blame]	873	=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
				874
				875	Set the heap size for the tokenizer process.
				876	Defaults to C<512m>.
				877
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	878	=back
				879
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	880	=head1 COPYRIGHT AND LICENSE
				881
Marc Kupietz	b6fd6bc	2025-04-16 12:47:26 +0200	[diff] [blame]	882	Copyright (C) 2021-2025, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	883
				884	Author: Peter Harders
				885
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	886	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	887
				888	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				889	Corpus Analysis Platform at the
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	890	L<Leibniz Institute for the German Language (IDS)\|https://www.ids-mannheim.de/>,
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	891	member of the
				892	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				893
				894	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	895	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	896
				897	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	898
				899	# NOTES
				900
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	901	## Notes on segfault prevention
				902
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	903	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	904	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				905	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				906	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				907	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.