Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 3d35077569b8e8026bb92c555c0fdce48d77104b [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	9	use KorAP::XML::TEI qw(increase_auto_textsigle);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	10
				11	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	12
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	13	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	15	use FindBin;
				16	BEGIN {
				17	unshift @INC, "$FindBin::Bin/../lib";
				18	};
				19
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	20	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	21	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	22	use KorAP::XML::TEI::Tokenizer::Conservative;
				23	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	25	use KorAP::XML::TEI::Header;
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	26	use KorAP::XML::TEI::Inline;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	27
# Version of this script; it is compared against --required-version
# and against the version of the KorAP tokenizer wrapper
our $VERSION = '2.7.2';

# Banner shown together with the usage/help/version output
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Compile-time debug switch: set the environment variable
# KORAPXMLTEI_DEBUG to 1 for minimal more debug output
# (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# The environment variable is still honoured (it changes the default of
# --skip-inline-token-annotations), but its use is discouraged
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations' if $ENV{KORAPXMLTEI_INLINE};

# Inline tokens won't be stored in the structure file
# (switched on by a leading '!' in --inline-tokens)
my $inline_tokens_exclusive = 0;

# Inline dependencies won't be stored in the tokens file
# (switched on by a leading '!' in --inline-dependencies)
my $inline_deps_exclusive = 0;
				46
# Parse options from the command line.
# Each option is bound to a lexical declared in place, so the default
# value sits right next to the option specification. Alternative names
# are separated by a plain '|' (a backslash is not valid in a
# Getopt::Long specification).
GetOptions(
  'auto-textsigle|A=s'              => \(my $auto_textsigle = ''),
  'root|r=s'                        => \(my $root_dir = '.'),
  'input|i=s'                       => \(my $input_fname = ''),
  'output|o=s'                      => \(my $output_fname = ''),
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'no-tokenizer'                    => \(my $no_tokenizer),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s'                 => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'             => \(my $inline_structures = 'struct#structure'),
  'inline-dependencies=s'           => \(my $inline_dependencies),
  'skip-inline-tokens'              => \(my $skip_inline_tokens = 0),

  # Negatable; the deprecated KORAPXMLTEI_INLINE environment variable
  # flips the default from 1 to 0
  'skip-inline-token-annotations!'  => \(
    my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
  'skip-inline-tags=s'              => \(my $skip_inline_tags_str = ''),
  'base-foundry=s'                  => \(my $base_dir = 'base'),
  'data-file=s'                     => \(my $data_file = 'data'),
  'header-file=s'                   => \(my $header_file = 'header'),
  'tokens-file=s'                   => \(my $tokens_file = 'tokens'),
  'xmlid-to-textsigle|x=s'          => \(my $xmlid_to_textsigle = ''),
  'log|l=s'                         => \(my $log_level = 'notice'),
  'required-version|rv=s'           => \(my $required_version),
  'progress|p'                      => \(my $progress),

  # A lone dash on the command line: read the input from STDIN
  ''                                => \(my $stdio),

  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
				90
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	91
# Establish logger (log messages go to STDERR, encoded as UTF-8)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# Abort with exit code 1 if the caller insists on a version other than
# the running one. Surrounding whitespace in the passed value is ignored.
if ($required_version) {

  # Capture into a lexical instead of reading $1 afterwards: $1 keeps
  # the value of an earlier successful match when this one fails.
  my ($wanted) = $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;

  if (!$wanted || $wanted ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  }
}
				105
				106
# Optional rewriting of xml:id values to text sigles.
# --xmlid-to-textsigle has the form PATTERN@REPLACEMENT; the pattern is
# precompiled here, the replacement is applied in the main loop.
my ($what, $with);
unless ($xmlid_to_textsigle eq '') {
  (my $pattern, $with) = split('@', $xmlid_to_textsigle);
  $what = qr!$pattern!;
}

# Optional progress bar, written directly to the terminal.
# If Time::Progress is missing or the terminal cannot be opened,
# the progress bar is switched off with a warning.
my $progress_fh;
if ($progress) {
  my $ready = eval {
    require Time::Progress;
    my $tty = $^O eq 'MSWin32' ? 'CON' : '/dev/tty';
    open($progress_fh, '>', $tty)
      or die "Cannot open $tty";
    $progress_fh->autoflush(1);
    1;
  };

  unless ($ready) {
    # Strip the " at FILE line N" location from the error message
    $log->warn('Progress bar disabled: ' . ($@ =~ s/ at .*//sr));
    $progress = 0;
  }
}
				127
# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# Remember to skip certain inline tags.
# --skip-inline-tags takes a comma separated list; whitespace around
# the commas is optional and ignored (e.g. "a,b" or "a , b").
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
  $skip_inline_tags{$_} = 1 for split /\s*,\s*/, $skip_inline_tags_str;
}
				139
# Tokenizer used in addition to (or instead of) the internal ones;
# stays undefined if neither --tokenizer-call nor --tokenizer-korap is set
my $ext_tok;

# External tokenization
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The wrapper is loaded on demand only
  my $loaded = eval {
    require KorAP::XML::TEI::Tokenizer::KorAP;
    1;
  };
  my $load_error = $loaded ? '' : ($@ || 'unknown error');

  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;

  # Without a version the comparison below would only produce an
  # "uninitialized value" warning and a misleading mismatch message,
  # so report the actual problem instead.
  unless (defined $korap_tok_ver) {
    $log->error(
      'KorAP-Tokenizer version could not be determined' .
      ($load_error ne '' ? ": $load_error" : '')
    );
    exit(1);
  }

  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  }

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
}

# Sentence boundaries come from the tokenizer, so inline <s> tags
# are skipped
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
}
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	172
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;


# The --inline-* options have the form "DIR#FILE". A default file name is
# appended before splitting, so that it takes effect whenever the option
# value contains no '#'.

# Directory and file name for all inline structure information
# (except for inline token information)
my ($_structure_dir, $_structure_file) = split(/#/, "${inline_structures}#structure");

# Directory and file name for all inline token information
my ($_tokens_dir, $_tokens_file) = split(/#/, "${inline_tokens}#morpho");

# A leading '!' in the directory name requests exclusive storage
$inline_tokens_exclusive = 1 if $_tokens_dir =~ s/^!//;


# Directory and file name for inline dependency information
# (only if --inline-dependencies was passed)
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = split(/#/, "${inline_dependencies}#dependency");

  # From here on the variable only serves as a flag
  $inline_dependencies = 1;

  # A leading '!' in the directory name requests exclusive storage
  $inline_deps_exclusive = 1 if $_dep_dir && $_dep_dir =~ s/^!//;
}
				202
				203
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);

# State shared by the main loop:
my $dir         = '';       # text directory (below $root_dir)
my $text_id_esc;            # escaped version of text id
my $input_enc   = 'UTF-8';  # default encoding of the text
my $text_line   = 0;        # text line (needed for whitespace handling)

# Input file handle
my $input_fh;

# Single dash was set: read from STDIN
if ($stdio) {
  $input_fh = *STDIN;
}

# Input flag was passed or file names remain on the command line
elsif (@ARGV || $input_fname ne '') {

  # --input takes precedence over the first positional argument
  $input_fname = shift @ARGV if $input_fname eq '';

  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# No input to process: show a short usage message
else {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
  exit;
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;


# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
  $skip_inline_tokens,
  \%skip_inline_tags,
  $inline_tokens_exclusive,
  $inline_dependencies
);
				259
# Main conversion loop: processes the first input (file or STDIN) and
# afterwards every file name that is still left in @ARGV.
#
# Per input line one of three things is recognized:
#   - the opening tag of the text body: all lines up to the closing tag
#     are collected, parsed and written to the zip stream
#   - a <TEI> element with an xml:id: the id is converted to a text sigle
#   - the start of an idsHeader/teiHeader: the header is parsed and written
#
# NOTE(review): a follow-up file that cannot be opened ends the loop
# silently, because open() is part of the loop condition - confirm that
# this is intended.
do {

  # Put every input handle into binary mode, not only the first one;
  # decoding is done explicitly per line with decode($input_enc, ...)
  binmode $input_fh;

  # Progress reporter (only for named files with a non-zero size)
  my $p;
  if ($progress && $input_fname ne '') {
    my $file_size = -s $input_fname;
    if ($file_size) {
      $p = Time::Progress->new(min => 0, max => $file_size);
      $log->notice("Reading input document $input_fname (Size: $file_size bytes)");
    }
  }
  elsif ($input_fname ne '') {
    $log->notice("Reading input document $input_fname");
  }

  # Counts the lines read by the outer loop (drives the progress updates)
  my $i = 0;

 MAIN:
  while (<$input_fh>) {

    # Update the progress bar on every 500th line
    if ($p && ($i++ % 500 == 0)) {
      print {$progress_fh} $p->report("\r%20b %p ETA: %E", tell($input_fh));
    }

    # remove XML (multi-line) comments (<!--...-->)
    $_ = remove_xml_comments($input_fh, $_);

    # Set input encoding from the XML declaration.
    # Inside a character class \1 is not a backreference but the
    # character chr(1); it is the non-greedy quantifier followed by the
    # real backreference that stops at the closing quote.
    if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
      $input_enc = $2;
      next;
    }

    $_ = decode($input_enc, $_);
    $_ = replace_entities($_);

    # Start of text body
    if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

      # Copy the captures right away, later matches overwrite $1 and $2
      my ($prefix, $suffix) = ($1, $2);

      # Only whitespace is allowed around the opening tag
      if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
        die $log->fatal("input line number $.: " .
                          "line with opening text-body tag '${_TEXT_BODY}' " .
                          "contains additional information ... => Aborting (line=$_)");
      }

      # Text body data extracted from input document ($input_fh),
      # further processed by XML::LibXML::Reader
      my $text_buffer = '';

      # Iterate over all lines in the text body
      while (<$input_fh>) {

        $_ = remove_xml_comments($input_fh, $_);
        $_ = decode($input_enc, $_);
        $_ = replace_entities($_);

        # End of text body
        if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

          # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

          my $before = substr($_, 0, $pos);
          my $after  = substr($_, length("</$_TEXT_BODY>") + $pos);

          # Apart from XML tags (like </body>) only whitespace is allowed
          # on the line with the closing tag
          my $before_check = $before;
          $before_check =~ s/<[^>]+>//g;
          if (($before_check . $after) !~ /^\s*$/) {
            die $log->fatal("input line number $.: " .
                              "line with closing text-body tag '${_TEXT_BODY}'" .
                              " contains additional information ... => Aborting (line=$_)");
          }

          # Add any remaining content before </text> (e.g. </body>) to the buffer
          $before =~ s/^\s+//;
          $before =~ s/\s+$//;
          $text_buffer .= $before if $before ne '';

          # Without a text directory nothing can be written
          if ($dir eq '') {
            $log->warn(
              "Maybe empty textSigle => skipping this text ...\n" .
                'data=' . substr($inline->data->data, 0, 200)
            );
            next MAIN;
          }

          # Parse inline structure
          $inline->parse($text_id_esc, \$text_buffer);

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
          }

          my $data = $inline->data;

          # Write data.xml
          $data->to_zip(
            $zipper->new_stream("$dir/${data_file}.xml"),
            $text_id_esc
          );

          # Tokenize with external tokenizer
          if ($ext_tok) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
              $text_id_esc
            );

            if ($use_tokenizer_sentence_splits) {
              $ext_tok->sentencize_from_previous_input($inline->structures);
            }
          }

          # Tokenize with internal tokenizer
          if ($tokenizer_intern) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            )->reset;

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            )->reset;
          }

          # ~ write structures ~
          unless ($inline->structures->empty) {
            $inline->structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
              $text_id_esc,
              2 # = structure serialization
            );
          }

          # ~ write tokens ~
          unless ($skip_inline_tokens || $inline->tokens->empty) {
            $inline->tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
              $text_id_esc,
              # 0 if inline token annotations are skipped; otherwise
              # 4 if inline dependencies are exclusive, else 1
              ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
            );
          }

          # ~ write dependencies ~
          unless ($inline->dependencies->empty) {
            $inline->dependencies->to_zip(
              $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
              $text_id_esc,
              3 # = dependency serialization
            );
          }

          # reinit.
          $dir = '';

          next MAIN;
        }


        # ~ whitespace handling ~

        # Fix whitespaces (see notes on whitespace fixing)

        # TODO:
        #   Maybe it's best, to keep the stripping of whitespace and
        #   to just remove the if-clause and to insert a blank by default
        #   (with possibly an option on how newlines in primary text should
        #   be handled (stripped or replaced by a whitespace)).

        # Remove consecutive whitespace at beginning and end (mostly one newline)
        s/^\s+//;
        s/\s+$//;

        # NOTE:
        #   this is only relevant, if a text consists of more than one line

        # TODO:
        #   find a better solution, or create a warning, if a text has more
        #   than one line ($text_line > 1)

        # TODO:
        #   do testing with 2 different corpora
        #   (one with only one-line texts, the other with several lines per text)

        # Check if the buffer currently ends inside an open XML tag
        # (last '<' is after last '>'), meaning this line is a continuation of
        # a multi-line element (e.g. attributes split across lines like <ref>).
        # A space must be prepended to avoid "attributes construct error" in the
        # XML parser when two attribute tokens are concatenated without separator.
        my $in_open_tag = ($text_buffer ne '' &&
                             rindex($text_buffer, '<') > rindex($text_buffer, '>'));

        # line contains at least one non-tag character
        if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

          # Increment counter for text lines
          $text_line++;

          # insert blank before 1st character
          # (for 2nd line and consecutive lines, or when continuing an open tag)
          $_ = ' ' . $_ if $text_line > 1 || $in_open_tag;
        }

        # Line is purely within an open tag (attribute continuation):
        # prepend a space so attributes are properly separated.
        elsif ($in_open_tag) {
          $_ = ' ' . $_;
        }

        # add line to buffer
        $text_buffer .= $_;
      }
    }

    # <TEI> element carrying the text id as xml:id
    elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
      my $leadin = $1;
      my $id     = $3;
      my $sigle  = $3;

      if ($what) {
        # SECURITY: the replacement part of --xmlid-to-textsigle is
        # evaluated as Perl code here, because it may refer to capture
        # groups, e.g. s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@.
        # Never pass untrusted input to that option.
        # The substitution works on $sigle directly, so $_ keeps the
        # input line for the error messages below.
        eval "\$sigle =~ s|$what|$with|; 1"
          or $log->warn("Conversion of text id `$id' failed: $@");
        $log->debug("Converted text id `$id' to sigle `$sigle'");
      }
      $sigle =~ s/\./-/g;

      # A text sigle consists of corpus, document and text part
      my @parts = split(/[\/_]/, $sigle);
      if (@parts != 3) {
        die $log->fatal(
          "input line number $.: " .
            "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
            "=> Aborting (line=$_)");
      }

      $dir = join("/", @parts);
      $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
      $log->notice("$0: text_id=$text_id_esc");

      # Only whitespace is allowed in front of the element
      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      }
    }

    # Start of header section
    elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
      my $leadin  = $1;
      my $content = "$2\n";

      # Only whitespace is allowed in front of the header tag
      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      }

      # Parse header (reads the rest of the header from the input handle)
      my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
      if ($auto_textsigle) {
        $auto_textsigle = increase_auto_textsigle($auto_textsigle);
        $log->debug("Auto-incremented text sigle to $auto_textsigle");
      }

      # Header was parseable
      if ($header) {

        # Write header to zip
        my $file = $header->dir . '/' . $header_file . '.xml';

        $log->debug("Writing file $file") if DEBUG;

        $header->to_zip($zipper->new_stream($file));

        # Header is for text level
        if ($header->type eq 'text') {

          # Remember dir and sigles
          $dir         = $header->dir;
          $text_id_esc = $header->id_esc;

          # log output for seeing progression
          $log->notice("$0: text_id=$text_id_esc");

          # Reset counter for text lines
          # (needed for whitespace handling)
          $text_line = 0;
        }
      }
    }
  }

  $text_id_esc = $auto_textsigle if ($auto_textsigle);

  # Final progress report for this input, terminated by a newline
  if ($p) {
    print {$progress_fh} $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
  }
} while (($input_fname = shift(@ARGV)) && open($input_fh, '<', $input_fname));

$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;
				566
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	567
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	568	__END__
				569
				570	=pod
				571
				572	=encoding utf8
				573
				574	=head1 NAME
				575
				576	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				577
				578	=head1 SYNOPSIS
				579
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	580	cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip
				581	tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	582
				583	=head1 DESCRIPTION
				584
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	585	C<tei2korapxml> is a script to convert TEI P5 and
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	586	L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	587	based documents to the
				588	L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	589
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	590	This program is usually called from inside another script.
				591
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	592	=head1 FORMATS
				593
				594	=head2 Input restrictions
				595
				596	=over 2
				597
				598	=item
				599
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	600	TEI P5 formatted input with certain restrictions:
				601
				602	=over 4
				603
				604	=item
				605
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	606	B<mandatory>: text-header with integrated textsigle
				607	(or convertible identifier), text-body
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	608
				609	=item
				610
				611	B<optional>: corp-header with integrated corpsigle,
				612	doc-header with integrated docsigle
				613
				614	=back
				615
				616	=item
				617
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	618	Tokens inside the primary text must not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	619	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	620	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	621	into blanks between two tokens could lead to additional blanks
				622	where there should be none (e.g. punctuation characters like C<,> or
				623	C<.> should not be separated from their predecessor token).
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	624	(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	625
Akron	940ca6f	2021-10-11 12:38:39 +0200	[diff] [blame]	626	=item
				627
				628	Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
				629	need to be defined in the same line as the header tag.
				630
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	631	=back
				632
				633	=head2 Notes on the output
				634
				635	=over 2
				636
				637	=item
				638
				639	zip file output (default on C<stdout>) with utf8 encoded entries
				640	(which together form the KorAP-XML format)
				641
				642	=back
				643
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	644	=head1 INSTALLATION
				645
Akron	d26319b	2023-01-12 15:34:41 +0100	[diff] [blame]	646	C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietz	e83a4e9	2021-03-16 20:51:26 +0100	[diff] [blame]	647	When these requirements are met, the preferred way to install the script is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	648	to use L<cpanm|App::cpanminus>.
				649
				650	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				651
				652	In case everything went well, the C<tei2korapxml> tool will
				653	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	654
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	655	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				656
				657	=head1 OPTIONS
				658
				659	=over 2
				660
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	661	=item B<--input|-i>
				662
				663	The input file to process. If no specific input is defined and a single
				664	dash C<-> is passed as an argument, data is read from C<STDIN>.
				665
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	666	Instead of using C<-i> input files can also be defined as trailing arguments
				667	to the command:
				668
				669	tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
				670
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	671	=item B<--output|-o>
				672
				673	The output zip file to be created. If no specific output is defined,
				674	data is written to C<STDOUT>.
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	675
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	676	=item B<--root|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	677
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	678	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	679
				680	=item B<--help|-h>
				681
				682	Print help information.
				683
				684	=item B<--version|-v>
				685
				686	Print version information.
				687
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	688	=item B<--tokenizer-korap|-tk>
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	689
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	690	Use the standard KorAP/DeReKo tokenizer.
				691
				692	=item B<--tokenizer-internal|-ti>
				693
				694	Tokenize the data using two embedded tokenizers,
				695	that will take an I<aggressive> and a I<conservative>
				696	approach.
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	697
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	698	=item B<--tokenizer-call|-tc>
				699
				700	Call an external tokenizer process that will tokenize
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	701	from STDIN and output the offsets of all tokens.
				702
				703	Texts are separated using C<\x04\n>. The external process
				704	should add a new line per text.
				705
				706	If the L</--use-tokenizer-sentence-splits> option is activated,
				707	sentences are marked by offset as well in new lines.
				708
				709	To use L<Datok|https://github.com/KorAP/Datok> including sentence
				710	splitting, call C<tei2korapxml> as follows:
				711
				712	$ cat corpus.i5.xml | tei2korapxml -s \
				713	$ -tc 'datok tokenize \
				714	$ -t ./tokenizer.matok \
				715	$ -p --newline-after-eot --no-sentences \
				716	$ --no-tokens --sentence-positions -' - \
				717	$ > corpus.korapxml.zip
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	718
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	719	=item B<--no-tokenizer>
				720
				721	Boolean flag indicating that no tokenizer should be used.
				722	This is meant to ensure that by default a final token layer always
				723	exists.
				724	If a separate tokenizer is chosen, this flag is ignored.
				725
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	726	=item B<--skip-inline-tokens>
				727
				728	Boolean flag indicating that inline tokens should not
				729	be processed. Defaults to false (meaning inline tokens will be processed).
				730
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	731	=item B<--skip-inline-token-annotations>
				732
				733	Boolean flag indicating that inline token annotations should not
				734	be processed. Defaults to true (meaning inline token annotations
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	735	won't be processed). Can be negated with
				736	C<--no-skip-inline-token-annotations>.
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	737
Akron	ca70a1d	2021-02-25 16:21:31 +0100	[diff] [blame]	738	=item B<--skip-inline-tags> <tags>
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	739
				740	Expects a comma-separated list of tags to be ignored when the structure
				741	is parsed. Content of these tags however will be processed.
				742
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	743	=item B<--auto-textsigle> <textsigle>
				744
				745	Expects a text sigle that serves as a fallback if no text sigles
				746	are given in the input data.
				747	The auto text sigle will be incremented for each text processed.
				748
				749	Example:
				750
				751	tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
				752	< data.i5.xml > korapxml.zip
				753
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	754	=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
				755
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	756	Expects a regular replacement expression (separated by B<@> between the
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	757	search and the replacement) to convert text id attributes to text sigles
				758	with three parts (separated by B</>).
				759
				760	Example:
				761
				762	tei2korapxml \
				763	--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
				764	-tk - < t/data/icc_german_sample.p5.xml
				765
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	766	Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
				767	sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	768
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	769	=item B<--inline-tokens> <foundry>#[<file>]
				770
				771	Define the foundry and file (without extension)
				772	to store inline token information in.
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	773	Unless C<--skip-inline-token-annotations> is set,
				774	this will contain annotations as well.
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	775	Defaults to C<tokens> and C<morpho>.
				776
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	777	The inline token data will also be stored in the
				778	inline structures file (see I<--inline-structures>),
				779	unless the inline token foundry is prepended
				780	by an B<!> exclamation mark, indicating that inline
				781	tokens are stored exclusively in the inline tokens
				782	file.
				783
				784	Example:
				785
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	786	tei2korapxml --no-tokenizer --inline-tokens \
				787	'!gingko#morpho' < data.i5.xml > korapxml.zip
				788
				789	=item B<--inline-dependencies> <foundry>#[<file>]
				790
				791	Define the foundry and file (without extension)
				792	to store inline dependency information in.
				793	Defaults to the layer of C<dependency> and
				794	will be ignored if not set (which means, dependency
				795	attributes will be stored in the inline tokens file,
				796	if not skipped).
				797
				798	The dependency data will also be stored in the
				799	inline token file (see I<--inline-tokens>),
				800	unless the inline dependencies foundry is prepended
				801	by an B<!> exclamation mark, indicating that inline
				802	dependency data is stored exclusively in the inline
				803	dependencies file.
				804
				805	Example:
				806
				807	tei2korapxml --no-tokenizer --inline-dependencies \
				808	'gingko#dependency' < data.i5.xml > korapxml.zip
				809
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	810
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	811	=item B<--inline-structures> <foundry>#[<file>]
				812
				813	Define the foundry and file (without extension)
				814	to store inline structure information in.
				815	Defaults to C<struct> and C<structures>.
				816
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	817	=item B<--base-foundry> <foundry>
				818
				819	Define the base foundry to store newly generated
				820	token information in.
				821	Defaults to C<base>.
				822
				823	=item B<--data-file> <file>
				824
				825	Define the file (without extension)
				826	to store primary data information in.
				827	Defaults to C<data>.
				828
				829	=item B<--header-file> <file>
				830
				831	Define the file name (without extension)
				832	to store header information on
				833	the corpus, document, and text level in.
				834	Defaults to C<header>.
				835
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	836	=item B<--use-tokenizer-sentence-splits|-s>
				837
				838	Replace existing with, or add new, sentence boundary information
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	839	provided by the tokenizer.
				840	Currently KorAP-tokenizer and certain external tokenizers support
				841	these boundaries.
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	842
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	843	=item B<--tokens-file> <file>
				844
				845	Define the file (without extension)
				846	to store generated token information in
				847	(either from the KorAP tokenizer or an externally called tokenizer).
				848	Defaults to C<tokens>.
				849
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	850	=item B<--log|-l>
				851
				852	Loglevel for I<Log::Any>. Defaults to C<notice>.
				853
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	854	=back
				855
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	856	=head1 ENVIRONMENT VARIABLES
				857
				858	=over 2
				859
				860	=item B<KORAPXMLTEI_DEBUG>
				861
				862	Activate minimal debugging.
				863	Defaults to C<false>.
				864
Marc Kupietz	d254f5c	2025-04-16 10:37:08 +0200	[diff] [blame]	865	=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
				866
				867	Set the heap size for the tokenizer process.
				868	Defaults to C<512m>.
				869
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	870	=back
				871
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	872	=head1 COPYRIGHT AND LICENSE
				873
Marc Kupietz	b6fd6bc	2025-04-16 12:47:26 +0200	[diff] [blame]	874	Copyright (C) 2021-2025, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	875
				876	Author: Peter Harders
				877
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	878	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	879
				880	L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
				881	Corpus Analysis Platform at the
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	882	L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	883	member of the
				884	L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
				885
				886	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	887	L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	888
				889	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	890
				891	# NOTES
				892
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	893	## Notes on segfault prevention
				894
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	895	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	896	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				897	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				898	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				899	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.