Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 953fc4452872da8874afa92ee3a5aa59cdaade21 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	9	use KorAP::XML::TEI qw(increase_auto_textsigle);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	10
				11	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	12
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	13	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	15	use FindBin;
				16	BEGIN {
				17	unshift @INC, "$FindBin::Bin/../lib";
				18	};
				19
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	20	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	21	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	22	use KorAP::XML::TEI::Tokenizer::Conservative;
				23	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	25	use KorAP::XML::TEI::Header;
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	26	use KorAP::XML::TEI::Inline;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	27
Marc Kupietz	ef5dfd3	2026-03-05 10:02:47 +0100	[diff] [blame]	28	our $VERSION = '2.7.1';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	29
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	30	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				31
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	32	use constant {
				33	# Set KORAPXMLTEI_DEBUG to 1 for some additional debug output (no need to be parametrized)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	34	DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	35	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	36
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	37	if ($ENV{KORAPXMLTEI_INLINE}) {
				38	warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
				39	};
				40
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	41	# Inline tokens won't be stored in the structure file
				42	my $inline_tokens_exclusive = 0;
				43
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	44	# Inline dependencies won't be stored in the tokens file
				45	my $inline_deps_exclusive = 0;
				46
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	47	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	48	GetOptions(
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	49	'auto-textsigle\|A=s' => \(my $auto_textsigle = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	50	'root\|r=s' => \(my $root_dir = '.'),
				51	'input\|i=s' => \(my $input_fname = ''),
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	52	'output\|o=s' => \(my $output_fname = ''),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	53	'tokenizer-call\|tc=s' => \(my $tokenizer_call),
				54	'tokenizer-korap\|tk' => \(my $tokenizer_korap),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	55	'tokenizer-internal\|ti' => \(my $tokenizer_intern),
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	56	'no-tokenizer' => \(my $no_tokenizer),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	57	'use-tokenizer-sentence-splits\|s' => \(my $use_tokenizer_sentence_splits),
				58	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
				59	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	60	'inline-dependencies=s' => \(my $inline_dependencies),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	61	'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	62	'skip-inline-token-annotations!' => \(
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	63	my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	64	'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	65	'base-foundry=s' => \(my $base_dir = 'base'),
				66	'data-file=s' => \(my $data_file = 'data'),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	67	'header-file=s' => \(my $header_file = 'header'),
				68	'tokens-file=s' => \(my $tokens_file = 'tokens'),
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	69	'xmlid-to-textsigle\|x=s'=> \(my $xmlid_to_textsigle = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	70	'log\|l=s' => \(my $log_level = 'notice'),
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	71	'required-version\|rv=s' => \(my $required_version),
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	72	'progress\|p' => \(my $progress),
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	73	'' => \(my $stdio),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	74	'help\|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	75	pod2usage(
				76	-verbose => 99,
				77	-sections => 'NAME\|DESCRIPTION\|SYNOPSIS\|ARGUMENTS\|OPTIONS',
				78	-msg => $VERSION_MSG,
				79	-output => '-'
				80	)
				81	},
				82	'version\|v' => sub {
				83	pod2usage(
				84	-verbose => 0,
				85	-msg => $VERSION_MSG,
				86	-output => '-'
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	87	);
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	88	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	89	);
				90
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	91
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	92	# Establish logger
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	93	binmode(STDERR, ':encoding(UTF-8)');
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	94	Log::Any::Adapter->set('Stderr', log_level => $log_level);
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	95	$log->notice('Debugging is activated') if DEBUG;
				96
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	97
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	98	if ($required_version) {
Marc Kupietz	2475c95	2024-01-09 10:40:04 +0100	[diff] [blame]	99	$required_version =~ /^\s(\d+\.\d+\.\d+(-TRIAL)?)\s$/;
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	100	if (!$1 \|\| $1 ne $VERSION) {
				101	$log->error("Required version $required_version mismatches version $VERSION");
				102	exit(1);
				103	};
				104	};
				105
				106
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	107	my ($what, $with);
				108	if ($xmlid_to_textsigle ne '') {
				109	($what, $with) = split('@', $xmlid_to_textsigle);
				110	$what = qr!$what!;
				111	};
				112
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	113	if ($progress) {
				114	eval {
				115	require Time::Progress;
				116	1;
				117	} or do {
				118	$log->warn('Time::Progress not installed. Progress bar disabled.');
				119	$progress = 0;
				120	}
				121	};
				122
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	123	# tag (without attributes), which contains the primary text
				124	my $_TEXT_BODY = 'text';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	125	# optional
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	126
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	127	# Remember to skip certain inline tags
				128	my %skip_inline_tags = ();
				129	if ($skip_inline_tags_str) {
				130	foreach (split /\s,\s/, $skip_inline_tags_str) {
				131	$skip_inline_tags{$_} = 1;
				132	};
				133	};
				134
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	135	# External tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	136	my $ext_tok;
				137	if ($tokenizer_call) {
				138	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	139	$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	140	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	141
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	142	# KorAP tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	143	elsif ($tokenizer_korap) {
Akron	bd4281e	2022-03-28 08:31:40 +0200	[diff] [blame]	144	eval {
				145	require KorAP::XML::TEI::Tokenizer::KorAP;
				146	1;
				147	};
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	148
				149	my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;
				150	if ($korap_tok_ver ne $VERSION) {
				151	$log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
				152	exit(1);
				153	};
				154
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	155	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	156	}
				157
				158	# No tokenizer chosen at all (and --no-tokenizer was not passed)
				159	elsif (!$tokenizer_intern && !$no_tokenizer) {
				160	$log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
				161	exit(1);
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	162	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	163
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	164	if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	165	$skip_inline_tags{s} = 1;
				166	};
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	167
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	168	# Internal tokenization
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	169	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				170	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	171
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	172
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	173	# Name of the directory and the file containing all inline structure information
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	174	# except for $_TOKENS_TAG information
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	175	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	176
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	177	# Name of the directory and the file containing all inline token information
				178	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				179	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	180
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	181	if (index($_tokens_dir, '!') == 0) {
				182	$_tokens_dir = substr($_tokens_dir, 1);
				183	$inline_tokens_exclusive = 1;
				184	};
				185
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	186
				187	my ($_dep_dir, $_dep_file);
				188	if ($inline_dependencies) {
				189	($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';
				190	$inline_dependencies = 1;
				191
				192	if ($_dep_dir && index($_dep_dir, '!') == 0) {
				193	$_dep_dir = substr($_dep_dir, 1);
				194	$inline_deps_exclusive = 1;
				195	};
				196	};
				197
				198
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	199	# Initialize zipper
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	200	my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	201
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	202	# text directory (below $root_dir)
				203	my $dir = '';
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	204
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	205	# Escaped version of text id
				206	my $text_id_esc;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	207
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	208	# Default encoding of the text
				209	my $input_enc = 'UTF-8';
				210
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	211	# text line (needed for whitespace handling)
				212	my $text_line = 0;
				213
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	214
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	215	# Input file handle (default: stdin)
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	216	my $input_fh;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	217
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	218	# Single dash was set
				219	if ($stdio) {
				220	$input_fh = *STDIN;
				221	}
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	222	# Input flag or a positional file argument was passed
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	223	elsif (@ARGV \|\| $input_fname ne '') {
				224	unless ($input_fname ne '') {
				225	$input_fname = shift @ARGV;
				226	};
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	227	unless (open($input_fh, '<', $input_fname)) {
				228	die $log->fatal("File '$input_fname' could not be opened.");
				229	};
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	230	}
				231
				232	# No input to process
				233	else {
				234	pod2usage(
				235	-verbose => 99,
				236	-sections => 'NAME\|SYNOPSIS',
				237	-msg => $VERSION_MSG,
				238	-output => '-'
				239	);
				240	exit;
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	241	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	242
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	243	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	244	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	245
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	246
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	247	# Create inline parser object
				248	my $inline = KorAP::XML::TEI::Inline->new(
				249	$skip_inline_tokens,
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	250	\%skip_inline_tags,
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	251	$inline_tokens_exclusive,
				252	$inline_dependencies
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	253	);
				254
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	255	do {
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	256	my $p;
				257	if ($progress && $input_fname ne '') {
				258	my $file_size = -s $input_fname;
				259	if ($file_size) {
				260	$p = Time::Progress->new(min => 0, max => $file_size);
				261	$log->notice("Reading input document $input_fname (Size: $file_size bytes)");
				262	}
				263	} elsif ($input_fname ne '') {
				264	$log->notice("Reading input document $input_fname");
				265	};
				266
				267	my $i = 0;
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	268	MAIN:
				269	while (<$input_fh>) {
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	270
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	271	if ($p && ($i++ % 500 == 0)) {
				272	print STDERR $p->report("\r%20b %p ETA: %E", tell($input_fh));
				273	};
				274
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	275	# remove XML (multi-line) comments (<!--...-->)
				276	$_ = remove_xml_comments($input_fh, $_);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	277
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	278	# Set input encoding
				279	if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
				280	$input_enc = $2;
				281	next;
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	282	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	283
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	284	$_ = decode($input_enc, $_);
				285	$_ = replace_entities($_);
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	286
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	287	# Start of text body
				288	if (index($_, $_TEXT_BODY) >= 0 && m#^(.)<${_TEXT_BODY}(?: [^>])?>(.*)$#) {
				289	my $suffix = $2;
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	290
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	291	if ($1 !~ /^\s$/ \|\| $suffix !~ /^\s$/) {
				292	die $log->fatal("input line number $.: " .
				293	"line with opening text-body tag '${_TEXT_BODY}' " .
				294	"contains additional information ... => Aborting (line=$_)");
				295	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	296
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	297	# Text body data extracted from input document ($input_fh),
				298	# further processed by XML::LibXML::Reader
				299	my $text_buffer = '';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	300
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	301	# Iterate over all lines in the text body
				302	while (<$input_fh>) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	303
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	304	$_ = remove_xml_comments($input_fh, $_);
				305	$_ = decode($input_enc, $_);
				306	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	307
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	308	# End of text body
				309	if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {
				310
				311	# write data.xml, structure.xml and, where applicable, morpho.xml and/or tokenization files
				312
Marc Kupietz	ff061ef	2026-03-05 09:59:35 +0100	[diff] [blame]	313	my $before = substr($_, 0, $pos);
				314	my $after = substr($_, length("</$_TEXT_BODY>") + $pos);
				315	my $before_check = $before;
				316	$before_check =~ s/<[^>]+>//g; # strip XML tags like </body>
				317	if (($before_check . $after) !~ /^\s*$/) {
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	318	die $log->fatal("input line number $.: " .
				319	"line with closing text-body tag '${_TEXT_BODY}'" .
				320	" contains additional information ... => Aborting (line=$_)");
				321	};
				322
Marc Kupietz	ff061ef	2026-03-05 09:59:35 +0100	[diff] [blame]	323	# Add any remaining content before </text> (e.g. </body>) to the buffer
				324	$before =~ s/^\s+//;
				325	$before =~ s/\s+$//;
				326	$text_buffer .= $before if $before ne '';
				327
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	328	if ($dir eq '') {
				329	$log->warn(
				330	"Maybe empty textSigle => skipping this text ...\n" .
				331	'data=' . substr($inline->data->data, 0, 200)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	332	);
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	333	next MAIN;
				334	};
				335
				336	# Parse inline structure
				337	$inline->parse($text_id_esc, \$text_buffer);
				338
				339	if (DEBUG) {
				340	$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
				341	};
				342
				343	my $data = $inline->data;
				344
				345	# Write data.xml
				346	$data->to_zip(
				347	$zipper->new_stream("$dir/${data_file}.xml"),
				348	$text_id_esc
				349	);
				350
				351	# Tokenize with external tokenizer
				352	if ($ext_tok) {
				353
				354	# Tokenize and output
				355	$ext_tok->tokenize($data->data)->to_zip(
				356	$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
				357	$text_id_esc
				358	);
				359
				360	if ($use_tokenizer_sentence_splits) {
				361	$ext_tok->sentencize_from_previous_input($inline->structures);
				362	};
				363	};
				364
				365	# Tokenize with internal tokenizer
				366	if ($tokenizer_intern) {
				367
				368	# Tokenize and output
				369	$cons_tok->tokenize($data->data)->to_zip(
				370	$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
				371	$text_id_esc
				372	)->reset;
				373
				374	$aggr_tok->tokenize($data->data)->to_zip(
				375	$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
				376	$text_id_esc
				377	)->reset;
				378	};
				379
				380	# ~ write structures ~
				381	unless ($inline->structures->empty) {
				382	$inline->structures->to_zip(
				383	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
				384	$text_id_esc,
				385	2 # = structure serialization
				386	);
				387	};
				388
				389	# ~ write tokens ~
				390	unless ($skip_inline_tokens \|\| $inline->tokens->empty) {
				391	$inline->tokens->to_zip(
				392	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
				393	$text_id_esc,
				394	# Either 0 = tokens without inline annotations,
				395	# 1 = tokens with inline annotations, or 4 if inline dependencies are exclusive
				396	# !$skip_inline_token_annotations
				397	($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
				398	);
				399	};
				400
				401	# ~ write dependencies ~
				402	unless ($inline->dependencies->empty) {
				403	$inline->dependencies->to_zip(
				404	$zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
				405	$text_id_esc,
				406	3 # = dependency serialization
				407	);
				408	};
				409
				410
				411	# reinit.
				412	$dir = '';
				413
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	414	next MAIN;
				415	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	416
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	417
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	418	# ~ whitespace handling ~
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	419
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	420	# Fix whitespaces (see notes on whitespace fixing)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	421
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	422	# TODO:
				423	# Maybe it's best, to keep the stripping of whitespace and
				424	# to just remove the if-clause and to insert a blank by default
				425	# (with possibly an option on how newlines in primary text should
				426	# be handled (stripped or replaced by a whitespace)).
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	427
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	428	# Remove consecutive whitespace at beginning and end (mostly one newline)
				429	s/^\s+//;
				430	s/\s+$//;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	431
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	432	# NOTE:
				433	# this is only relevant, if a text consists of more than one line
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	434
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	435	# TODO:
				436	# find a better solution, or create a warning, if a text has more
				437	# than one line ($text_line > 1)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	438
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	439	# TODO:
				440	# do testing with 2 different corpora
				441	# (one with only one-line texts, the other with several lines per text)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	442
Marc Kupietz	a84fcb5	2026-03-05 17:22:43 +0100	[diff] [blame^]	443	# Check if the buffer currently ends inside an open XML tag
				444	# (last '<' is after last '>'), meaning this line is a continuation of
				445	# a multi-line element (e.g. attributes split across lines like <ref>).
				446	# A space must be prepended to avoid "attributes construct error" in the
				447	# XML parser when two attribute tokens are concatenated without separator.
				448	my $in_open_tag = ($text_buffer ne '' &&
				449	rindex($text_buffer, '<') > rindex($text_buffer, '>'));
				450
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	451	# line contains at least one non-tag character
				452	if (m/^[^<]*$/ \|\| m/(?:<[^>]+>[^<])\|(?:[^<]<[^>]+>)/) {
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	453
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	454	# Increment counter for text lines
				455	$text_line++;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	456
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	457	# insert blank before 1st character
Marc Kupietz	a84fcb5	2026-03-05 17:22:43 +0100	[diff] [blame^]	458	# (for 2nd line and consecutive lines, or when continuing an open tag)
				459	$_ = ' ' . $_ if $text_line > 1 \|\| $in_open_tag;
				460	}
				461
				462	# Line is purely within an open tag (attribute continuation):
				463	# prepend a space so attributes are properly separated.
				464	elsif ($in_open_tag) {
				465	$_ = ' ' . $_;
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	466	}
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	467
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	468	# add line to buffer
				469	$text_buffer .= $_;
				470	};
				471	}
				472	elsif (m#^(.)\<TEI\s+[^>]?xml:id=(["'])(.+?)\2#) {
				473	my $leadin = $1;
				474	my $id = $3;
				475	my $sigle = $3;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	476
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	477	if ($what) {
				478	$_ = $id;
				479	eval "s\|$what\|$with\|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
				480	$sigle = $_;
				481	$log->debug("Converted text id `$id' to sigle `$sigle'");
				482	};
				483	$sigle =~ s/\./-/g;
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	484
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	485	my @parts = split(/[\/_]/, $sigle);
				486	if (@parts != 3) {
				487	die $log->fatal(
				488	"input line number $.: " .
				489	"ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
				490	"=> Aborting (line=$_)");
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	491	};
				492
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	493	$dir = join("/", @parts);
				494	$text_id_esc = "$parts[0]/$parts[1].$parts[2]";
				495	$log->notice("$0: text_id=$text_id_esc");
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	496
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	497	if ($leadin !~ /^\s*$/) {
				498	die $log->fatal(
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	499	"input line number $.: " .
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	500	'line with opening header tag is not in expected format ... ' .
				501	"=> Aborting (line=$_)");
				502	};
				503	}
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	504
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	505	# Start of header section
				506	elsif (m#^(.)(\<(?:ids\|tei)Header.)$#) {
				507	my $content = "$2\n";
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	508
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	509	if ($1 !~ /^\s*$/) {
				510	die $log->fatal(
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	511	"input line number $.: " .
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	512	'line with opening header tag is not in expected format ... ' .
				513	"=> Aborting (line=$_)");
				514	};
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	515
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	516	# Parse header
				517	my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
				518	if ($auto_textsigle) {
				519	$auto_textsigle = increase_auto_textsigle($auto_textsigle);
				520	$log->debug("Auto-incremented text sigle to $auto_textsigle");
				521	};
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	522
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	523	# Header was parseable
				524	if ($header) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	525
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	526	# Write header to zip
				527	my $file = $header->dir . '/' . $header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	528
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	529	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	530
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	531	$header->to_zip($zipper->new_stream($file));
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	532
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	533	# Header is for text level
				534	if ($header->type eq 'text') {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	535
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	536	# Remember dir and sigles
				537	$dir = $header->dir;
				538	$text_id_esc = $header->id_esc;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	539
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	540	# log output for seeing progression
				541	$log->notice("$0: text_id=$text_id_esc");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	542
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	543	# Reset counter for text lines
				544	# (needed for whitespace handling)
				545	$text_line = 0;
				546	};
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	547	};
				548	};
				549	};
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	550	$text_id_esc = $auto_textsigle if ($auto_textsigle);
Marc Kupietz	2115ecc	2025-12-10 11:37:03 +0100	[diff] [blame]	551
				552	if ($p) {
				553	print STDERR $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
				554	};
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	555	} while (($input_fname = shift(@ARGV)) && open($input_fh, '<', $input_fname));
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	556	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	557
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	558	$ext_tok->close if $ext_tok;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	559
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	560	close $input_fh;
				561
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	562
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	563	__END__
				564
				565	=pod
				566
				567	=encoding utf8
				568
				569	=head1 NAME
				570
				571	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				572
				573	=head1 SYNOPSIS
				574
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	575	cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip
				576	tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	577
				578	=head1 DESCRIPTION
				579
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	580	C<tei2korapxml> is a script to convert TEI P5 and
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	581	L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	582	based documents to the
				583	L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	584
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	585	This program is usually called from inside another script.
				586
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	587	=head1 FORMATS
				588
				589	=head2 Input restrictions
				590
				591	=over 2
				592
				593	=item
				594
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	595	TEI P5 formatted input with certain restrictions:
				596
				597	=over 4
				598
				599	=item
				600
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	601	B<mandatory>: text-header with integrated textsigle
				602	(or convertible identifier), text-body
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	603
				604	=item
				605
				606	B<optional>: corp-header with integrated corpsigle,
				607	doc-header with integrated docsigle
				608
				609	=back
				610
				611	=item
				612
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	613	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	614	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	615	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	616	into blanks between 2 tokens could lead to additional blanks,
				617	where there should be none (e.g.: punctuation characters like C<,> or
				618	C<.> should not be separated from their predecessor token).
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	619	(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	620
Akron	940ca6f	2021-10-11 12:38:39 +0200	[diff] [blame]	621	=item
				622
				623	Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
				624	need to be defined in the same line as the header tag.
				625
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	626	=back
				627
				628	=head2 Notes on the output
				629
				630	=over 2
				631
				632	=item
				633
				634	zip file output (default on C<stdout>) with utf8 encoded entries
				635	(which together form the KorAP-XML format)
				636
				637	=back
				638
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	639	=head1 INSTALLATION
				640
Akron	d26319b	2023-01-12 15:34:41 +0100	[diff] [blame]	641	C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietz	e83a4e9	2021-03-16 20:51:26 +0100	[diff] [blame]	642	When these requirements are met, the preferred way to install the script is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	643	to use L<cpanm\|App::cpanminus>.
				644
				645	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				646
				647	In case everything went well, the C<tei2korapxml> tool will
				648	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	649
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	650	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				651
				652	=head1 OPTIONS
				653
				654	=over 2
				655
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	656	=item B<--input\|-i>
				657
				658	The input file to process. If no specific input is defined and a single
				659	dash C<-> is passed as an argument, data is read from C<STDIN>.
				660
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	661	Instead of using C<-i> input files can also be defined as trailing arguments
				662	to the command:
				663
				664	tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
				665
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	666	=item B<--output\|-o>
				667
				668	The output zip file to be created. If no specific output is defined,
				669	data is written to C<STDOUT>.
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	670
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	671	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	672
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	673	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	674
				675	=item B<--help\|-h>
				676
				677	Print help information.
				678
				679	=item B<--version\|-v>
				680
				681	Print version information.
				682
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	683	=item B<--tokenizer-korap\|-tk>
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	684
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	685	Use the standard KorAP/DeReKo tokenizer.
				686
				687	=item B<--tokenizer-internal\|-ti>
				688
				689	Tokenize the data using two embedded tokenizers,
				690	that will take an I<aggressive> and a I<conservative>
				691	approach.
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	692
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	693	=item B<--tokenizer-call\|-tc>
				694
				695	Call an external tokenizer process, that will tokenize
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	696	from STDIN and output the offsets of all tokens.
				697
				698	Texts are separated using C<\x04\n>. The external process
				699	should add a new line per text.
				700
				701	If the L</--use-tokenizer-sentence-splits> option is activated,
				702	sentences are marked by offset as well in new lines.
				703
				704	To use L<Datok\|https://github.com/KorAP/Datok> including sentence
				705	splitting, call C<tei2korapxml> as follows:
				706
				707	$ cat corpus.i5.xml \| tei2korapxml -s \
				708	$ -tc 'datok tokenize \
				709	$ -t ./tokenizer.matok \
				710	$ -p --newline-after-eot --no-sentences \
				711	$ --no-tokens --sentence-positions -' - \
				712	$ > corpus.korapxml.zip
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	713
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	714	=item B<--no-tokenizer>
				715
				716	Boolean flag indicating that no tokenizer should be used.
				717	This is meant to ensure that by default a final token layer always
				718	exists.
				719	If a separate tokenizer is chosen, this flag is ignored.
				720
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	721	=item B<--skip-inline-tokens>
				722
				723	Boolean flag indicating that inline tokens should not
				724	be processed. Defaults to false (meaning inline tokens will be processed).
				725
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	726	=item B<--skip-inline-token-annotations>
				727
				728	Boolean flag indicating that inline token annotations should not
				729	be processed. Defaults to true (meaning inline token annotations
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	730	won't be processed). Can be negated with
				731	C<--no-skip-inline-token-annotations>.
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	732
Akron	ca70a1d	2021-02-25 16:21:31 +0100	[diff] [blame]	733	=item B<--skip-inline-tags> <tags>
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	734
				735	Expects a comma-separated list of tags to be ignored when the structure
				736	is parsed. Content of these tags however will be processed.
				737
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	738	=item B<--auto-textsigle> <textsigle>
				739
				740	Expects a text sigle that serves as fallback if no text sigles
				741	are given in the input data.
				742	The auto text sigle will be incremented for each text processed.
				743
				744	Example:
				745
				746	tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
				747	< data.i5.xml > korapxml.zip
				748
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	749	=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
				750
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	751	Expects a regular replacement expression (separated by B<@> between the
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	752	search and the replacement) to convert text id attributes to text sigles
				753	with three parts (separated by B</>).
				754
				755	Example:
				756
				757	tei2korapxml \
				758	--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
				759	-tk - < t/data/icc_german_sample.p5.xml
				760
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	761	Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
				762	sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	763
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	764	=item B<--inline-tokens> <foundry>#[<file>]
				765
				766	Define the foundry and file (without extension)
				767	to store inline token information in.
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	768	Unless C<--skip-inline-token-annotations> is set,
				769	this will contain annotations as well.
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	770	Defaults to C<tokens> and C<morpho>.
				771
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	772	The inline token data will also be stored in the
				773	inline structures file (see I<--inline-structures>),
				774	unless the inline token foundry is prepended
				775	by an B<!> exclamation mark, indicating that inline
				776	tokens are stored exclusively in the inline tokens
				777	file.
				778
				779	Example:
				780
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	781	tei2korapxml --no-tokenizer --inline-tokens \
				782	'!gingko#morpho' < data.i5.xml > korapxml.zip
				783
				784	=item B<--inline-dependencies> <foundry>#[<file>]
				785
				786	Define the foundry and file (without extension)
				787	to store inline dependency information in.
				788	Defaults to the layer of C<dependency> and
				789	will be ignored if not set (which means, dependency
				790	attributes will be stored in the inline tokens file,
				791	if not skipped).
				792
				793	The dependency data will also be stored in the
				794	inline token file (see I<--inline-tokens>),
				795	unless the inline dependencies foundry is prepended
				796	by an B<!> exclamation mark, indicating that inline
				797	dependency data is stored exclusively in the inline
				798	dependencies file.
				799
				800	Example:
				801
				802	tei2korapxml --no-tokenizer --inline-dependencies \
				803	'gingko#dependency' < data.i5.xml > korapxml.zip
				804
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	805
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	806	=item B<--inline-structures> <foundry>#[<file>]
				807
				808	Define the foundry and file (without extension)
				809	to store inline structure information in.
				810	Defaults to C<struct> and C<structures>.
				811
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	812	=item B<--base-foundry> <foundry>
				813
				814	Define the base foundry to store newly generated
				815	token information in.
				816	Defaults to C<base>.
				817
				818	=item B<--data-file> <file>
				819
				820	Define the file (without extension)
				821	to store primary data information in.
				822	Defaults to C<data>.
				823
				824	=item B<--header-file> <file>
				825
				826	Define the file name (without extension)
				827	to store header information on
				828	the corpus, document, and text level in.
				829	Defaults to C<header>.
				830
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	831	=item B<--use-tokenizer-sentence-splits\|-s>
				832
				833	Replace existing with, or add new, sentence boundary information
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	834	provided by the tokenizer.
				835	Currently KorAP-tokenizer and certain external tokenizers support
				836	these boundaries.
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	837
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	838	=item B<--tokens-file> <file>
				839
				840	Define the file (without extension)
				841	to store generated token information in
				842	(either from the KorAP tokenizer or an externally called tokenizer).
				843	Defaults to C<tokens>.
				844
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	845	=item B<--log\|-l>
				846
				847	Loglevel for I<Log::Any>. Defaults to C<notice>.
				848
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	849	=back
				850
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	851	=head1 ENVIRONMENT VARIABLES
				852
				853	=over 2
				854
				855	=item B<KORAPXMLTEI_DEBUG>
				856
				857	Activate minimal debugging.
				858	Defaults to C<false>.
				859
Marc Kupietz	d254f5c	2025-04-16 10:37:08 +0200	[diff] [blame]	860	=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
				861
				862	Set the heap size for the tokenizer process.
				863	Defaults to C<512m>.
				864
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	865	=back
				866
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	867	=head1 COPYRIGHT AND LICENSE
				868
Marc Kupietz	b6fd6bc	2025-04-16 12:47:26 +0200	[diff] [blame]	869	Copyright (C) 2021-2025, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	870
				871	Author: Peter Harders
				872
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	873	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	874
				875	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				876	Corpus Analysis Platform at the
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	877	L<Leibniz Institute for the German Language (IDS)\|https://www.ids-mannheim.de/>,
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	878	member of the
				879	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				880
				881	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	882	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	883
				884	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	885
				886	# NOTES
				887
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	888	## Notes on segfault prevention
				889
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	890	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	891	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				892	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				893	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				894	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.