Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 7d079cab223452dc0a92d69f2a1084f850063525 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	9	use KorAP::XML::TEI qw(increase_auto_textsigle);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	10
				11	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	12
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	13	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	14
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	15	use FindBin;
				16	BEGIN {
				17	unshift @INC, "$FindBin::Bin/../lib";
				18	};
				19
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	20	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	21	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	22	use KorAP::XML::TEI::Tokenizer::Conservative;
				23	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	25	use KorAP::XML::TEI::Header;
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	26	use KorAP::XML::TEI::Inline;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	27
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	28	our $VERSION = '2.6.0';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	29
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	30	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				31
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	32	use constant {
				33	# Set KORAPXMLTEI_DEBUG to 1 for minimal additional debug output (no need to be parametrized)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	34	DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	35	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	36
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	37	if ($ENV{KORAPXMLTEI_INLINE}) {
				38	warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
				39	};
				40
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	41	# Inline tokens won't be stored in the structure file
				42	my $inline_tokens_exclusive = 0;
				43
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	44	# Inline dependencies won't be stored in the tokens file
				45	my $inline_deps_exclusive = 0;
				46
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	47	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	48	GetOptions(
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	49	'auto-textsigle\|A=s' => \(my $auto_textsigle = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	50	'root\|r=s' => \(my $root_dir = '.'),
				51	'input\|i=s' => \(my $input_fname = ''),
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	52	'output\|o=s' => \(my $output_fname = ''),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	53	'tokenizer-call\|tc=s' => \(my $tokenizer_call),
				54	'tokenizer-korap\|tk' => \(my $tokenizer_korap),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	55	'tokenizer-internal\|ti' => \(my $tokenizer_intern),
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	56	'no-tokenizer' => \(my $no_tokenizer),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	57	'use-tokenizer-sentence-splits\|s' => \(my $use_tokenizer_sentence_splits),
				58	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
				59	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	60	'inline-dependencies=s' => \(my $inline_dependencies),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	61	'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	62	'skip-inline-token-annotations!' => \(
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	63	my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	64	'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	65	'base-foundry=s' => \(my $base_dir = 'base'),
				66	'data-file=s' => \(my $data_file = 'data'),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	67	'header-file=s' => \(my $header_file = 'header'),
				68	'tokens-file=s' => \(my $tokens_file = 'tokens'),
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	69	'xmlid-to-textsigle\|x=s'=> \(my $xmlid_to_textsigle = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	70	'log\|l=s' => \(my $log_level = 'notice'),
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	71	'required-version\|rv=s' => \(my $required_version),
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	72	'' => \(my $stdio),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	73	'help\|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	74	pod2usage(
				75	-verbose => 99,
				76	-sections => 'NAME\|DESCRIPTION\|SYNOPSIS\|ARGUMENTS\|OPTIONS',
				77	-msg => $VERSION_MSG,
				78	-output => '-'
				79	)
				80	},
				81	'version\|v' => sub {
				82	pod2usage(
				83	-verbose => 0,
				84	-msg => $VERSION_MSG,
				85	-output => '-'
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	86	);
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	87	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	88	);
				89
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	90
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	91	# Establish logger
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	92	binmode(STDERR, ':encoding(UTF-8)');
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	93	Log::Any::Adapter->set('Stderr', log_level => $log_level);
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	94	$log->notice('Debugging is activated') if DEBUG;
				95
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	96
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	97	if ($required_version) {
Marc Kupietz	2475c95	2024-01-09 10:40:04 +0100	[diff] [blame]	98	$required_version =~ /^\s(\d+\.\d+\.\d+(-TRIAL)?)\s$/;
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	99	if (!$1 \|\| $1 ne $VERSION) {
				100	$log->error("Required version $required_version mismatches version $VERSION");
				101	exit(1);
				102	};
				103	};
				104
				105
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	106	my ($what, $with);
				107	if ($xmlid_to_textsigle ne '') {
				108	($what, $with) = split('@', $xmlid_to_textsigle);
				109	$what = qr!$what!;
				110	};
				111
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	112	# tag (without attributes), which contains the primary text
				113	my $_TEXT_BODY = 'text';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	114	# optional
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	115
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	116	# Remember to skip certain inline tags
				117	my %skip_inline_tags = ();
				118	if ($skip_inline_tags_str) {
				119	foreach (split /\s,\s/, $skip_inline_tags_str) {
				120	$skip_inline_tags{$_} = 1;
				121	};
				122	};
				123
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	124	# External tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	125	my $ext_tok;
				126	if ($tokenizer_call) {
				127	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	128	$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	129	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	130
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	131	# KorAP tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	132	elsif ($tokenizer_korap) {
Akron	bd4281e	2022-03-28 08:31:40 +0200	[diff] [blame]	133	eval {
				134	require KorAP::XML::TEI::Tokenizer::KorAP;
				135	1;
				136	};
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	137
				138	my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;
				139	if ($korap_tok_ver ne $VERSION) {
				140	$log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
				141	exit(1);
				142	};
				143
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	144	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	145	}
				146
				147	# No internal tokenizer chosen
				148	elsif (!$tokenizer_intern && !$no_tokenizer) {
				149	$log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
				150	exit(1);
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	151	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	152
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	153	if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	154	$skip_inline_tags{s} = 1;
				155	};
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	156
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	157	# Internal tokenization
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	158	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				159	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	160
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	161
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	162	# Name of the directory and the file containing all inline structure informations
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	163	# except for $_TOKENS_TAG information
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	164	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	165
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	166	# Name of the directory and the file containing all inline token informations
				167	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				168	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	169
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	170	if (index($_tokens_dir, '!') == 0) {
				171	$_tokens_dir = substr($_tokens_dir, 1);
				172	$inline_tokens_exclusive = 1;
				173	};
				174
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	175
				176	my ($_dep_dir, $_dep_file);
				177	if ($inline_dependencies) {
				178	($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';
				179	$inline_dependencies = 1;
				180
				181	if ($_dep_dir && index($_dep_dir, '!') == 0) {
				182	$_dep_dir = substr($_dep_dir, 1);
				183	$inline_deps_exclusive = 1;
				184	};
				185	};
				186
				187
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	188	# Initialize zipper
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	189	my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	190
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	191	# text directory (below $root_dir)
				192	my $dir = '';
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	193
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	194	# Escaped version of text id
				195	my $text_id_esc;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	196
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	197	# Default encoding of the text
				198	my $input_enc = 'UTF-8';
				199
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	200	# text line (needed for whitespace handling)
				201	my $text_line = 0;
				202
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	203
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	204	# Input file handle (default: stdin)
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	205	my $input_fh;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	206
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	207	# Single dash was set
				208	if ($stdio) {
				209	$input_fh = *STDIN;
				210	}
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	211	# Input flag or trailing file arguments were passed
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	212	elsif (@ARGV \|\| $input_fname ne '') {
				213	unless ($input_fname ne '') {
				214	$input_fname = shift @ARGV;
				215	};
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	216	unless (open($input_fh, '<', $input_fname)) {
				217	die $log->fatal("File '$input_fname' could not be opened.");
				218	};
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	219	}
				220
				221	# No input to process
				222	else {
				223	pod2usage(
				224	-verbose => 99,
				225	-sections => 'NAME\|SYNOPSIS',
				226	-msg => $VERSION_MSG,
				227	-output => '-'
				228	);
				229	exit;
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	230	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	231
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	232	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	233	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	234
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	235
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	236	# Create inline parser object
				237	my $inline = KorAP::XML::TEI::Inline->new(
				238	$skip_inline_tokens,
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	239	\%skip_inline_tags,
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	240	$inline_tokens_exclusive,
				241	$inline_dependencies
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	242	);
				243
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	244	do {
				245	$log->notice("Reading input document $input_fname") if ($input_fname ne '');
				246	MAIN:
				247	while (<$input_fh>) {
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	248
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	249	# remove XML (multi-line) comments (<!--...-->)
				250	$_ = remove_xml_comments($input_fh, $_);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	251
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	252	# Set input encoding
				253	if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
				254	$input_enc = $2;
				255	next;
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	256	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	257
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	258	$_ = decode($input_enc, $_);
				259	$_ = replace_entities($_);
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	260
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	261	# Start of text body
				262	if (index($_, $_TEXT_BODY) >= 0 && m#^(.)<${_TEXT_BODY}(?: [^>])?>(.*)$#) {
				263	my $suffix = $2;
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	264
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	265	if ($1 !~ /^\s$/ \|\| $suffix !~ /^\s$/) {
				266	die $log->fatal("input line number $.: " .
				267	"line with opening text-body tag '${_TEXT_BODY}' " .
				268	"contains additional information ... => Aborting (line=$_)");
				269	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	270
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	271	# Text body data extracted from input document ($input_fh),
				272	# further processed by XML::LibXML::Reader
				273	my $text_buffer = '';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	274
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	275	# Iterate over all lines in the text body
				276	while (<$input_fh>) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	277
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	278	$_ = remove_xml_comments($input_fh, $_);
				279	$_ = decode($input_enc, $_);
				280	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	281
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	282	# End of text body
				283	if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {
				284
				285	# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
				286
				287	if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
				288	die $log->fatal("input line number $.: " .
				289	"line with closing text-body tag '${_TEXT_BODY}'" .
				290	" contains additional information ... => Aborting (line=$_)");
				291	};
				292
				293	if ($dir eq '') {
				294	$log->warn(
				295	"Maybe empty textSigle => skipping this text ...\n" .
				296	'data=' . substr($inline->data->data, 0, 200)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	297	);
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	298	next MAIN;
				299	};
				300
				301	# Parse inline structure
				302	$inline->parse($text_id_esc, \$text_buffer);
				303
				304	if (DEBUG) {
				305	$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
				306	};
				307
				308	my $data = $inline->data;
				309
				310	# Write data.xml
				311	$data->to_zip(
				312	$zipper->new_stream("$dir/${data_file}.xml"),
				313	$text_id_esc
				314	);
				315
				316	# Tokenize with external tokenizer
				317	if ($ext_tok) {
				318
				319	# Tokenize and output
				320	$ext_tok->tokenize($data->data)->to_zip(
				321	$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
				322	$text_id_esc
				323	);
				324
				325	if ($use_tokenizer_sentence_splits) {
				326	$ext_tok->sentencize_from_previous_input($inline->structures);
				327	};
				328	};
				329
				330	# Tokenize with internal tokenizer
				331	if ($tokenizer_intern) {
				332
				333	# Tokenize and output
				334	$cons_tok->tokenize($data->data)->to_zip(
				335	$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
				336	$text_id_esc
				337	)->reset;
				338
				339	$aggr_tok->tokenize($data->data)->to_zip(
				340	$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
				341	$text_id_esc
				342	)->reset;
				343	};
				344
				345	# ~ write structures ~
				346	unless ($inline->structures->empty) {
				347	$inline->structures->to_zip(
				348	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
				349	$text_id_esc,
				350	2 # = structure serialization
				351	);
				352	};
				353
				354	# ~ write tokens ~
				355	unless ($skip_inline_tokens \|\| $inline->tokens->empty) {
				356	$inline->tokens->to_zip(
				357	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
				358	$text_id_esc,
				359	# Either 0 = tokens without inline or
				360	# 1 = tokens with inline (or 4, if inline dependencies are exclusive)
				361	# !$skip_inline_token_annotations
				362	($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
				363	);
				364	};
				365
				366	# ~ write dependencies ~
				367	unless ($inline->dependencies->empty) {
				368	$inline->dependencies->to_zip(
				369	$zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
				370	$text_id_esc,
				371	3 # = dependency serialization
				372	);
				373	};
				374
				375
				376	# reinit.
				377	$dir = '';
				378
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	379	next MAIN;
				380	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	381
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	382
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	383	# ~ whitespace handling ~
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	384
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	385	# Fix whitespaces (see notes on whitespace fixing)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	386
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	387	# TODO:
				388	# Maybe it's best, to keep the stripping of whitespace and
				389	# to just remove the if-clause and to insert a blank by default
				390	# (with possibly an option on how newlines in primary text should
				391	# be handled (stripped or replaced by a whitespace)).
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	392
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	393	# Remove consecutive whitespace at beginning and end (mostly one newline)
				394	s/^\s+//;
				395	s/\s+$//;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	396
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	397	# NOTE:
				398	# this is only relevant, if a text consists of more than one line
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	399
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	400	# TODO:
				401	# find a better solution, or create a warning, if a text has more
				402	# than one line ($text_line > 1)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	403
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	404	# TODO:
				405	# do testing with 2 different corpora
				406	# (one with only one-line texts, the other with several lines per text)
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	407
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	408	# line contains at least one non-tag character
				409	if (m/^[^<]*$/ \|\| m/(?:<[^>]+>[^<])\|(?:[^<]<[^>]+>)/) {
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	410
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	411	# Increment counter for text lines
				412	$text_line++;
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	413
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	414	# insert blank before 1st character
				415	# (for 2nd line and consecutive lines)
				416	$_ = ' ' . $_ if $text_line > 1;
				417	}
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	418
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	419	# add line to buffer
				420	$text_buffer .= $_;
				421	};
				422	}
				423	elsif (m#^(.)\<TEI\s+[^>]?xml:id=(["'])(.+?)\2#) {
				424	my $leadin = $1;
				425	my $id = $3;
				426	my $sigle = $3;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	427
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	428	if ($what) {
				429	$_ = $id;
				430	eval "s\|$what\|$with\|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
				431	$sigle = $_;
				432	$log->debug("Converted text id `$id' to sigle `$sigle'");
				433	};
				434	$sigle =~ s/\./-/g;
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	435
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	436	my @parts = split(/[\/_]/, $sigle);
				437	if (@parts != 3) {
				438	die $log->fatal(
				439	"input line number $.: " .
				440	"ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
				441	"=> Aborting (line=$_)");
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	442	};
				443
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	444	$dir = join("/", @parts);
				445	$text_id_esc = "$parts[0]/$parts[1].$parts[2]";
				446	$log->notice("$0: text_id=$text_id_esc");
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	447
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	448	if ($leadin !~ /^\s*$/) {
				449	die $log->fatal(
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	450	"input line number $.: " .
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	451	'line with opening header tag is not in expected format ... ' .
				452	"=> Aborting (line=$_)");
				453	};
				454	}
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	455
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	456	# Start of header section
				457	elsif (m#^(.)(\<(?:ids\|tei)Header.)$#) {
				458	my $content = "$2\n";
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	459
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	460	if ($1 !~ /^\s*$/) {
				461	die $log->fatal(
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	462	"input line number $.: " .
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	463	'line with opening header tag is not in expected format ... ' .
				464	"=> Aborting (line=$_)");
				465	};
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	466
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	467	# Parse header
				468	my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
				469	if ($auto_textsigle) {
				470	$auto_textsigle = increase_auto_textsigle($auto_textsigle);
				471	$log->debug("Auto-incremented text sigle to $auto_textsigle");
				472	};
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	473
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	474	# Header was parseable
				475	if ($header) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	476
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	477	# Write header to zip
				478	my $file = $header->dir . '/' . $header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	479
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	480	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	481
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	482	$header->to_zip($zipper->new_stream($file));
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	483
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	484	# Header is for text level
				485	if ($header->type eq 'text') {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	486
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	487	# Remember dir and sigles
				488	$dir = $header->dir;
				489	$text_id_esc = $header->id_esc;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	490
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	491	# log output for seeing progression
				492	$log->notice("$0: text_id=$text_id_esc");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	493
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	494	# Reset counter for text lines
				495	# (needed for whitespace handling)
				496	$text_line = 0;
				497	};
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	498	};
				499	};
				500	};
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	501	$text_id_esc = $auto_textsigle if ($auto_textsigle);
				502	} while (($input_fname = shift(@ARGV)) && open($input_fh, '<', $input_fname));
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	503	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	504
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	505	$ext_tok->close if $ext_tok;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	506
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	507	close $input_fh;
				508
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	509
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	510	__END__
				511
				512	=pod
				513
				514	=encoding utf8
				515
				516	=head1 NAME
				517
				518	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				519
				520	=head1 SYNOPSIS
				521
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	522	cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip
				523	tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	524
				525	=head1 DESCRIPTION
				526
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	527	C<tei2korapxml> is a script to convert TEI P5 and
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	528	L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	529	based documents to the
				530	L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	531
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	532	This program is usually called from inside another script.
				533
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	534	=head1 FORMATS
				535
				536	=head2 Input restrictions
				537
				538	=over 2
				539
				540	=item
				541
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	542	TEI P5 formatted input with certain restrictions:
				543
				544	=over 4
				545
				546	=item
				547
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	548	B<mandatory>: text-header with integrated textsigle
				549	(or convertible identifier), text-body
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	550
				551	=item
				552
				553	B<optional>: corp-header with integrated corpsigle,
				554	doc-header with integrated docsigle
				555
				556	=back
				557
				558	=item
				559
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	560	Tokens inside the primary text must not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	561	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	562	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	563	into blanks between 2 tokens could lead to additional blanks,
				564	where there should be none (e.g.: punctuation characters like C<,> or
				565	C<.> should not be separated from their predecessor token).
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	566	(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	567
Akron	940ca6f	2021-10-11 12:38:39 +0200	[diff] [blame]	568	=item
				569
				570	Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
				571	need to be defined in the same line as the header tag.
				572
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	573	=back
				574
				575	=head2 Notes on the output
				576
				577	=over 2
				578
				579	=item
				580
				581	zip file output (default on C<stdout>) with utf8 encoded entries
				582	(which together form the KorAP-XML format)
				583
				584	=back
				585
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	586	=head1 INSTALLATION
				587
Akron	d26319b	2023-01-12 15:34:41 +0100	[diff] [blame]	588	C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietz	e83a4e9	2021-03-16 20:51:26 +0100	[diff] [blame]	589	When these requirements are met, the preferred way to install the script is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	590	to use L<cpanm|App::cpanminus>.
				591
				592	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				593
				594	In case everything went well, the C<tei2korapxml> tool will
				595	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	596
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	597	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				598
				599	=head1 OPTIONS
				600
				601	=over 2
				602
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	603	=item B<--input\|-i>
				604
				605	The input file to process. If no specific input is defined and a single
				606	dash C<-> is passed as an argument, data is read from C<STDIN>.
				607
Marc Kupietz	5b3f1d8	2024-07-05 17:50:55 +0200	[diff] [blame]	608	Instead of using C<-i>, input files can also be defined as trailing arguments
				609	to the command:
				610
				611	tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
				612
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	613	=item B<--output\|-o>
				614
				615	The output zip file to be created. If no specific output is defined,
				616	data is written to C<STDOUT>.
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	617
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	618	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	619
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	620	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	621
				622	=item B<--help\|-h>
				623
				624	Print help information.
				625
				626	=item B<--version\|-v>
				627
				628	Print version information.
				629
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	630	=item B<--tokenizer-korap\|-tk>
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	631
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	632	Use the standard KorAP/DeReKo tokenizer.
				633
				634	=item B<--tokenizer-internal\|-ti>
				635
				636	Tokenize the data using two embedded tokenizers,
				637	that will take an I<aggressive> and a I<conservative>
				638	approach.
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	639
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	640	=item B<--tokenizer-call\|-tc>
				641
				642	Call an external tokenizer process that reads text
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	643	from STDIN and outputs the offsets of all tokens.
				644
				645	Texts are separated using C<\x04\n>. The external process
				646	should add a new line per text.
				647
				648	If the L</--use-tokenizer-sentence-splits> option is activated,
				649	sentences are marked by offset as well in new lines.
				650
				651	To use L<Datok\|https://github.com/KorAP/Datok> including sentence
				652	splitting, call C<tei2korapxml> as follows:
				653
				654	$ cat corpus.i5.xml \| tei2korapxml -s \
				655	$ -tc 'datok tokenize \
				656	$ -t ./tokenizer.matok \
				657	$ -p --newline-after-eot --no-sentences \
				658	$ --no-tokens --sentence-positions -' - \
				659	$ > corpus.korapxml.zip
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	660
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	661	=item B<--no-tokenizer>
				662
				663	Boolean flag indicating that no tokenizer should be used.
				664	This is meant to ensure that by default a final token layer always
				665	exists.
				666	If a separate tokenizer is chosen, this flag is ignored.
				667
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	668	=item B<--skip-inline-tokens>
				669
				670	Boolean flag indicating that inline tokens should not
				671	be processed. Defaults to false (meaning inline tokens will be processed).
				672
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	673	=item B<--skip-inline-token-annotations>
				674
				675	Boolean flag indicating that inline token annotations should not
				676	be processed. Defaults to true (meaning inline token annotations
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	677	won't be processed). Can be negated with
				678	C<--no-skip-inline-token-annotations>.
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	679
Akron	ca70a1d	2021-02-25 16:21:31 +0100	[diff] [blame]	680	=item B<--skip-inline-tags> <tags>
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	681
				682	Expects a comma-separated list of tags to be ignored when the structure
				683	is parsed. Content of these tags however will be processed.
				684
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame]	685	=item B<--auto-textsigle> <textsigle>
				686
				687	Expects a text sigle that serves as a fallback if no text sigles
				688	are given in the input data.
				689	The auto text sigle will be incremented for each text processed.
				690
				691	Example:
				692
				693	tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
				694	< data.i5.xml > korapxml.zip
				695
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	696	=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
				697
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	698	Expects a regular replacement expression (separated by B<@> between the
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	699	search and the replacement) to convert text id attributes to text sigles
				700	with three parts (separated by B</>).
				701
				702	Example:
				703
				704	tei2korapxml \
				705	--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
				706	-tk - < t/data/icc_german_sample.p5.xml
				707
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	708	Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
				709	sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	710
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	711	=item B<--inline-tokens> <foundry>#[<file>]
				712
				713	Define the foundry and file (without extension)
				714	to store inline token information in.
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	715	Unless C<--skip-inline-token-annotations> is set,
				716	this will contain annotations as well.
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	717	Defaults to C<tokens> and C<morpho>.
				718
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	719	The inline token data will also be stored in the
				720	inline structures file (see I<--inline-structures>),
				721	unless the inline token foundry is prepended
				722	by an B<!> exclamation mark, indicating that inline
				723	tokens are stored exclusively in the inline tokens
				724	file.
				725
				726	Example:
				727
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	728	tei2korapxml --no-tokenizer --inline-tokens \
				729	'!gingko#morpho' < data.i5.xml > korapxml.zip
				730
				731	=item B<--inline-dependencies> <foundry>#[<file>]
				732
				733	Define the foundry and file (without extension)
				734	to store inline dependency information in.
				735	Defaults to the layer of C<dependency> and
				736	will be ignored if not set (which means, dependency
				737	attributes will be stored in the inline tokens file,
				738	if not skipped).
				739
				740	The dependency data will also be stored in the
				741	inline token file (see I<--inline-tokens>),
				742	unless the inline dependencies foundry is prepended
				743	by an B<!> exclamation mark, indicating that inline
				744	dependency data is stored exclusively in the inline
				745	dependencies file.
				746
				747	Example:
				748
				749	tei2korapxml --no-tokenizer --inline-dependencies \
				750	'gingko#dependency' < data.i5.xml > korapxml.zip
				751
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	752
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	753	=item B<--inline-structures> <foundry>#[<file>]
				754
				755	Define the foundry and file (without extension)
				756	to store inline structure information in.
				757	Defaults to C<struct> and C<structures>.
				758
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	759	=item B<--base-foundry> <foundry>
				760
				761	Define the base foundry to store newly generated
				762	token information in.
				763	Defaults to C<base>.
				764
				765	=item B<--data-file> <file>
				766
				767	Define the file (without extension)
				768	to store primary data information in.
				769	Defaults to C<data>.
				770
				771	=item B<--header-file> <file>
				772
				773	Define the file name (without extension)
				774	to store header information on
				775	the corpus, document, and text level in.
				776	Defaults to C<header>.
				777
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	778	=item B<--use-tokenizer-sentence-splits\|-s>
				779
				780	Replace existing with, or add new, sentence boundary information
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	781	provided by the tokenizer.
				782	Currently KorAP-tokenizer and certain external tokenizers support
				783	these boundaries.
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	784
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	785	=item B<--tokens-file> <file>
				786
				787	Define the file (without extension)
				788	to store generated token information in
				789	(either from the KorAP tokenizer or an externally called tokenizer).
				790	Defaults to C<tokens>.
				791
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	792	=item B<--log\|-l>
				793
				794	Loglevel for I<Log::Any>. Defaults to C<notice>.
				795
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	796	=back
				797
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	798	=head1 ENVIRONMENT VARIABLES
				799
				800	=over 2
				801
				802	=item B<KORAPXMLTEI_DEBUG>
				803
				804	Activate minimal debugging.
				805	Defaults to C<false>.
				806
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	807	=back
				808
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	809	=head1 COPYRIGHT AND LICENSE
				810
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	811	Copyright (C) 2021-2024, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	812
				813	Author: Peter Harders
				814
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	815	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	816
				817	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				818	Corpus Analysis Platform at the
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	819	L<Leibniz Institute for the German Language (IDS)\|https://www.ids-mannheim.de/>,
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	820	member of the
				821	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				822
				823	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	824	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	825
				826	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	827
				828	# NOTES
				829
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	830	## Notes on segfault prevention
				831
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	832	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	833	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				834	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				835	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				836	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.