Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: c150c047060e89540d780206d25aebfccf08ffd8 [file] [log] [blame]

Akron	9cb1394	2020-02-14 07:39:54 +0100	[diff] [blame]	1	#!/usr/bin/env perl
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	2	use strict;
				3	use warnings;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	4
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	5	use Log::Any '$log';
				6	use Log::Any::Adapter;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	7	use Pod::Usage;
				8	use Getopt::Long qw(GetOptions :config no_auto_abbrev);
				9
				10	use File::Basename qw(dirname);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	11
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	12	use Encode qw(decode);
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	13
Akron	4f67cd4	2020-07-02 12:27:58 +0200	[diff] [blame]	14	use FindBin;
				15	BEGIN {
				16	unshift @INC, "$FindBin::Bin/../lib";
				17	};
				18
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	19	use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron	8b511f9	2020-07-09 17:28:08 +0200	[diff] [blame]	20	use KorAP::XML::TEI::Tokenizer::External;
Akron	d962747	2020-07-09 16:53:09 +0200	[diff] [blame]	21	use KorAP::XML::TEI::Tokenizer::Conservative;
				22	use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron	8571751	2020-07-08 11:19:19 +0200	[diff] [blame]	23	use KorAP::XML::TEI::Zipper;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	24	use KorAP::XML::TEI::Header;
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	25	use KorAP::XML::TEI::Inline;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	26
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame^]	27	our $VERSION = '2.6.0';
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	28
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	29	our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
				30
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	31	use constant {
				32	# Set to 1 for minimal more debug output (no need to be parametrized)
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	33	DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	34	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	35
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	36	if ($ENV{KORAPXMLTEI_INLINE}) {
				37	warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
				38	};
				39
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	40	# Inline tokens won't be stored in the structure file
				41	my $inline_tokens_exclusive = 0;
				42
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	43	# Parse options from the command line
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	44	GetOptions(
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	45	'root\|r=s' => \(my $root_dir = '.'),
				46	'input\|i=s' => \(my $input_fname = ''),
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame^]	47	'output\|o=s' => \(my $output_fname = ''),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	48	'tokenizer-call\|tc=s' => \(my $tokenizer_call),
				49	'tokenizer-korap\|tk' => \(my $tokenizer_korap),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	50	'tokenizer-internal\|ti' => \(my $tokenizer_intern),
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	51	'no-tokenizer' => \(my $no_tokenizer),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	52	'use-tokenizer-sentence-splits\|s' => \(my $use_tokenizer_sentence_splits),
				53	'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
				54	'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
				55	'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	56	'skip-inline-token-annotations' => \(
				57	my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	58	'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	59	'base-foundry=s' => \(my $base_dir = 'base'),
				60	'data-file=s' => \(my $data_file = 'data'),
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	61	'header-file=s' => \(my $header_file = 'header'),
				62	'tokens-file=s' => \(my $tokens_file = 'tokens'),
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	63	'xmlid-to-textsigle\|x=s'=> \(my $xmlid_to_textsigle = ''),
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	64	'log\|l=s' => \(my $log_level = 'notice'),
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	65	'required-version\|rv=s' => \(my $required_version),
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	66	'' => \(my $stdio),
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	67	'help\|h' => sub {
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	68	pod2usage(
				69	-verbose => 99,
				70	-sections => 'NAME\|DESCRIPTION\|SYNOPSIS\|ARGUMENTS\|OPTIONS',
				71	-msg => $VERSION_MSG,
				72	-output => '-'
				73	)
				74	},
				75	'version\|v' => sub {
				76	pod2usage(
				77	-verbose => 0,
				78	-msg => $VERSION_MSG,
				79	-output => '-'
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	80	);
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	81	}
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	82	);
				83
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	84
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	85	# Establish logger
Akron	33db4ec	2021-02-24 12:52:21 +0100	[diff] [blame]	86	binmode(STDERR, ':encoding(UTF-8)');
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	87	Log::Any::Adapter->set('Stderr', log_level => $log_level);
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	88	$log->notice('Debugging is activated') if DEBUG;
				89
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	90
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	91	if ($required_version) {
Marc Kupietz	2475c95	2024-01-09 10:40:04 +0100	[diff] [blame]	92	$required_version =~ /^\s(\d+\.\d+\.\d+(-TRIAL)?)\s$/;
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	93	if (!$1 \|\| $1 ne $VERSION) {
				94	$log->error("Required version $required_version mismatches version $VERSION");
				95	exit(1);
				96	};
				97	};
				98
				99
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	100	my ($what, $with);
				101	if ($xmlid_to_textsigle ne '') {
				102	($what, $with) = split('@', $xmlid_to_textsigle);
				103	$what = qr!$what!;
				104	};
				105
Akron	0529e51	2021-02-22 09:55:35 +0100	[diff] [blame]	106	# tag (without attributes), which contains the primary text
				107	my $_TEXT_BODY = 'text';
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	108	# optional
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	109
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	110	# Remember to skip certain inline tags
				111	my %skip_inline_tags = ();
				112	if ($skip_inline_tags_str) {
				113	foreach (split /\s,\s/, $skip_inline_tags_str) {
				114	$skip_inline_tags{$_} = 1;
				115	};
				116	};
				117
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	118	# External tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	119	my $ext_tok;
				120	if ($tokenizer_call) {
				121	$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	122	$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	123	}
Marc Kupietz	1e882fb	2020-09-09 00:05:46 +0200	[diff] [blame]	124
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	125	# KorAP tokenization
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	126	elsif ($tokenizer_korap) {
Akron	bd4281e	2022-03-28 08:31:40 +0200	[diff] [blame]	127	eval {
				128	require KorAP::XML::TEI::Tokenizer::KorAP;
				129	1;
				130	};
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	131
				132	my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;
				133	if ($korap_tok_ver ne $VERSION) {
				134	$log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
				135	exit(1);
				136	};
				137
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	138	$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	139	}
				140
				141	# No internal tokenizer chosen
				142	elsif (!$tokenizer_intern && !$no_tokenizer) {
				143	$log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
				144	exit(1);
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	145	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	146
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	147	if ($use_tokenizer_sentence_splits) {
				148	$skip_inline_tags{s} = 1;
				149	};
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	150
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	151	# Internal tokenization
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	152	my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
				153	my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	154
Peter Harders	41c3562	2020-07-12 01:16:22 +0200	[diff] [blame]	155
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	156	# Name of the directory and the file containing all inline structure information
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	157	# except for $_TOKENS_TAG information
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	158	my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	159
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	160	# Name of the directory and the file containing all inline token information
				161	# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
				162	my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	163
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	164	if (index($_tokens_dir, '!') == 0) {
				165	$_tokens_dir = substr($_tokens_dir, 1);
				166	$inline_tokens_exclusive = 1;
				167	};
				168
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	169	# Initialize zipper
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame^]	170	my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	171
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	172	# text directory (below $root_dir)
				173	my $dir = '';
Akron	09e0b2c	2020-07-28 15:57:01 +0200	[diff] [blame]	174
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	175	# Escaped version of text id
				176	my $text_id_esc;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	177
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	178	# Default encoding of the text
				179	my $input_enc = 'UTF-8';
				180
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	181	# text line (needed for whitespace handling)
				182	my $text_line = 0;
				183
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	184
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	185	# Input file handle (default: stdin)
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	186	my $input_fh;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	187
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	188	# Single dash was set
				189	if ($stdio) {
				190	$input_fh = *STDIN;
				191	}
				192
				193	# Input flag was passed
				194	elsif ($input_fname ne '') {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	195	unless (open($input_fh, '<', $input_fname)) {
				196	die $log->fatal("File '$input_fname' could not be opened.");
				197	};
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	198	}
				199
				200	# No input to process
				201	else {
				202	pod2usage(
				203	-verbose => 99,
				204	-sections => 'NAME\|SYNOPSIS',
				205	-msg => $VERSION_MSG,
				206	-output => '-'
				207	);
				208	exit;
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	209	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	210
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	211	# Prevents segfaulting (see notes on segfault prevention)
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	212	binmode $input_fh;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	213
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	214
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	215	# Create inline parser object
				216	my $inline = KorAP::XML::TEI::Inline->new(
				217	$skip_inline_tokens,
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	218	\%skip_inline_tags,
				219	$inline_tokens_exclusive
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	220	);
				221
				222
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	223	# Reading input document
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	224	MAIN: while (<$input_fh>) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	225
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	226	# remove XML (multi-line) comments (<!--...-->)
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	227	$_ = remove_xml_comments($input_fh, $_);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	228
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	229	# Set input encoding
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	230	if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	231	$input_enc = $2;
				232	next;
				233	};
				234
				235	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	236	$_ = replace_entities($_);
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	237
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	238	# Start of text body
				239	if (index($_, $_TEXT_BODY) >= 0 && m#^(.)<${_TEXT_BODY}(?: [^>])?>(.*)$#) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	240	my $suffix = $2;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	241
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	242	if ($1 !~ /^\s$/ \|\| $suffix !~ /^\s$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	243	die $log->fatal("input line number $.: " .
				244	"line with opening text-body tag '${_TEXT_BODY}' " .
				245	"contains additional information ... => Aborting (line=$_)");
Akron	0bb7e72	2020-09-29 07:48:33 +0200	[diff] [blame]	246	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	247
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	248	# Text body data extracted from input document ($input_fh),
				249	# further processed by XML::LibXML::Reader
				250	my $text_buffer = '';
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	251
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	252	# Iterate over all lines in the text body
				253	while (<$input_fh>) {
Peter Harders	9015734	2020-07-01 21:05:14 +0200	[diff] [blame]	254
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	255	$_ = remove_xml_comments($input_fh, $_);
Akron	eaa9623	2020-10-15 17:06:15 +0200	[diff] [blame]	256	$_ = decode($input_enc, $_);
Marc Kupietz	8a954e5	2021-02-16 22:03:07 +0100	[diff] [blame]	257	$_ = replace_entities($_);
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	258
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	259	# End of text body
Akron	72f4a88	2023-03-02 09:48:14 +0100	[diff] [blame]	260	if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	261
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	262	# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	263
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	264	if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	265	die $log->fatal("input line number $.: " .
				266	"line with closing text-body tag '${_TEXT_BODY}'".
				267	" contains additional information ... => Aborting (line=$_)");
				268	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	269
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	270	if ($dir eq '') {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	271	$log->warn(
				272	"Maybe empty textSigle => skipping this text ...\n" .
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	273	'data=' . substr($inline->data->data, 0, 200)
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	274	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	275	next MAIN;
				276	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	277
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	278	# Parse inline structure
				279	$inline->parse($text_id_esc, \$text_buffer);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	280
				281	if (DEBUG) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	282	$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	283	};
				284
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	285	my $data = $inline->data;
				286
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	287	# Write data.xml
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	288	$data->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	289	$zipper->new_stream("$dir/${data_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	290	$text_id_esc
				291	);
				292
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	293	# Tokenize with external tokenizer
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	294	if ($ext_tok) {
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	295
				296	# Tokenize and output
				297	$ext_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	298	$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	299	$text_id_esc
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	300	);
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	301
				302	if ($use_tokenizer_sentence_splits) {
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	303	$ext_tok->sentencize_from_previous_input($inline->structures);
Akron	d53ab4b	2021-02-24 09:56:12 +0100	[diff] [blame]	304	};
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	305	};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	306
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	307	# Tokenize with internal tokenizer
				308	if ($tokenizer_intern) {
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	309
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	310	# Tokenize and output
				311	$cons_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	312	$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	313	$text_id_esc
Akron	cc27d79	2021-02-24 12:32:20 +0100	[diff] [blame]	314	)->reset;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	315
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	316	$aggr_tok->tokenize($data->data)->to_zip(
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	317	$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	318	$text_id_esc
Akron	cc27d79	2021-02-24 12:32:20 +0100	[diff] [blame]	319	)->reset;
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	320	};
Akron	a10ad59	2020-08-03 11:20:23 +0200	[diff] [blame]	321
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	322	# ~ write structures ~
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	323	if (!$inline->structures->empty) {
				324	$inline->structures->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	325	$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	326	$text_id_esc,
				327	2 # = structure serialization
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	328	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	329	};
				330
				331	# ~ write tokens ~
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	332	unless ($skip_inline_tokens \|\| $inline->tokens->empty) {
				333	$inline->tokens->to_zip(
Akron	b87c58d	2021-02-23 17:23:30 +0100	[diff] [blame]	334	$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	335	$text_id_esc,
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	336	# Either 0 = tokens without inline or 1 = tokens with inline
				337	!$skip_inline_token_annotations
Akron	eb12e23	2021-02-25 13:49:50 +0100	[diff] [blame]	338	);
Akron	dafaa7a	2021-02-19 15:17:58 +0100	[diff] [blame]	339	};
				340
				341	# reinit.
				342	$dir = '';
				343
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	344	next MAIN;
Akron	598d1a7	2020-08-02 17:33:31 +0200	[diff] [blame]	345	};
				346
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	347
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	348	# ~ whitespace handling ~
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	349
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	350	# Fix whitespaces (see notes on whitespace fixing)
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	351
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	352	# TODO:
				353	# Maybe it's best, to keep the stripping of whitespace and
				354	# to just remove the if-clause and to insert a blank by default
				355	# (with possibly an option on how newlines in primary text should
				356	# be handled (stripped or replaced by a whitespace)).
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	357
				358	# Remove consecutive whitespace at beginning and end (mostly one newline)
				359	s/^\s+//; s/\s+$//;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	360
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	361	# NOTE:
				362	# this is only relevant, if a text consists of more than one line
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	363
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	364	# TODO:
				365	# find a better solution, or create a warning, if a text has more
				366	# than one line ($text_line > 1)
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	367
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	368	# TODO:
				369	# do testing with 2 different corpora
				370	# (one with only one-line texts, the other with several lines per text)
				371
Akron	ec50325	2023-04-24 18:03:17 +0200	[diff] [blame]	372	# line contains at least one non-tag character
				373	if (m/^[^<]*$/ \|\| m/(?:<[^>]+>[^<])\|(?:[^<]<[^>]+>)/) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	374
				375	# Increment counter for text lines
				376	$text_line++;
				377
				378	# insert blank before 1st character
Akron	6e2b125	2021-02-24 12:41:15 +0100	[diff] [blame]	379	# (for 2nd line and consecutive lines)
				380	$_ = ' ' . $_ if $text_line > 1;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	381	}
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	382
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	383	# add line to buffer
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	384	$text_buffer .= $_;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	385	};
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	386	}
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	387
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	388	elsif (m#^(.)\<TEI\s+[^>]?xml:id=(["'])(.+?)\2#) {
				389	my $leadin = $1;
				390	my $id = $3;
				391	my $sigle = $3;
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	392
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	393	if ($what) {
				394	$_ = $id;
				395	eval "s\|$what\|$with\|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
				396	$sigle = $_;
				397	$log->debug("Converted text id `$id' to sigle `$sigle'");
				398	};
				399	$sigle =~ s/\./-/g;
				400
				401	my @parts = split(/[\/_]/, $sigle);
				402	if (@parts != 3) {
				403	die $log->fatal(
				404	"input line number $.: " .
				405	"ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
				406	"=> Aborting (line=$_)");
				407	};
				408
				409	$dir = join("/", @parts);
				410	$text_id_esc = "$parts[0]/$parts[1].$parts[2]";
				411	$log->notice("$0: text_id=$text_id_esc");
				412
				413	if ($leadin !~ /^\s*$/) {
				414	die $log->fatal(
				415	"input line number $.: " .
				416	'line with opening header tag is not in expected format ... ' .
				417	"=> Aborting (line=$_)");
				418	};
				419	}
				420
				421	# Start of header section
				422	elsif (m#^(.)(\<(?:ids\|tei)Header.)$#) {
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	423	my $content = "$2\n";
Akron	f57ed81	2020-07-27 10:37:52 +0200	[diff] [blame]	424
Akron	d20898f	2021-02-19 15:52:17 +0100	[diff] [blame]	425	if ($1 !~ /^\s*$/) {
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	426	die $log->fatal(
				427	"input line number $.: " .
				428	'line with opening header tag is not in expected format ... ' .
				429	"=> Aborting (line=$_)");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	430	};
				431
				432	# Parse header
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	433	my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	434
				435	# Header was parseable
				436	if ($header) {
				437
				438	# Write header to zip
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	439	my $file = $header->dir . '/' . $header_file . '.xml';
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	440
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	441	$log->debug("Writing file $file") if DEBUG;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	442
				443	$header->to_zip($zipper->new_stream($file));
				444
				445	# Header is for text level
				446	if ($header->type eq 'text') {
				447
				448	# Remember dir and sigles
				449	$dir = $header->dir;
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	450	$text_id_esc = $header->id_esc;
				451
				452	# log output for seeing progression
Akron	bc89919	2021-02-24 12:14:47 +0100	[diff] [blame]	453	$log->notice("$0: text_id=$text_id_esc");
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	454
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	455	# Reset counter for text lines
				456	# (needed for whitespace handling)
				457	$text_line = 0;
				458	};
				459	};
				460	};
				461	};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	462
Akron	347be81	2020-09-29 07:52:52 +0200	[diff] [blame]	463	$zipper->close;
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	464
Akron	9df4a24	2021-02-19 15:31:16 +0100	[diff] [blame]	465	$ext_tok->close if $ext_tok;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	466
Akron	d53913c	2021-02-24 09:50:13 +0100	[diff] [blame]	467	close $input_fh;
				468
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	469
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	470	__END__
				471
				472	=pod
				473
				474	=encoding utf8
				475
				476	=head1 NAME
				477
				478	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				479
				480	=head1 SYNOPSIS
				481
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	482	cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	483
				484	=head1 DESCRIPTION
				485
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	486	C<tei2korapxml> is a script to convert TEI P5 and
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	487	L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	488	based documents to the
				489	L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	490
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	491	This program is usually called from inside another script.
				492
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	493	=head1 FORMATS
				494
				495	=head2 Input restrictions
				496
				497	=over 2
				498
				499	=item
				500
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	501	TEI P5 formatted input with certain restrictions:
				502
				503	=over 4
				504
				505	=item
				506
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	507	B<mandatory>: text-header with integrated textsigle
				508	(or convertible identifier), text-body
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	509
				510	=item
				511
				512	B<optional>: corp-header with integrated corpsigle,
				513	doc-header with integrated docsigle
				514
				515	=back
				516
				517	=item
				518
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	519	Tokens inside the primary text must not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	520	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	521	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	522	into blanks between 2 tokens could lead to additional blanks,
				523	where there should be none (e.g.: punctuation characters like C<,> or
				524	C<.> should not be separated from their predecessor token).
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	525	(See also code section C<~ whitespace handling ~> in C<script/tei2korapxml>.)
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	526
Akron	940ca6f	2021-10-11 12:38:39 +0200	[diff] [blame]	527	=item
				528
				529	Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
				530	need to be defined in the same line as the header tag.
				531
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	532	=back
				533
				534	=head2 Notes on the output
				535
				536	=over 2
				537
				538	=item
				539
				540	zip file output (default on C<stdout>) with utf8 encoded entries
				541	(which together form the KorAP-XML format)
				542
				543	=back
				544
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	545	=head1 INSTALLATION
				546
Akron	d26319b	2023-01-12 15:34:41 +0100	[diff] [blame]	547	C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietz	e83a4e9	2021-03-16 20:51:26 +0100	[diff] [blame]	548	When these requirements are met, the preferred way to install the script is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	549	to use L<cpanm|App::cpanminus>.
				550
				551	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				552
				553	In case everything went well, the C<tei2korapxml> tool will
				554	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	555
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	556	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				557
				558	=head1 OPTIONS
				559
				560	=over 2
				561
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	562	=item B<--input|-i>
				563
				564	The input file to process. If no specific input is defined and a single
				565	dash C<-> is passed as an argument, data is read from C<STDIN>.
				566
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame^]	567	=item B<--output|-o>
				568
				569	The output zip file to be created. If no specific output is defined,
				570	data is written to C<STDOUT>.
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	571
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	572	=item B<--root|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	573
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	574	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	575
				576	=item B<--help|-h>
				577
				578	Print help information.
				579
				580	=item B<--version|-v>
				581
				582	Print version information.
				583
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	584	=item B<--tokenizer-korap|-tk>
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	585
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	586	Use the standard KorAP/DeReKo tokenizer.
				587
				588	=item B<--tokenizer-internal|-ti>
				589
				590	Tokenize the data using two embedded tokenizers,
				591	that will take an I<aggressive> and a I<conservative>
				592	approach.
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	593
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	594	=item B<--tokenizer-call|-tc>
				595
				596	Call an external tokenizer process that reads text
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	597	from STDIN, tokenizes it, and outputs the offsets of all tokens.
				598
				599	Texts are separated using C<\x04\n>. The external process
				600	should add a new line per text.
				601
				602	If the L</--use-tokenizer-sentence-splits> option is activated,
				603	sentences are marked by offset as well in new lines.
				604
				605	To use L<Datok|https://github.com/KorAP/Datok> including sentence
				606	splitting, call C<tei2korapxml> as follows:
				607
				608	$ cat corpus.i5.xml | tei2korapxml -s \
				609	$ -tc 'datok tokenize \
				610	$ -t ./tokenizer.matok \
				611	$ -p --newline-after-eot --no-sentences \
				612	$ --no-tokens --sentence-positions -' - \
				613	$ > corpus.korapxml.zip
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	614
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	615	=item B<--no-tokenizer>
				616
				617	Boolean flag indicating that no tokenizer should be used.
				618	This is meant to ensure that by default a final token layer always
				619	exists.
				620	If a separate tokenizer is chosen, this flag is ignored.
				621
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	622	=item B<--skip-inline-tokens>
				623
				624	Boolean flag indicating that inline tokens should not
				625	be processed. Defaults to false (meaning inline tokens will be processed).
				626
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	627	=item B<--skip-inline-token-annotations>
				628
				629	Boolean flag indicating that inline token annotations should not
				630	be processed. Defaults to true (meaning inline token annotations
				631	won't be processed).
				632
Akron	ca70a1d	2021-02-25 16:21:31 +0100	[diff] [blame]	633	=item B<--skip-inline-tags> <tags>
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	634
				635	Expects a comma-separated list of tags to be ignored when the structure
				636	is parsed. Content of these tags however will be processed.
				637
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	638	=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
				639
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	640	Expects a regular replacement expression (separated by B<@> between the
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	641	search and the replacement) to convert text id attributes to text sigles
				642	with three parts (separated by B</>).
				643
				644	Example:
				645
				646	tei2korapxml \
				647	--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
				648	-tk - < t/data/icc_german_sample.p5.xml
				649
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	650	Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
				651	sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	652
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	653	=item B<--inline-tokens> <foundry>#[<file>]
				654
				655	Define the foundry and file (without extension)
				656	to store inline token information in.
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	657	Unless C<--skip-inline-token-annotations> is set,
				658	this will contain annotations as well.
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	659	Defaults to C<tokens> and C<morpho>.
				660
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	661	The inline token data will also be stored in the
				662	inline structures file (see I<--inline-structures>),
				663	unless the inline token foundry is prepended
				664	by an B<!> exclamation mark, indicating that inline
				665	tokens are stored exclusively in the inline tokens
				666	file.
				667
				668	Example:
				669
				670	tei2korapxml --inline-tokens '!gingko#morpho' < data.i5.xml > korapxml.zip
				671
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	672	=item B<--inline-structures> <foundry>#[<file>]
				673
				674	Define the foundry and file (without extension)
				675	to store inline structure information in.
				676	Defaults to C<struct> and C<structures>.
				677
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	678	=item B<--base-foundry> <foundry>
				679
				680	Define the base foundry to store newly generated
				681	token information in.
				682	Defaults to C<base>.
				683
				684	=item B<--data-file> <file>
				685
				686	Define the file (without extension)
				687	to store primary data information in.
				688	Defaults to C<data>.
				689
				690	=item B<--header-file> <file>
				691
				692	Define the file name (without extension)
				693	to store header information on
				694	the corpus, document, and text level in.
				695	Defaults to C<header>.
				696
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	697	=item B<--use-tokenizer-sentence-splits\|-s>
				698
				699	Replace existing with, or add new, sentence boundary information
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	700	provided by the tokenizer.
				701	Currently KorAP-tokenizer and certain external tokenizers support
				702	these boundaries.
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	703
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	704	=item B<--tokens-file> <file>
				705
				706	Define the file (without extension)
				707	to store generated token information in
				708	(either from the KorAP tokenizer or an externally called tokenizer).
				709	Defaults to C<tokens>.
				710
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	711	=item B<--log|-l>
				712
				713	Loglevel for I<Log::Any>. Defaults to C<notice>.
				714
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	715	=back
				716
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	717	=head1 ENVIRONMENT VARIABLES
				718
				719	=over 2
				720
				721	=item B<KORAPXMLTEI_DEBUG>
				722
				723	Activate minimal debugging.
				724	Defaults to C<false>.
				725
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	726	=back
				727
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	728	=head1 COPYRIGHT AND LICENSE
				729
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	730	Copyright (C) 2021-2023, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	731
				732	Author: Peter Harders
				733
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	734	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	735
				736	L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
				737	Corpus Analysis Platform at the
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	738	L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	739	member of the
				740	L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
				741
				742	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	743	L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	744
				745	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	746
				747	# NOTES
				748
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	749	## Notes on segfault prevention
				750
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	751	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	752	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				753	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				754	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				755	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.