Blame - script/tei2korapxml - KorAP/KorAP-XML-TEI

blob: 86f7527ab4c31885871216b5d968b98c45405577 [file] [log] [blame]

#!/usr/bin/env perl
use strict;
use warnings;

use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);

use File::Basename qw(dirname);

use Encode qw(decode);

use FindBin;
BEGIN {
  # Make the library directory next to this script take precedence
  unshift @INC, "$FindBin::Bin/../lib";
};

# All project modules are loaded only after @INC was extended above.
# Loading KorAP::XML::TEI before the BEGIN block would resolve it against
# the unmodified @INC and could pick up a different (installed) copy.
use KorAP::XML::TEI qw(increase_auto_textsigle);
use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	27
# Version of this script and the banner printed by --help / --version
our $VERSION = '2.6.0';

our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Debug switch, read once from the environment (off unless set)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# The environment variable still works, but the command line flag
# is the supported way to control inline token annotations
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};

# Set when the inline tokens directory is given with a leading '!':
# inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;

# Set when the inline dependencies directory is given with a leading '!':
# inline dependencies won't be stored in the tokens file
my $inline_deps_exclusive = 0;
# Parse options from the command line.
# Every option stores its value in a file-scoped lexical that is declared
# (and given its default) right inside the option table.
# Aliases are separated by a plain '|' in the option specification.
GetOptions(
  'auto-textsigle|A=s' => \(my $auto_textsigle = ''),
  'root|r=s' => \(my $root_dir = '.'),
  'input|i=s' => \(my $input_fname = ''),
  'output|o=s' => \(my $output_fname = ''),
  'tokenizer-call|tc=s' => \(my $tokenizer_call),
  'tokenizer-korap|tk' => \(my $tokenizer_korap),
  'tokenizer-internal|ti' => \(my $tokenizer_intern),
  'no-tokenizer' => \(my $no_tokenizer),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
  'inline-dependencies=s' => \(my $inline_dependencies),
  'skip-inline-tokens' => \(my $skip_inline_tokens = 0),

  # Negatable flag; the deprecated environment variable only
  # changes its default
  'skip-inline-token-annotations!' => \(
    my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
  'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
  'base-foundry=s' => \(my $base_dir = 'base'),
  'data-file=s' => \(my $data_file = 'data'),
  'header-file=s' => \(my $header_file = 'header'),
  'tokens-file=s' => \(my $tokens_file = 'tokens'),
  'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
  'log|l=s' => \(my $log_level = 'notice'),
  'required-version|rv=s' => \(my $required_version),

  # A single dash on the command line requests reading from stdin
  '' => \(my $stdio),
  'help|h' => sub {
    pod2usage(
      -verbose => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg => $VERSION_MSG,
      -output => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg => $VERSION_MSG,
      -output => '-'
    );
  }
);
				89
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	90
# Establish logger: messages go to STDERR (UTF-8 encoded),
# filtered by the level given with --log
binmode STDERR, ':encoding(UTF-8)';
Log::Any::Adapter->set('Stderr', log_level => $log_level);
if (DEBUG) {
  $log->notice('Debugging is activated');
};
				95
Akron	d3e1d28	2021-02-24 14:51:27 +0100	[diff] [blame]	96
# Abort unless the version requested with --required-version is
# exactly the version of this script.
if ($required_version) {

  # Accept "X.Y.Z" or "X.Y.Z-TRIAL", tolerating surrounding whitespace.
  # The capture is taken from the match result, so a failed match
  # yields undef instead of a stale capture variable.
  my ($wanted) = $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;

  if (!defined $wanted || $wanted ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};
				104
				105
# Optional rewrite rule for xml:id values (--xmlid-to-textsigle),
# given as "<pattern>@<replacement>": the pattern is precompiled here,
# the replacement is applied later when a TEI element with xml:id is read
my ($what, $with);
unless ($xmlid_to_textsigle eq '') {
  ($what, $with) = split /\@/, $xmlid_to_textsigle;
  $what = qr/$what/;
};
				111
# Tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# Remember to skip certain inline tags.
# --skip-inline-tags takes a comma separated list; whitespace around
# the commas is optional and ignored ("a,b" as well as "a , b").
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
  foreach my $tag (split /\s*,\s*/, $skip_inline_tags_str) {
    $skip_inline_tags{$tag} = 1;
  };
};
				123
# Choose the tokenizer that writes the base tokenization.
# $ext_tok stays undefined when only the internal tokenizers
# (or no tokenizer at all) are requested.
my $ext_tok;

# External tokenization
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The tokenizer module is loaded on demand. A load failure is reported
  # explicitly instead of surfacing as an undefined version below.
  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
    $log->error("KorAP-Tokenizer could not be loaded: $@");
    exit(1);
  };

  # The tokenizer has to carry the same version as this script
  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // '';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};

# When the tokenizer provides the sentence splits,
# inline <s> tags are skipped
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	156
# Internal tokenization: both tokenizers are always instantiated,
# they are only used when --tokenizer-internal is set
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;

# The --inline-* options have the form "<directory>#<file>".
# Appending "#<default file>" before splitting supplies the file name
# whenever the option value contains no '#'.

# Directory and file for the inline structure information
my ($_structure_dir, $_structure_file) = split /#/, "${inline_structures}#structure";

# Directory and file for the inline token information
my ($_tokens_dir, $_tokens_file) = split /#/, "${inline_tokens}#morpho";

# A leading '!' marks the inline tokens as exclusive and is
# not part of the directory name
$inline_tokens_exclusive = 1 if $_tokens_dir =~ s/^!//;

# Directory and file for the inline dependency information (optional)
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = split /#/, "${inline_dependencies}#dependency";

  # From here on the option only serves as a flag
  $inline_dependencies = 1;

  # Same '!' convention as for the tokens directory
  $inline_deps_exclusive = 1 if $_dep_dir && $_dep_dir =~ s/^!//;
};
				186
				187
# Zipper collecting all output streams
# (constructed from --root and --output)
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);

# State shared between the iterations of the main loop:

# - directory of the current text (below $root_dir);
#   empty as long as no text sigle is known
my $dir = '';

# - escaped version of the current text id
my $text_id_esc;

# - encoding used to decode input lines
#   (replaced when the XML declaration names another one)
my $input_enc = 'UTF-8';

# - number of text lines seen in the current text
#   (needed for whitespace handling)
my $text_line = 0;
				202
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	203
# Input file handle: stdin when a single dash was passed,
# the file given with --input otherwise
my $input_fh;

# Single dash was set
if ($stdio) {
  $input_fh = *STDIN;
}

# Input flag was passed
elsif ($input_fname ne '') {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# No input to process: show the short usage and stop
else {
  pod2usage(
    -verbose => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg => $VERSION_MSG,
    -output => '-'
  );
  exit;
};
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	229
# Read the input as raw bytes; every line is decoded explicitly in the
# main loop. According to the original note this prevents segfaulting
# (see notes on segfault prevention).
binmode($input_fh);

# Create inline parser object, configured with the settings
# derived from the command line
my @inline_settings = (
  $skip_inline_tokens,
  \%skip_inline_tags,
  $inline_tokens_exclusive,
  $inline_dependencies
);
my $inline = KorAP::XML::TEI::Inline->new(@inline_settings);
				241
				242
# Reading input document line by line.
# Three kinds of lines trigger processing:
#   - the opening text-body tag: the body is collected up to the closing
#     tag, parsed and written to the zip archive
#   - a <TEI ...> element with xml:id: the text sigle is derived from the id
#   - an opening idsHeader/teiHeader tag: the header is parsed and written
MAIN: while (<$input_fh>) {

  # Remove XML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding from the XML declaration
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
    my ($prefix, $suffix) = ($1, $2);

    # Only whitespace is allowed before and after the opening tag
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # handed over to the inline parser below
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

        # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

        # Only whitespace is allowed before and after the closing tag
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # Without a text directory there is nowhere to write to
        if ($dir eq '') {
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($inline->data->data, 0, 200)
            );
          next MAIN;
        };

        # Parse inline structure
        $inline->parse($text_id_esc, \$text_buffer);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        my $data = $inline->data;

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($inline->structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        unless ($inline->structures->empty) {
          $inline->structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $inline->tokens->empty) {
          $inline->tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            # Serialization mode:
            #   0 = tokens without inline annotations
            #       (--skip-inline-token-annotations is in effect)
            #   1 = tokens with inline annotations
            #   4 = passed instead of 1 when inline dependencies are
            #       exclusive ('!' prefix); NOTE(review): exact meaning is
            #       defined by the serializer, not visible here
            ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
          );
        };

        # ~ write dependencies ~
        unless ($inline->dependencies->empty) {
          $inline->dependencies->to_zip(
            $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
            $text_id_esc,
            3 # = dependency serialization
          );
        };

        # reinit.
        $dir = '';

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one non-tag character
      if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };
  }

  # TEI element carrying the text id as xml:id
  elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
    my ($leadin, $id) = ($1, $3);
    my $sigle = $id;

    # Apply the rewrite rule given with --xmlid-to-textsigle
    if ($what) {

      # e.g. s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
      # NOTE(review): string eval is required to expand $1, $2, ... in the
      # replacement. Pattern and replacement come from the command line
      # and must be trusted; a '|' inside either of them breaks the
      # substitution. The rule is applied to a copy, so $_ keeps the
      # current input line for the error messages below.
      eval "\$sigle =~ s|$what|$with|";
      $log->warn("Conversion of text id `$id' failed: $@") if $@;
      $log->debug("Converted text id `$id' to sigle `$sigle'");
    };
    $sigle =~ s/\./-/g;

    my @parts = split(/[\/_]/, $sigle);
    if (@parts != 3) {
      die $log->fatal(
        "input line number $.: " .
          "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
          "=> Aborting (line=$_)");
    };

    # Remember dir and sigle of the text
    $dir = join("/", @parts);
    $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
    $log->notice("$0: text_id=$text_id_esc");

    # Only whitespace is allowed before the tag
    if ($leadin !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };
  }

  # Start of header section
  elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
    my ($leadin, $content) = ($1, "$2\n");

    # Only whitespace is allowed before the tag
    if ($leadin !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };

    # Parse header; the auto text sigle is only handed over
    # as long as no text id is known
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
    if ($auto_textsigle) {
      $auto_textsigle = increase_auto_textsigle($auto_textsigle);
      $log->debug("Auto-incremented text sigle to $auto_textsigle");
    };

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};
Peter Harders	d892a58	2020-02-12 15:45:22 +0100	[diff] [blame]	497
# Finish up: finalize the archive first, then shut down the
# external tokenizer (if one was started) and release the input
$zipper->close;

if ($ext_tok) {
  $ext_tok->close;
};

close($input_fh);
				503
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	504
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	505	__END__
				506
				507	=pod
				508
				509	=encoding utf8
				510
				511	=head1 NAME
				512
				513	tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
				514
				515	=head1 SYNOPSIS
				516
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	517	cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	518
				519	=head1 DESCRIPTION
				520
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	521	C<tei2korapxml> is a script to convert TEI P5 and
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	522	L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	523	based documents to the
				524	L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	525
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	526	This program is usually called from inside another script.
				527
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	528	=head1 FORMATS
				529
				530	=head2 Input restrictions
				531
				532	=over 2
				533
				534	=item
				535
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	536	TEI P5 formatted input with certain restrictions:
				537
				538	=over 4
				539
				540	=item
				541
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	542	B<mandatory>: text-header with integrated textsigle
				543	(or convertible identifier), text-body
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	544
				545	=item
				546
				547	B<optional>: corp-header with integrated corpsigle,
				548	doc-header with integrated docsigle
				549
				550	=back
				551
				552	=item
				553
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	554	All tokens inside the primary text may not be
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	555	newline separated, because newlines are removed
Akron	0c41ab3	2020-09-29 07:33:33 +0200	[diff] [blame]	556	(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	557	into blanks between 2 tokens could lead to additional blanks,
				558	where there should be none (e.g.: punctuation characters like C<,> or
				559	C<.> should not be separated from their predecessor token).
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	560	(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	561
Akron	940ca6f	2021-10-11 12:38:39 +0200	[diff] [blame]	562	=item
				563
				564	Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
				565	need to be defined in the same line as the header tag.
				566
Akron	ee434b1	2020-07-08 12:53:01 +0200	[diff] [blame]	567	=back
				568
				569	=head2 Notes on the output
				570
				571	=over 2
				572
				573	=item
				574
				575	zip file output (default on C<stdout>) with utf8 encoded entries
				576	(which together form the KorAP-XML format)
				577
				578	=back
				579
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	580	=head1 INSTALLATION
				581
Akron	d26319b	2023-01-12 15:34:41 +0100	[diff] [blame]	582	C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietz	e83a4e9	2021-03-16 20:51:26 +0100	[diff] [blame]	583	When these requirements are met, the preferred way to install the script is
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	584	to use L<cpanm\|App::cpanminus>.
				585
				586	$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
				587
				588	In case everything went well, the C<tei2korapxml> tool will
				589	be available on your command line immediately.
Peter Harders	6f526a3	2020-06-29 21:44:41 +0200	[diff] [blame]	590
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	591	Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
				592
				593	=head1 OPTIONS
				594
				595	=over 2
				596
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	597	=item B<--input\|-i>
				598
				599	The input file to process. If no specific input is defined and a single
				600	dash C<-> is passed as an argument, data is read from C<STDIN>.
				601
Akron	132bdeb	2024-06-06 14:28:56 +0200	[diff] [blame]	602	=item B<--output\|-o>
				603
				604	The output zip file to be created. If no specific output is defined,
				605	data is written to C<STDOUT>.
Akron	a2cb281	2021-10-30 10:29:08 +0200	[diff] [blame]	606
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	607	=item B<--root\|-r>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	608
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	609	The root directory for output. Defaults to C<.>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	610
				611	=item B<--help\|-h>
				612
				613	Print help information.
				614
				615	=item B<--version\|-v>
				616
				617	Print version information.
				618
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	619	=item B<--tokenizer-korap\|-tk>
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	620
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	621	Use the standard KorAP/DeReKo tokenizer.
				622
				623	=item B<--tokenizer-internal\|-ti>
				624
				625	Tokenize the data using two embedded tokenizers,
				626	that will take an I<aggressive> and a I<conservative>
				627	approach.
Akron	2520a34	2022-03-29 18:18:05 +0200	[diff] [blame]	628
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	629	=item B<--tokenizer-call\|-tc>
				630
				631	Call an external tokenizer process, that will tokenize
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	632	from STDIN and output the offsets of all tokens.
				633
				634	Texts are separated using C<\x04\n>. The external process
				635	should add a new line per text.
				636
				637	If the L</--use-tokenizer-sentence-splits> option is activated,
				638	sentences are marked by offset as well in new lines.
				639
				640	To use L<Datok\|https://github.com/KorAP/Datok> including sentence
				641	splitting, call C<tei2korapxml> as follows:
				642
				643	$ cat corpus.i5.xml \| tei2korapxml -s \
				644	$ -tc 'datok tokenize \
				645	$ -t ./tokenizer.matok \
				646	$ -p --newline-after-eot --no-sentences \
				647	$ --no-tokens --sentence-positions -' - \
				648	$ > corpus.korapxml.zip
Akron	4e603a5	2020-07-27 14:23:49 +0200	[diff] [blame]	649
Akron	b93fabb	2023-01-13 12:05:44 +0100	[diff] [blame]	650	=item B<--no-tokenizer>
				651
				652	Boolean flag indicating that no tokenizer should be used.
				653	This is meant to ensure that by default a final token layer always
				654	exists.
				655	If a separate tokenizer is chosen, this flag is ignored.
				656
Akron	75d6314	2021-02-23 18:40:56 +0100	[diff] [blame]	657	=item B<--skip-inline-tokens>
				658
				659	Boolean flag indicating that inline tokens should not
				660	be processed. Defaults to false (meaning inline tokens will be processed).
				661
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	662	=item B<--skip-inline-token-annotations>
				663
				664	Boolean flag indicating that inline token annotations should not
				665	be processed. Defaults to true (meaning inline token annotations
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	666	won't be processed). Can be negated with
				667	C<--no-skip-inline-token-annotations>.
Akron	692d17d	2021-03-05 13:21:03 +0100	[diff] [blame]	668
Akron	ca70a1d	2021-02-25 16:21:31 +0100	[diff] [blame]	669	=item B<--skip-inline-tags> <tags>
Akron	54c3ff1	2021-02-25 11:33:37 +0100	[diff] [blame]	670
				671	Expects a comma-separated list of tags to be ignored when the structure
				672	is parsed. Content of these tags however will be processed.
				673
Marc Kupietz	fc3a0ee	2024-07-05 16:58:16 +0200	[diff] [blame^]	674	=item B<--auto-textsigle> <textsigle>
				675
				676	Expects a text sigle that serves as a fallback if no text sigles
				677	are given in the input data.
				678	The auto text sigle will be incremented for each text processed.
				679
				680	Example:
				681
				682	tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
				683	< data.i5.xml > korapxml.zip
				684
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	685	=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
				686
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	687	Expects a regular replacement expression (separated by B<@> between the
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	688	search and the replacement) to convert text id attributes to text sigles
				689	with three parts (separated by B</>).
				690
				691	Example:
				692
				693	tei2korapxml \
				694	--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
				695	-tk - < t/data/icc_german_sample.p5.xml
				696
Akron	e48bec4	2023-01-05 12:18:45 +0100	[diff] [blame]	697	Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
				698	sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietz	a671ae5	2022-12-22 16:28:14 +0100	[diff] [blame]	699
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	700	=item B<--inline-tokens> <foundry>#[<file>]
				701
				702	Define the foundry and file (without extension)
				703	to store inline token information in.
Akron	8a0c4bf	2021-03-16 16:51:21 +0100	[diff] [blame]	704	Unless C<--skip-inline-token-annotations> is set,
				705	this will contain annotations as well.
Akron	1a5271a	2021-02-18 13:18:15 +0100	[diff] [blame]	706	Defaults to C<tokens> and C<morpho>.
				707
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	708	The inline token data will also be stored in the
				709	inline structures file (see I<--inline-structures>),
				710	unless the inline token foundry is prepended
				711	by an B<!> exclamation mark, indicating that inline
				712	tokens are stored exclusively in the inline tokens
				713	file.
				714
				715	Example:
				716
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	717	tei2korapxml --no-tokenizer --inline-tokens \
				718	'!gingko#morpho' < data.i5.xml > korapxml.zip
				719
				720	=item B<--inline-dependencies> <foundry>#[<file>]
				721
				722	Define the foundry and file (without extension)
				723	to store inline dependency information in.
				724	Defaults to the layer of C<dependency> and
				725	will be ignored if not set (which means, dependency
				726	attributes will be stored in the inline tokens file,
				727	if not skipped).
				728
				729	The dependency data will also be stored in the
				730	inline token file (see I<--inline-tokens>),
				731	unless the inline dependencies foundry is prepended
				732	by an B<!> exclamation mark, indicating that inline
				733	dependency data is stored exclusively in the inline
				734	dependencies file.
				735
				736	Example:
				737
				738	tei2korapxml --no-tokenizer --inline-dependencies \
				739	'gingko#dependency' < data.i5.xml > korapxml.zip
				740
Akron	e2819a1	2021-10-12 15:52:55 +0200	[diff] [blame]	741
Akron	dd0be8f	2021-02-18 19:29:41 +0100	[diff] [blame]	742	=item B<--inline-structures> <foundry>#[<file>]
				743
				744	Define the foundry and file (without extension)
				745	to store inline structure information in.
				746	Defaults to C<struct> and C<structures>.
				747
Akron	26a7152	2021-02-19 10:27:37 +0100	[diff] [blame]	748	=item B<--base-foundry> <foundry>
				749
				750	Define the base foundry to store newly generated
				751	token information in.
				752	Defaults to C<base>.
				753
				754	=item B<--data-file> <file>
				755
				756	Define the file (without extension)
				757	to store primary data information in.
				758	Defaults to C<data>.
				759
				760	=item B<--header-file> <file>
				761
				762	Define the file name (without extension)
				763	to store header information on
				764	the corpus, document, and text level in.
				765	Defaults to C<header>.
				766
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	767	=item B<--use-tokenizer-sentence-splits\|-s>
				768
				769	Replace existing with, or add new, sentence boundary information
Akron	1148478	2021-11-03 20:12:14 +0100	[diff] [blame]	770	provided by the tokenizer.
				771	Currently KorAP-tokenizer and certain external tokenizers support
				772	these boundaries.
Marc Kupietz	985da0c	2021-02-15 19:29:50 +0100	[diff] [blame]	773
Akron	91705d7	2021-02-19 10:59:45 +0100	[diff] [blame]	774	=item B<--tokens-file> <file>
				775
				776	Define the file (without extension)
				777	to store generated token information in
				778	(either from the KorAP tokenizer or an externally called tokenizer).
				779	Defaults to C<tokens>.
				780
Akron	3378dfd	2020-08-01 15:01:36 +0200	[diff] [blame]	781	=item B<--log\|-l>
				782
				783	Loglevel for I<Log::Any>. Defaults to C<notice>.
				784
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	785	=back
				786
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	787	=head1 ENVIRONMENT VARIABLES
				788
				789	=over 2
				790
				791	=item B<KORAPXMLTEI_DEBUG>
				792
				793	Activate minimal debugging.
				794	Defaults to C<false>.
				795
Akron	b364947	2020-09-29 08:24:46 +0200	[diff] [blame]	796	=back
				797
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	798	=head1 COPYRIGHT AND LICENSE
				799
Akron	6b1f26b	2024-09-19 11:35:32 +0200	[diff] [blame]	800	Copyright (C) 2021-2024, L<IDS Mannheim\|https://www.ids-mannheim.de/>
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	801
				802	Author: Peter Harders
				803
Akron	aabd095	2020-09-29 07:35:08 +0200	[diff] [blame]	804	Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	805
				806	L<KorAP::XML::TEI> is developed as part of the L<KorAP\|https://korap.ids-mannheim.de/>
				807	Corpus Analysis Platform at the
Akron	d72baca	2021-07-23 13:25:32 +0200	[diff] [blame]	808	L<Leibniz Institute for the German Language (IDS)\|https://www.ids-mannheim.de/>,
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	809	member of the
				810	L<Leibniz-Gemeinschaft\|http://www.leibniz-gemeinschaft.de/>.
				811
				812	This program is free software published under the
Marc Kupietz	e955ecc	2021-02-17 17:42:01 +0100	[diff] [blame]	813	L<BSD-2 License\|https://opensource.org/licenses/BSD-2-Clause>.
Akron	d949e18	2020-02-14 12:23:57 +0100	[diff] [blame]	814
				815	=cut
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	816
				817	# NOTES
				818
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	819	## Notes on segfault prevention
				820
Akron	9157792	2021-02-19 10:32:54 +0100	[diff] [blame]	821	binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akron	f8088e6	2021-02-18 16:18:59 +0100	[diff] [blame]	822	(see notes on 'PerlIO layers' in 'man XML::LibXML'),
				823	removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
				824	see in perluniintro: You can switch encodings on an already opened stream by using "binmode()"
				825	see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.