#!/usr/bin/env perl
use strict;
use warnings;

use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);

use File::Basename qw(dirname);

use Encode qw(decode);

# Prefer the library shipped next to this script over an installed copy.
# The @INC manipulation has to happen at compile time and before the first
# KorAP::XML::TEI module is loaded - a "use" statement placed above this
# block would still be resolved against the unmodified @INC.
use FindBin;
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

use KorAP::XML::TEI qw!increase_auto_textsigle remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
Peter Hardersd892a582020-02-12 15:45:22 +010027
# Version of the script (compared against --required-version and the
# version of the KorAP tokenizer wrapper)
our $VERSION = '2.6.0';

# Banner printed in front of help and version output
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set KORAPXMLTEI_DEBUG to 1 for minimal more debug output
# (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};

# Set to 1, if inline tokens should not be stored in the structure file
my $inline_tokens_exclusive = 0;

# Set to 1, if inline dependencies should not be stored in the tokens file
my $inline_deps_exclusive = 0;
46
# Handler for --help: print the complete usage information
my $print_help = sub {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Handler for --version: print the version banner
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line.
# All option variables are declared in place, so they are available
# as file-level lexicals for the rest of the script.
GetOptions(
  'auto-textsigle|A=s'              => \(my $auto_textsigle = ''),
  'root|r=s'                        => \(my $root_dir = '.'),
  'input|i=s'                       => \(my $input_fname = ''),
  'output|o=s'                      => \(my $output_fname = ''),
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'no-tokenizer'                    => \(my $no_tokenizer),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s'                 => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'             => \(my $inline_structures = 'struct#structure'),
  'inline-dependencies=s'           => \(my $inline_dependencies),
  'skip-inline-tokens'              => \(my $skip_inline_tokens = 0),

  # The deprecated environment variable KORAPXMLTEI_INLINE
  # still flips the default of this negatable flag
  'skip-inline-token-annotations!'  => \(
    my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
  'skip-inline-tags=s'              => \(my $skip_inline_tags_str = ''),
  'base-foundry=s'                  => \(my $base_dir = 'base'),
  'data-file=s'                     => \(my $data_file = 'data'),
  'header-file=s'                   => \(my $header_file = 'header'),
  'tokens-file=s'                   => \(my $tokens_file = 'tokens'),
  'xmlid-to-textsigle|x=s'          => \(my $xmlid_to_textsigle = ''),
  'log|l=s'                         => \(my $log_level = 'notice'),
  'required-version|rv=s'           => \(my $required_version),

  # A single dash requests reading from STDIN
  ''                                => \(my $stdio),
  'help|h'                          => $print_help,
  'version|v'                       => $print_version
);
89
Akrond3e1d282021-02-24 14:51:27 +010090
# Establish logger (log messages are written UTF-8 encoded to STDERR)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# Abort, if a certain version of the script was requested
# that differs from the running one
if ($required_version) {

  # Extract the plain version number (surrounding whitespace is ignored);
  # $wanted stays undefined, if the value is not a well-formed version
  my ($wanted) = $required_version =~ /^\s*(\d+\.\d+\.\d+(-TRIAL)?)\s*$/;

  unless ($wanted && $wanted eq $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};
104
105
# Search pattern ($what) and replacement ($with) for the conversion
# of xml:id attributes to text sigles (see --xmlid-to-textsigle).
# Both parts are separated by '@' on the command line.
my ($what, $with);
unless ($xmlid_to_textsigle eq '') {
  ($what, $with) = split('@', $xmlid_to_textsigle);
  $what = qr!$what!;
};

# Name of the tag (without attributes), that contains the primary text
my $_TEXT_BODY = 'text';

# Remember the inline tags to skip (comma-separated on the command line)
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
  $skip_inline_tags{$_} = 1 for split(/\s*,\s*/, $skip_inline_tags_str);
};
123
# Tokenizer used for the base tokenization
# (either an external process or the KorAP tokenizer)
my $ext_tok;

# External tokenization
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The tokenizer wrapper is loaded on demand only. Report a failing
  # load explicitly instead of running into a misleading
  # version mismatch caused by an undefined version.
  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
    $log->error("KorAP-Tokenizer could not be loaded: $@");
    exit(1);
  };

  # The wrapper has to have the same version as this script
  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // 'unknown';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};

# Sentence boundaries are provided by the tokenizer,
# so inline <s> tags are skipped
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};

# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akrond3e1d282021-02-24 14:51:27 +0100160
Peter Harders41c35622020-07-12 01:16:22 +0200161
# Foundry (directory) and file name for all inline structure information.
# The appended '#structure' provides the default file name,
# if only a foundry was passed.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';

# Foundry (directory) and file name for all inline token information
# ('#morpho' provides the default file name)
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';

# A leading exclamation mark requests, that inline tokens
# are stored exclusively in the inline tokens file
if ($_tokens_dir =~ s/^!//) {
  $inline_tokens_exclusive = 1;
};


# Foundry (directory) and file name for inline dependency information
# (only set, if --inline-dependencies was passed)
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';

  # From here on the option only serves as a boolean flag
  $inline_dependencies = 1;

  # A leading exclamation mark requests, that dependencies
  # are stored exclusively in the inline dependencies file
  if ($_dep_dir && $_dep_dir =~ s/^!//) {
    $inline_deps_exclusive = 1;
  };
};
186
187
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);

# Text directory (below $root_dir); empty as long as no text sigle is known
my $dir = '';

# Escaped version of text id
my $text_id_esc;

# Default encoding of the text
my $input_enc = 'UTF-8';

# Counter for text lines (needed for whitespace handling)
my $text_line = 0;


# Input file handle
my $input_fh;

# Single dash was set: read from STDIN
if ($stdio) {
  $input_fh = *STDIN;
}

# Input flag or trailing file arguments were passed
elsif (@ARGV || $input_fname ne '') {

  # Without --input the first trailing argument is the first input file
  $input_fname = shift @ARGV if $input_fname eq '';

  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# No input to process
else {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
  exit;
};

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;


# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
  $skip_inline_tokens,
  \%skip_inline_tags,
  $inline_tokens_exclusive,
  $inline_dependencies
);
243
# Process all input documents: the handle opened above first,
# followed by every remaining file name given on the command line
while (1) {
  $log->notice("Reading input document $input_fname") if ($input_fname ne '');

  MAIN:
  while (<$input_fh>) {

    # remove XML (multi-line) comments (<!--...-->)
    $_ = remove_xml_comments($input_fh, $_);

    # Set input encoding based on the XML declaration
    # (the value is enclosed by the quote character captured in $1)
    if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
      $input_enc = $2;
      next;
    };

    $_ = decode($input_enc, $_);
    $_ = replace_entities($_);

    # Start of text body
    if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

      # Remember the second capture, as the following
      # pattern matches reset the capture variables
      my $suffix = $2;

      if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
        die $log->fatal("input line number $.: " .
                          "line with opening text-body tag '${_TEXT_BODY}' " .
                          "contains additional information ... => Aborting (line=$_)");
      };

      # Text body data extracted from input document ($input_fh),
      # further processed by XML::LibXML::Reader
      my $text_buffer = '';

      # Iterate over all lines in the text body
      while (<$input_fh>) {

        $_ = remove_xml_comments($input_fh, $_);
        $_ = decode($input_enc, $_);
        $_ = replace_entities($_);

        # End of text body
        if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

          # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

          # The closing tag has to be the only content of the line
          if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
            die $log->fatal("input line number $.: " .
                              "line with closing text-body tag '${_TEXT_BODY}'" .
                              " contains additional information ... => Aborting (line=$_)");
          };

          # No text directory is known, so there is no place to store the text
          if ($dir eq '') {
            $log->warn(
              "Maybe empty textSigle => skipping this text ...\n" .
                'data=' . substr($inline->data->data, 0, 200)
              );
            next MAIN;
          };

          # Parse inline structure
          $inline->parse($text_id_esc, \$text_buffer);

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
          };

          my $data = $inline->data;

          # Write data.xml
          $data->to_zip(
            $zipper->new_stream("$dir/${data_file}.xml"),
            $text_id_esc
          );

          # Tokenize with external tokenizer
          if ($ext_tok) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
              $text_id_esc
            );

            if ($use_tokenizer_sentence_splits) {
              $ext_tok->sentencize_from_previous_input($inline->structures);
            };
          };

          # Tokenize with internal tokenizer
          if ($tokenizer_intern) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            )->reset;

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            )->reset;
          };

          # ~ write structures ~
          unless ($inline->structures->empty) {
            $inline->structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          unless ($skip_inline_tokens || $inline->tokens->empty) {
            $inline->tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
              $text_id_esc,
              # Either 0 = tokens without inline or
              #        1 = tokens with inline
              # 4 is passed, if dependencies are stored exclusively in
              # their own file (presumably inline annotations without
              # dependencies - confirm in the serializer)
              ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
            );
          };

          # ~ write dependencies ~
          unless ($inline->dependencies->empty) {
            $inline->dependencies->to_zip(
              $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
              $text_id_esc,
              3 # = dependency serialization
            );
          };


          # reinit.
          $dir = '';

          next MAIN;
        };


        # ~ whitespace handling ~

        # Fix whitespaces (see notes on whitespace fixing)

        # TODO:
        #   Maybe it's best, to keep the stripping of whitespace and
        #   to just remove the if-clause and to insert a blank by default
        #   (with possibly an option on how newlines in primary text should
        #   be handled (stripped or replaced by a whitespace)).

        # Remove consecutive whitespace at beginning and end (mostly one newline)
        s/^\s+//;
        s/\s+$//;

        # NOTE:
        #   this is only relevant, if a text consists of more than one line

        # TODO:
        #   find a better solution, or create a warning, if a text has more
        #   than one line ($text_line > 1)

        # TODO:
        #   do testing with 2 different corpora
        #   (one with only one-line texts, the other with several lines per text)

        # line contains at least one non-tag character
        if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

          # Increment counter for text lines
          $text_line++;

          # insert blank before 1st character
          # (for 2nd line and consecutive lines)
          $_ = ' ' . $_ if $text_line > 1;
        }

        # add line to buffer
        $text_buffer .= $_;
      };
    }

    # TEI element with an xml:id attribute, that serves as the text id
    elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
      my $leadin = $1;
      my $id = $3;
      my $sigle = $3;

      # Convert the text id to a text sigle (see --xmlid-to-textsigle)
      if ($what) {

        # The substitution operates on $_, so work on a localized copy
        # to keep the current input line available for error messages
        local $_ = $id;

        # The replacement may refer to captures ($1, $2 ...), therefore
        # the substitution is compiled at runtime, e.g. to
        # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
        # NOTE(review): the expression originates from the command line and
        #   is evaluated as code - only pass trusted values.
        unless (eval "s|$what|$with|; 1") {
          die $log->fatal(
            "input line number $.: " .
              "text id `$id' could not be converted using --xmlid-to-textsigle: $@");
        };

        $sigle = $_;
        $log->debug("Converted text id `$id' to sigle `$sigle'");
      };
      $sigle =~ s/\./-/g;

      my @parts = split(/[\/_]/, $sigle);
      if (@parts != 3) {
        die $log->fatal(
          "input line number $.: " .
            "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
            "=> Aborting (line=$_)");
      };

      $dir = join("/", @parts);
      $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
      $log->notice("$0: text_id=$text_id_esc");

      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      };
    }

    # Start of header section
    elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
      my $content = "$2\n";

      if ($1 !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      };

      # Parse header (the auto text sigle serves as fallback,
      # if no text id is known yet)
      my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
      if ($auto_textsigle) {
        $auto_textsigle = increase_auto_textsigle($auto_textsigle);
        $log->debug("Auto-incremented text sigle to $auto_textsigle");
      };

      # Header was parseable
      if ($header) {

        # Write header to zip
        my $file = $header->dir . '/' . $header_file . '.xml';

        $log->debug("Writing file $file") if DEBUG;

        $header->to_zip($zipper->new_stream($file));

        # Header is for text level
        if ($header->type eq 'text') {

          # Remember dir and sigles
          $dir = $header->dir;
          $text_id_esc = $header->id_esc;

          # log output for seeing progression
          $log->notice("$0: text_id=$text_id_esc");

          # Reset counter for text lines
          # (needed for whitespace handling)
          $text_line = 0;
        };
      };
    };
  };

  $text_id_esc = $auto_textsigle if ($auto_textsigle);

  # Continue with the next input file given on the command line (if any)
  last unless @ARGV;
  $input_fname = shift @ARGV;

  # Fail loudly (as for the first input file) instead of
  # silently dropping all remaining input documents
  unless (open($input_fh, '<', $input_fname)) {
    die $log->fatal("File '$input_fname' could not be opened.");
  };

  # Prevents segfaulting (see notes on segfault prevention)
  binmode $input_fh;
};

$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;
508
Peter Harders6f526a32020-06-29 21:44:41 +0200509
Akrond949e182020-02-14 12:23:57 +0100510__END__
511
512=pod
513
514=encoding utf8
515
516=head1 NAME
517
518tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
519
520=head1 SYNOPSIS
521
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200522 cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip
523 tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akrond949e182020-02-14 12:23:57 +0100524
525=head1 DESCRIPTION
526
Akronee434b12020-07-08 12:53:01 +0200527C<tei2korapxml> is a script to convert TEI P5 and
Akrond72baca2021-07-23 13:25:32 +0200528L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akronee434b12020-07-08 12:53:01 +0200529based documents to the
530L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders6f526a32020-06-29 21:44:41 +0200531
Akrond949e182020-02-14 12:23:57 +0100532This program is usually called from inside another script.
533
Akronee434b12020-07-08 12:53:01 +0200534=head1 FORMATS
535
536=head2 Input restrictions
537
538=over 2
539
540=item
541
Akronee434b12020-07-08 12:53:01 +0200542TEI P5 formatted input with certain restrictions:
543
544=over 4
545
546=item
547
Akrone48bec42023-01-05 12:18:45 +0100548B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
Akronee434b12020-07-08 12:53:01 +0200550
551=item
552
553B<optional>: corp-header with integrated corpsigle,
554doc-header with integrated docsigle
555
556=back
557
558=item
559
Akron0c41ab32020-09-29 07:33:33 +0200560All tokens inside the primary text may not be
newline separated, because newlines are removed
Akron0c41ab32020-09-29 07:33:33 +0200562(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akronee434b12020-07-08 12:53:01 +0200563into blanks between 2 tokens could lead to additional blanks,
564where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
Akron8a0c4bf2021-03-16 16:51:21 +0100566(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akronee434b12020-07-08 12:53:01 +0200567
Akron940ca6f2021-10-11 12:38:39 +0200568=item
569
570Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
571need to be defined in the same line as the header tag.
572
Akronee434b12020-07-08 12:53:01 +0200573=back
574
575=head2 Notes on the output
576
577=over 2
578
579=item
580
581zip file output (default on C<stdout>) with utf8 encoded entries
582(which together form the KorAP-XML format)
583
584=back
585
Akrond949e182020-02-14 12:23:57 +0100586=head1 INSTALLATION
587
Akrond26319b2023-01-12 15:34:41 +0100588C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietze83a4e92021-03-16 20:51:26 +0100589When these requirements are met, the preferred way to install the script is
Akrond949e182020-02-14 12:23:57 +0100590to use L<cpanm|App::cpanminus>.
591
592 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
593
594In case everything went well, the C<tei2korapxml> tool will
595be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200596
Akrond949e182020-02-14 12:23:57 +0100597Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
598
599=head1 OPTIONS
600
601=over 2
602
Akrona2cb2812021-10-30 10:29:08 +0200603=item B<--input|-i>
604
605The input file to process. If no specific input is defined and a single
606dash C<-> is passed as an argument, data is read from C<STDIN>.
607
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200608Instead of using C<-i> input files can also be defined as trailing arguments
609to the command:
610
611 tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
612
Akron132bdeb2024-06-06 14:28:56 +0200613=item B<--output|-o>
614
615The output zip file to be created. If no specific output is defined,
616data is written to C<STDOUT>.
Akrona2cb2812021-10-30 10:29:08 +0200617
Akron4e603a52020-07-27 14:23:49 +0200618=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100619
Akron4e603a52020-07-27 14:23:49 +0200620The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100621
622=item B<--help|-h>
623
624Print help information.
625
626=item B<--version|-v>
627
628Print version information.
629
Akrone48bec42023-01-05 12:18:45 +0100630=item B<--tokenizer-korap|-tk>
Akron2520a342022-03-29 18:18:05 +0200631
Akrone48bec42023-01-05 12:18:45 +0100632Use the standard KorAP/DeReKo tokenizer.
633
634=item B<--tokenizer-internal|-ti>
635
636Tokenize the data using two embedded tokenizers,
637that will take an I<aggressive> and a I<conservative>
638approach.
Akron2520a342022-03-29 18:18:05 +0200639
Akron4e603a52020-07-27 14:23:49 +0200640=item B<--tokenizer-call|-tc>
641
642Call an external tokenizer process, that will tokenize
Akron11484782021-11-03 20:12:14 +0100643from STDIN and outputs the offsets of all tokens.
644
645Texts are separated using C<\x04\n>. The external process
646should add a new line per text.
647
648If the L</--use-tokenizer-sentence-splits> option is activated,
649sentences are marked by offset as well in new lines.
650
651To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
653
  $ cat corpus.i5.xml | tei2korapxml -s \
      -tc 'datok tokenize \
      -t ./tokenizer.matok \
      -p --newline-after-eot --no-sentences \
      --no-tokens --sentence-positions -' - \
      > corpus.korapxml.zip
Akron4e603a52020-07-27 14:23:49 +0200660
Akronb93fabb2023-01-13 12:05:44 +0100661=item B<--no-tokenizer>
662
663Boolean flag indicating that no tokenizer should be used.
664This is meant to ensure that by default a final token layer always
665exists.
666If a separate tokenizer is chosen, this flag is ignored.
667
Akron75d63142021-02-23 18:40:56 +0100668=item B<--skip-inline-tokens>
669
670Boolean flag indicating that inline tokens should not
671be processed. Defaults to false (meaning inline tokens will be processed).
672
Akron692d17d2021-03-05 13:21:03 +0100673=item B<--skip-inline-token-annotations>
674
675Boolean flag indicating that inline token annotations should not
676be processed. Defaults to true (meaning inline token annotations
Akron6b1f26b2024-09-19 11:35:32 +0200677won't be processed). Can be negated with
678C<--no-skip-inline-token-annotations>.
Akron692d17d2021-03-05 13:21:03 +0100679
Akronca70a1d2021-02-25 16:21:31 +0100680=item B<--skip-inline-tags> <tags>
Akron54c3ff12021-02-25 11:33:37 +0100681
682Expects a comma-separated list of tags to be ignored when the structure
683is parsed. Content of these tags however will be processed.
684
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +0200685=item B<--auto-textsigle> <textsigle>
686
Expects a text sigle that serves as fallback if no text sigles
688are given in the input data.
689The auto text sigle will be incremented for each text processed.
690
691Example:
692
693 tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
694 < data.i5.xml > korapxml.zip
695
Marc Kupietza671ae52022-12-22 16:28:14 +0100696=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
697
Akrone48bec42023-01-05 12:18:45 +0100698Expects a regular replacement expression (separated by B<@> between the
Marc Kupietza671ae52022-12-22 16:28:14 +0100699search and the replacement) to convert text id attributes to text sigles
700with three parts (separated by B</>).
701
702Example:
703
704 tei2korapxml \
705 --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
706 -tk - < t/data/icc_german_sample.p5.xml
707
Akrone48bec42023-01-05 12:18:45 +0100708Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
709sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietza671ae52022-12-22 16:28:14 +0100710
Akron1a5271a2021-02-18 13:18:15 +0100711=item B<--inline-tokens> <foundry>#[<file>]
712
713Define the foundry and file (without extension)
714to store inline token information in.
Akron8a0c4bf2021-03-16 16:51:21 +0100715Unless C<--skip-inline-token-annotations> is set,
716this will contain annotations as well.
Akron1a5271a2021-02-18 13:18:15 +0100717Defaults to C<tokens> and C<morpho>.
718
Akrone2819a12021-10-12 15:52:55 +0200719The inline token data will also be stored in the
720inline structures file (see I<--inline-structures>),
721unless the inline token foundry is prepended
722by an B<!> exclamation mark, indicating that inline
723tokens are stored exclusively in the inline tokens
724file.
725
726Example:
727
Akron6b1f26b2024-09-19 11:35:32 +0200728 tei2korapxml --no-tokenizer --inline-tokens \
729 '!gingko#morpho' < data.i5.xml > korapxml.zip
730
731=item B<--inline-dependencies> <foundry>#[<file>]
732
733Define the foundry and file (without extension)
734to store inline dependency information in.
735Defaults to the layer of C<dependency> and
736will be ignored if not set (which means, dependency
737attributes will be stored in the inline tokens file,
738if not skipped).
739
740The dependency data will also be stored in the
741inline token file (see I<--inline-tokens>),
742unless the inline dependencies foundry is prepended
743by an B<!> exclamation mark, indicating that inline
744dependency data is stored exclusively in the inline
745dependencies file.
746
747Example:
748
749 tei2korapxml --no-tokenizer --inline-dependencies \
750 'gingko#dependency' < data.i5.xml > korapxml.zip
751
Akrone2819a12021-10-12 15:52:55 +0200752
Akrondd0be8f2021-02-18 19:29:41 +0100753=item B<--inline-structures> <foundry>#[<file>]
754
755Define the foundry and file (without extension)
756to store inline structure information in.
Defaults to C<struct> and C<structure>.
758
Akron26a71522021-02-19 10:27:37 +0100759=item B<--base-foundry> <foundry>
760
761Define the base foundry to store newly generated
762token information in.
763Defaults to C<base>.
764
765=item B<--data-file> <file>
766
767Define the file (without extension)
768to store primary data information in.
769Defaults to C<data>.
770
771=item B<--header-file> <file>
772
773Define the file name (without extension)
774to store header information on
775the corpus, document, and text level in.
776Defaults to C<header>.
777
Marc Kupietz985da0c2021-02-15 19:29:50 +0100778=item B<--use-tokenizer-sentence-splits|-s>
779
780Replace existing with, or add new, sentence boundary information
Akron11484782021-11-03 20:12:14 +0100781provided by the tokenizer.
782Currently KorAP-tokenizer and certain external tokenizers support
783these boundaries.
Marc Kupietz985da0c2021-02-15 19:29:50 +0100784
Akron91705d72021-02-19 10:59:45 +0100785=item B<--tokens-file> <file>
786
787Define the file (without extension)
788to store generated token information in
789(either from the KorAP tokenizer or an externally called tokenizer).
790Defaults to C<tokens>.
791
Akron3378dfd2020-08-01 15:01:36 +0200792=item B<--log|-l>
793
794Loglevel for I<Log::Any>. Defaults to C<notice>.
795
Akrond949e182020-02-14 12:23:57 +0100796=back
797
Akronb3649472020-09-29 08:24:46 +0200798=head1 ENVIRONMENT VARIABLES
799
800=over 2
801
802=item B<KORAPXMLTEI_DEBUG>
803
804Activate minimal debugging.
805Defaults to C<false>.
806
Akronb3649472020-09-29 08:24:46 +0200807=back
808
Akrond949e182020-02-14 12:23:57 +0100809=head1 COPYRIGHT AND LICENSE
810
Akron6b1f26b2024-09-19 11:35:32 +0200811Copyright (C) 2021-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100812
813Author: Peter Harders
814
Akronaabd0952020-09-29 07:35:08 +0200815Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100816
817L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
818Corpus Analysis Platform at the
Akrond72baca2021-07-23 13:25:32 +0200819L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akrond949e182020-02-14 12:23:57 +0100820member of the
821L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
822
823This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100824L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100825
826=cut
Akronf8088e62021-02-18 16:18:59 +0100827
828# NOTES
829
Akronf8088e62021-02-18 16:18:59 +0100830## Notes on segfault prevention
831
Akron91577922021-02-19 10:32:54 +0100832binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100833(see notes on 'PerlIO layers' in 'man XML::LibXML'),
834removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
835see in perluniintro: You can switch encodings on an already opened stream by using "binmode()
836see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.