blob: 5855406ec7d1e27dd3c64ade3fcb1ccefdecea91 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;

use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);

use File::Basename qw(dirname);

use Encode qw(decode);

# Prefer the library bundled next to this script over any installed copy.
# This has to happen in a BEGIN block *before* the first project module is
# loaded, because `use` statements are executed at compile time in file order.
use FindBin;
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

# All project modules are loaded only after @INC was adjusted, so that
# KorAP::XML::TEI and its helper classes always come from the same tree.
use KorAP::XML::TEI qw!remove_xml_comments replace_entities increase_auto_textsigle!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
Peter Hardersd892a582020-02-12 15:45:22 +010027
# Version of this script; also compared against --required-version
# and against the version of the KorAP tokenizer module.
our $VERSION = '2.6.2';

# Banner printed together with the help and version output
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Compile-time debug switch, taken from the environment
# (set KORAPXMLTEI_DEBUG=1 for minimal additional debug output)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};

# If set, inline tokens won't be stored in the structure file
# and inline dependencies won't be stored in the tokens file.
# Both flags are switched on by a leading '!' in the respective foundry name.
my ($inline_tokens_exclusive, $inline_deps_exclusive) = (0, 0);
46
Peter Harders6f526a32020-06-29 21:44:41 +020047# Parse options from the command line
# Option values with their defaults
my $auto_textsigle       = '';
my $root_dir             = '.';
my $input_fname          = '';
my $output_fname         = '';
my $inline_tokens        = 'tokens#morpho';
my $inline_structures    = 'struct#structure';
my $skip_inline_tokens   = 0;
my $skip_inline_tags_str = '';
my $base_dir             = 'base';
my $data_file            = 'data';
my $header_file          = 'header';
my $tokens_file          = 'tokens';
my $xmlid_to_textsigle   = '';
my $log_level            = 'notice';

# The deprecated environment variable flips the default of this flag
my $skip_inline_token_annotations = $ENV{KORAPXMLTEI_INLINE} ? 0 : 1;

# Options without a default (undefined unless passed)
my (
  $tokenizer_call,
  $tokenizer_korap,
  $tokenizer_intern,
  $no_tokenizer,
  $use_tokenizer_sentence_splits,
  $inline_dependencies,
  $required_version,
  $progress,
  $stdio
);

# Parse options from the command line
GetOptions(
  'auto-textsigle|A=s'              => \$auto_textsigle,
  'root|r=s'                        => \$root_dir,
  'input|i=s'                       => \$input_fname,
  'output|o=s'                      => \$output_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'no-tokenizer'                    => \$no_tokenizer,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'inline-dependencies=s'           => \$inline_dependencies,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'skip-inline-token-annotations!'  => \$skip_inline_token_annotations,
  'skip-inline-tags=s'              => \$skip_inline_tags_str,
  'base-foundry=s'                  => \$base_dir,
  'data-file=s'                     => \$data_file,
  'header-file=s'                   => \$header_file,
  'tokens-file=s'                   => \$tokens_file,
  'xmlid-to-textsigle|x=s'          => \$xmlid_to_textsigle,
  'log|l=s'                         => \$log_level,
  'required-version|rv=s'           => \$required_version,
  'progress|p'                      => \$progress,

  # A single dash means: read from STDIN
  '' => \$stdio,

  # Print the full manual sections
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },

  # Print the version banner only
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
);
90
Akrond3e1d282021-02-24 14:51:27 +010091
# Establish logger (log output goes to STDERR, encoded as UTF-8)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# Abort if the caller requires a version different from this script's version.
# The captured version is taken from the match result in list context, so a
# failed match yields undef instead of relying on the global $1.
if ($required_version) {
  my ($wanted_version) =
    $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;

  if (!defined $wanted_version || $wanted_version ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};
105
106
# Split the --xmlid-to-textsigle expression "<from-regex>@<to>" into the
# compiled search pattern ($what) and the replacement string ($with).
# Both stay undefined if the option was not passed.
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
  ($what, $with) = split('@', $xmlid_to_textsigle);

  # A missing '@' or an empty replacement leaves $with undefined, which
  # would otherwise trigger "uninitialized" warnings for every single text.
  unless (defined $with) {
    $log->warn(
      "No replacement given in --xmlid-to-textsigle '$xmlid_to_textsigle' " .
      "(expected <from-regex>\@<to>), using an empty replacement"
    );
    $with = '';
  };

  $what = qr!@{[ $what // '' ]}!;
};
112
# The progress bar depends on the optional module Time::Progress.
# Fall back to no progress bar if it can't be loaded.
if ($progress && !eval { require Time::Progress; 1 }) {
  $log->warn('Time::Progress not installed. Progress bar disabled.');
  $progress = 0;
};
122
# Name of the tag (without attributes) that contains the primary text
my $_TEXT_BODY = 'text';

# Set of inline tags to be skipped, built from the
# comma-separated list passed via --skip-inline-tags
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
  %skip_inline_tags =
    map { ($_ => 1) } split(/\s*,\s*/, $skip_inline_tags_str);
};
134
# Choose the tokenizer that creates the base token layer.
# $ext_tok stays undefined if neither an external call nor the KorAP
# tokenizer was requested.
my $ext_tok;

# External tokenization
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The KorAP tokenizer is loaded on demand. Report a load failure as such
  # instead of running into a misleading version mismatch below.
  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
    $log->error('Unable to load KorAP-Tokenizer: ' . ($@ || 'unknown error'));
    exit(1);
  };

  # Tokenizer and script are expected to share the same version
  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // '';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};

# Sentences are taken from the tokenizer, so inline <s> tags are skipped
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};
Akron0c41ab32020-09-29 07:33:33 +0200167
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;


# The --inline-* options have the form "<foundry>#[<file>]".
# The file part is optional: if it is missing OR empty (e.g. "struct#"),
# the respective default file name is used.

# Name of the directory and the file containing all inline structure
# information except for inline token information
my ($_structure_dir, $_structure_file) = split '#', $inline_structures;
$_structure_dir //= '';
$_structure_file = 'structure' unless length($_structure_file // '');

# Name of the directory and the file containing all inline token information
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens;
$_tokens_dir //= '';
$_tokens_file = 'morpho' unless length($_tokens_file // '');

# A leading '!' marks the tokens foundry as exclusive
if (index($_tokens_dir, '!') == 0) {
  $_tokens_dir = substr($_tokens_dir, 1);
  $inline_tokens_exclusive = 1;
};


# Name of the directory and the file containing inline dependency
# information (only set if --inline-dependencies was passed)
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = split '#', $inline_dependencies;
  $_dep_dir //= '';
  $_dep_file = 'dependency' unless length($_dep_file // '');

  # From here on the option only serves as a boolean flag
  $inline_dependencies = 1;

  # A leading '!' marks the dependency foundry as exclusive
  if ($_dep_dir && index($_dep_dir, '!') == 0) {
    $_dep_dir = substr($_dep_dir, 1);
    $inline_deps_exclusive = 1;
  };
};
197
198
# Initialize zipper with the root directory for output
# and the name of the output zip file
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);

# State shared between the texts of the input:
my $dir = '';               # text directory (below $root_dir)
my $text_id_esc;            # escaped version of the text id
my $input_enc = 'UTF-8';    # default encoding of the text
my $text_line = 0;          # text line (needed for whitespace handling)


# Input file handle
my $input_fh;

# Single dash was set: read from STDIN
if ($stdio) {
  $input_fh = *STDIN;
}

# Neither an input flag nor a trailing argument: no input to process
elsif ($input_fname eq '' && !@ARGV) {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
  exit;
}

# Input flag was passed, or the first trailing argument names the input
else {
  $input_fname = shift @ARGV if $input_fname eq '';

  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
};

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;


# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
  $skip_inline_tokens,
  \%skip_inline_tags,
  $inline_tokens_exclusive,
  $inline_dependencies
);
254
# Main conversion loop.
#
# Processes the already opened input handle and afterwards every further
# input file passed as a trailing argument. The input is read line by line:
#   - an XML declaration sets the input encoding,
#   - a <TEI xml:id="..."> element or an (ids|tei)Header sets the current
#     text directory and text id,
#   - everything between the opening and the closing text body tag is
#     collected, parsed for inline markup, tokenized and written to the zip.
INPUT:
while (1) {

  # Progress bar (only if requested and the input is a non-empty file)
  my $p;
  if ($progress && $input_fname ne '') {
    my $file_size = -s $input_fname;
    if ($file_size) {
      $p = Time::Progress->new(min => 0, max => $file_size);
      $log->notice("Reading input document $input_fname (Size: $file_size bytes)");
    }
  } elsif ($input_fname ne '') {
    $log->notice("Reading input document $input_fname");
  };

  my $i = 0;
  MAIN:
  while (<$input_fh>) {

    # Update the progress bar every 500 lines
    if ($p && ($i++ % 500 == 0)) {
      print STDERR $p->report("\r%20b %p ETA: %E", tell($input_fh));
    };

    # remove HTML (multi-line) comments (<!--...-->)
    $_ = remove_xml_comments($input_fh, $_);

    # Set input encoding
    # (the value is everything up to the matching closing quote)
    if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
      $input_enc = $2;
      next;
    };

    $_ = decode($input_enc, $_);
    $_ = replace_entities($_);

    # Start of text body
    if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
      my $suffix = $2;

      if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
        die $log->fatal("input line number $.: " .
                          "line with opening text-body tag '${_TEXT_BODY}' " .
                          "contains additional information ... => Aborting (line=$_)");
      };

      # Text body data extracted from input document ($input_fh),
      # further processed by XML::LibXML::Reader
      my $text_buffer = '';

      # Iterate over all lines in the text body
      while (<$input_fh>) {

        $_ = remove_xml_comments($input_fh, $_);
        $_ = decode($input_enc, $_);
        $_ = replace_entities($_);

        # End of text body
        if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

          # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

          # The closing tag has to stand alone in its line
          if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
            die $log->fatal("input line number $.: " .
                              "line with closing text-body tag '${_TEXT_BODY}'" .
                              " contains additional information ... => Aborting (line=$_)");
          };

          # No header or TEI element established a text directory
          if ($dir eq '') {
            $log->warn(
              "Maybe empty textSigle => skipping this text ...\n" .
                'data=' . substr($inline->data->data, 0, 200)
              );
            next MAIN;
          };

          # Parse inline structure
          $inline->parse($text_id_esc, \$text_buffer);

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
          };

          my $data = $inline->data;

          # Write data.xml
          $data->to_zip(
            $zipper->new_stream("$dir/${data_file}.xml"),
            $text_id_esc
          );

          # Tokenize with external tokenizer
          if ($ext_tok) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
              $text_id_esc
            );

            if ($use_tokenizer_sentence_splits) {
              $ext_tok->sentencize_from_previous_input($inline->structures);
            };
          };

          # Tokenize with internal tokenizer
          if ($tokenizer_intern) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            )->reset;

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            )->reset;
          };

          # ~ write structures ~
          unless ($inline->structures->empty) {
            $inline->structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          unless ($skip_inline_tokens || $inline->tokens->empty) {
            $inline->tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
              $text_id_esc,
              # Either 0 = tokens without inline or
              #        1 = tokens with inline
              # (4 is passed instead of 1 if dependencies are stored
              #  exclusively in the dependencies file)
              ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
            );
          };

          # ~ write dependencies ~
          unless ($inline->dependencies->empty) {
            $inline->dependencies->to_zip(
              $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
              $text_id_esc,
              3 # = dependency serialization
            );
          };


          # reinit.
          $dir = '';

          next MAIN;
        };


        # ~ whitespace handling ~

        # Fix whitespaces (see notes on whitespace fixing)

        # TODO:
        #   Maybe it's best, to keep the stripping of whitespace and
        #   to just remove the if-clause and to insert a blank by default
        #   (with possibly an option on how newlines in primary text should
        #   be handled (stripped or replaced by a whitespace)).

        # Remove consecutive whitespace at beginning and end (mostly one newline)
        s/^\s+//;
        s/\s+$//;

        # NOTE:
        #   this is only relevant, if a text consists of more than one line

        # TODO:
        #   find a better solution, or create a warning, if a text has more
        #   than one line ($text_line > 1)

        # TODO:
        #   do testing with 2 different corpora
        #   (one with only one-line texts, the other with several lines per text)

        # line contains at least one non-tag character
        if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

          # Increment counter for text lines
          $text_line++;

          # insert blank before 1st character
          # (for 2nd line and consecutive lines)
          $_ = ' ' . $_ if $text_line > 1;
        }

        # add line to buffer
        $text_buffer .= $_;
      };
    }

    # TEI element with an xml:id that serves as text id
    elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
      my $leadin = $1;
      my $id = $3;
      my $sigle = $3;

      if ($what) {

        # Work on a copy of the id, so that $_ keeps the input line
        # for the error messages below.
        my $converted = $id;

        # NOTE(review): pattern and replacement stem from the command line
        #   and are evaluated as code, because the replacement may refer to
        #   capture groups, e.g.
        #   s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
        #   Never feed untrusted input into --xmlid-to-textsigle.
        eval "\$converted =~ s|$what|$with|";
        if ($@) {
          $log->warn("Unable to apply --xmlid-to-textsigle to `$id': $@");
        };

        $sigle = $converted;
        $log->debug("Converted text id `$id' to sigle `$sigle'");
      };
      $sigle =~ s/\./-/g;

      my @parts = split(/[\/_]/, $sigle);
      if (@parts != 3) {
        die $log->fatal(
          "input line number $.: " .
            "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
            "=> Aborting (line=$_)");
      };

      $dir = join("/", @parts);
      $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
      $log->notice("$0: text_id=$text_id_esc");

      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      };
    }

    # Start of header section
    elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
      my $content = "$2\n";

      if ($1 !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      };

      # Parse header
      my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
      if ($auto_textsigle) {
        $auto_textsigle = increase_auto_textsigle($auto_textsigle);
        $log->debug("Auto-incremented text sigle to $auto_textsigle");
      };

      # Header was parseable
      if ($header) {

        # Write header to zip
        my $file = $header->dir . '/' . $header_file . '.xml';

        $log->debug("Writing file $file") if DEBUG;

        $header->to_zip($zipper->new_stream($file));

        # Header is for text level
        if ($header->type eq 'text') {

          # Remember dir and sigles
          $dir = $header->dir;
          $text_id_esc = $header->id_esc;

          # log output for seeing progression
          $log->notice("$0: text_id=$text_id_esc");

          # Reset counter for text lines
          # (needed for whitespace handling)
          $text_line = 0;
        };
      };
    };
  };
  $text_id_esc = $auto_textsigle if ($auto_textsigle);

  # Final progress report for this input
  if ($p) {
    print STDERR $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
  };

  # No further input files passed as trailing arguments
  last INPUT unless @ARGV;

  # Open the next input file the same way as the first one:
  # - the explicit close resets the line counter $. for error messages,
  # - a file that can't be opened is fatal instead of silently ending
  #   the conversion,
  # - binmode is applied to every input, not only to the first one.
  $input_fname = shift @ARGV;
  close $input_fh;
  unless (open($input_fh, '<', $input_fname)) {
    die $log->fatal("File '$input_fname' could not be opened.");
  };
  binmode $input_fh;
};

$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;
538
Peter Harders6f526a32020-06-29 21:44:41 +0200539
Akrond949e182020-02-14 12:23:57 +0100540__END__
541
542=pod
543
544=encoding utf8
545
546=head1 NAME
547
548tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
549
550=head1 SYNOPSIS
551
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200552 cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip
553 tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akrond949e182020-02-14 12:23:57 +0100554
555=head1 DESCRIPTION
556
Akronee434b12020-07-08 12:53:01 +0200557C<tei2korapxml> is a script to convert TEI P5 and
Akrond72baca2021-07-23 13:25:32 +0200558L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akronee434b12020-07-08 12:53:01 +0200559based documents to the
560L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders6f526a32020-06-29 21:44:41 +0200561
Akrond949e182020-02-14 12:23:57 +0100562This program is usually called from inside another script.
563
Akronee434b12020-07-08 12:53:01 +0200564=head1 FORMATS
565
566=head2 Input restrictions
567
568=over 2
569
570=item
571
Akronee434b12020-07-08 12:53:01 +0200572TEI P5 formatted input with certain restrictions:
573
574=over 4
575
576=item
577
Akrone48bec42023-01-05 12:18:45 +0100578B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
Akronee434b12020-07-08 12:53:01 +0200580
581=item
582
583B<optional>: corp-header with integrated corpsigle,
584doc-header with integrated docsigle
585
586=back
587
588=item
589
Akron0c41ab32020-09-29 07:33:33 +0200590All tokens inside the primary text may not be
newline separated, because newlines are removed
Akron0c41ab32020-09-29 07:33:33 +0200592(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akronee434b12020-07-08 12:53:01 +0200593into blanks between 2 tokens could lead to additional blanks,
594where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
Akron8a0c4bf2021-03-16 16:51:21 +0100596(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akronee434b12020-07-08 12:53:01 +0200597
Akron940ca6f2021-10-11 12:38:39 +0200598=item
599
600Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
601need to be defined in the same line as the header tag.
602
Akronee434b12020-07-08 12:53:01 +0200603=back
604
605=head2 Notes on the output
606
607=over 2
608
609=item
610
611zip file output (default on C<stdout>) with utf8 encoded entries
612(which together form the KorAP-XML format)
613
614=back
615
Akrond949e182020-02-14 12:23:57 +0100616=head1 INSTALLATION
617
Akrond26319b2023-01-12 15:34:41 +0100618C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietze83a4e92021-03-16 20:51:26 +0100619When these requirements are met, the preferred way to install the script is
Akrond949e182020-02-14 12:23:57 +0100620to use L<cpanm|App::cpanminus>.
621
622 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
623
624In case everything went well, the C<tei2korapxml> tool will
625be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200626
Akrond949e182020-02-14 12:23:57 +0100627Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
628
629=head1 OPTIONS
630
631=over 2
632
Akrona2cb2812021-10-30 10:29:08 +0200633=item B<--input|-i>
634
635The input file to process. If no specific input is defined and a single
636dash C<-> is passed as an argument, data is read from C<STDIN>.
637
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200638Instead of using C<-i> input files can also be defined as trailing arguments
639to the command:
640
641 tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
642
Akron132bdeb2024-06-06 14:28:56 +0200643=item B<--output|-o>
644
645The output zip file to be created. If no specific output is defined,
646data is written to C<STDOUT>.
Akrona2cb2812021-10-30 10:29:08 +0200647
Akron4e603a52020-07-27 14:23:49 +0200648=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100649
Akron4e603a52020-07-27 14:23:49 +0200650The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100651
652=item B<--help|-h>
653
654Print help information.
655
656=item B<--version|-v>
657
658Print version information.
659
Akrone48bec42023-01-05 12:18:45 +0100660=item B<--tokenizer-korap|-tk>
Akron2520a342022-03-29 18:18:05 +0200661
Akrone48bec42023-01-05 12:18:45 +0100662Use the standard KorAP/DeReKo tokenizer.
663
664=item B<--tokenizer-internal|-ti>
665
666Tokenize the data using two embedded tokenizers,
667that will take an I<aggressive> and a I<conservative>
668approach.
Akron2520a342022-03-29 18:18:05 +0200669
Akron4e603a52020-07-27 14:23:49 +0200670=item B<--tokenizer-call|-tc>
671
672Call an external tokenizer process, that will tokenize
Akron11484782021-11-03 20:12:14 +0100673from STDIN and outputs the offsets of all tokens.
674
675Texts are separated using C<\x04\n>. The external process
676should add a new line per text.
677
678If the L</--use-tokenizer-sentence-splits> option is activated,
679sentences are marked by offset as well in new lines.
680
681To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
683
684 $ cat corpus.i5.xml | tei2korapxml -s \
685 $ -tc 'datok tokenize \
686 $ -t ./tokenizer.matok \
687 $ -p --newline-after-eot --no-sentences \
688 $ --no-tokens --sentence-positions -' - \
689 $ > corpus.korapxml.zip
Akron4e603a52020-07-27 14:23:49 +0200690
Akronb93fabb2023-01-13 12:05:44 +0100691=item B<--no-tokenizer>
692
693Boolean flag indicating that no tokenizer should be used.
694This is meant to ensure that by default a final token layer always
695exists.
696If a separate tokenizer is chosen, this flag is ignored.
697
Akron75d63142021-02-23 18:40:56 +0100698=item B<--skip-inline-tokens>
699
700Boolean flag indicating that inline tokens should not
701be processed. Defaults to false (meaning inline tokens will be processed).
702
Akron692d17d2021-03-05 13:21:03 +0100703=item B<--skip-inline-token-annotations>
704
705Boolean flag indicating that inline token annotations should not
706be processed. Defaults to true (meaning inline token annotations
Akron6b1f26b2024-09-19 11:35:32 +0200707won't be processed). Can be negated with
708C<--no-skip-inline-token-annotations>.
Akron692d17d2021-03-05 13:21:03 +0100709
Akronca70a1d2021-02-25 16:21:31 +0100710=item B<--skip-inline-tags> <tags>
Akron54c3ff12021-02-25 11:33:37 +0100711
712Expects a comma-separated list of tags to be ignored when the structure
713is parsed. Content of these tags however will be processed.
714
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +0200715=item B<--auto-textsigle> <textsigle>
716
Expects a text sigle that serves as fallback if no text sigles
718are given in the input data.
719The auto text sigle will be incremented for each text processed.
720
721Example:
722
723 tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
724 < data.i5.xml > korapxml.zip
725
Marc Kupietza671ae52022-12-22 16:28:14 +0100726=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
727
Akrone48bec42023-01-05 12:18:45 +0100728Expects a regular replacement expression (separated by B<@> between the
Marc Kupietza671ae52022-12-22 16:28:14 +0100729search and the replacement) to convert text id attributes to text sigles
730with three parts (separated by B</>).
731
732Example:
733
734 tei2korapxml \
735 --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
736 -tk - < t/data/icc_german_sample.p5.xml
737
Akrone48bec42023-01-05 12:18:45 +0100738Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
739sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietza671ae52022-12-22 16:28:14 +0100740
Akron1a5271a2021-02-18 13:18:15 +0100741=item B<--inline-tokens> <foundry>#[<file>]
742
743Define the foundry and file (without extension)
744to store inline token information in.
Akron8a0c4bf2021-03-16 16:51:21 +0100745Unless C<--skip-inline-token-annotations> is set,
746this will contain annotations as well.
Akron1a5271a2021-02-18 13:18:15 +0100747Defaults to C<tokens> and C<morpho>.
748
Akrone2819a12021-10-12 15:52:55 +0200749The inline token data will also be stored in the
750inline structures file (see I<--inline-structures>),
751unless the inline token foundry is prepended
752by an B<!> exclamation mark, indicating that inline
753tokens are stored exclusively in the inline tokens
754file.
755
756Example:
757
Akron6b1f26b2024-09-19 11:35:32 +0200758 tei2korapxml --no-tokenizer --inline-tokens \
759 '!gingko#morpho' < data.i5.xml > korapxml.zip
760
761=item B<--inline-dependencies> <foundry>#[<file>]
762
763Define the foundry and file (without extension)
764to store inline dependency information in.
765Defaults to the layer of C<dependency> and
766will be ignored if not set (which means that dependency
767attributes will be stored in the inline tokens file,
768if not skipped).
769
770The dependency data will also be stored in the
771inline token file (see I<--inline-tokens>),
772unless the inline dependencies foundry is prepended
773by an B<!> exclamation mark, indicating that inline
774dependency data is stored exclusively in the inline
775dependencies file.
776
777Example:
778
779 tei2korapxml --no-tokenizer --inline-dependencies \
780 'gingko#dependency' < data.i5.xml > korapxml.zip
781
Akrone2819a12021-10-12 15:52:55 +0200782
Akrondd0be8f2021-02-18 19:29:41 +0100783=item B<--inline-structures> <foundry>#[<file>]
784
785Define the foundry and file (without extension)
786to store inline structure information in.
787Defaults to C<struct> and C<structures>.
788
Akron26a71522021-02-19 10:27:37 +0100789=item B<--base-foundry> <foundry>
790
791Define the base foundry to store newly generated
792token information in.
793Defaults to C<base>.
794
795=item B<--data-file> <file>
796
797Define the file (without extension)
798to store primary data information in.
799Defaults to C<data>.
800
801=item B<--header-file> <file>
802
803Define the file name (without extension)
804to store header information on
805the corpus, document, and text level in.
806Defaults to C<header>.
807
Marc Kupietz985da0c2021-02-15 19:29:50 +0100808=item B<--use-tokenizer-sentence-splits|-s>
809
810Replace existing sentence boundary information with, or add new,
Akron11484782021-11-03 20:12:14 +0100811sentence boundary information provided by the tokenizer.
812Currently KorAP-tokenizer and certain external tokenizers support
813these boundaries.
Marc Kupietz985da0c2021-02-15 19:29:50 +0100814
Akron91705d72021-02-19 10:59:45 +0100815=item B<--tokens-file> <file>
816
817Define the file (without extension)
818to store generated token information in
819(either from the KorAP tokenizer or an externally called tokenizer).
820Defaults to C<tokens>.
821
Akron3378dfd2020-08-01 15:01:36 +0200822=item B<--log|-l>
823
824Loglevel for I<Log::Any>. Defaults to C<notice>.
825
Akrond949e182020-02-14 12:23:57 +0100826=back
827
Akronb3649472020-09-29 08:24:46 +0200828=head1 ENVIRONMENT VARIABLES
829
830=over 2
831
832=item B<KORAPXMLTEI_DEBUG>
833
834Activate minimal debugging.
835Defaults to C<false>.
836
Marc Kupietzd254f5c2025-04-16 10:37:08 +0200837=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
838
839Set the heap size for the tokenizer process.
840Defaults to C<512m>.
841
Akronb3649472020-09-29 08:24:46 +0200842=back
843
Akrond949e182020-02-14 12:23:57 +0100844=head1 COPYRIGHT AND LICENSE
845
Marc Kupietzb6fd6bc2025-04-16 12:47:26 +0200846Copyright (C) 2021-2025, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100847
848Author: Peter Harders
849
Akronaabd0952020-09-29 07:35:08 +0200850Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100851
852L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
853Corpus Analysis Platform at the
Akrond72baca2021-07-23 13:25:32 +0200854L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akrond949e182020-02-14 12:23:57 +0100855member of the
856L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
857
858This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100859L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100860
861=cut
Akronf8088e62021-02-18 16:18:59 +0100862
863# NOTES
864
Akronf8088e62021-02-18 16:18:59 +0100865## Notes on segfault prevention
866
Akron91577922021-02-19 10:32:54 +0100867Calling binmode on the input filehandle prevents 'XML::LibXML::Reader' from segfaulting inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100868(see the notes on 'PerlIO layers' in 'man XML::LibXML').
869Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on the input is more granular.
870See perluniintro: You can switch encodings on an already opened stream by using "binmode()".
871See perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.