blob: 953fc4452872da8874afa92ee3a5aa59cdaade21 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);

use Encode qw(decode);

use FindBin;

# Make the library directory shipped next to this script available
# before any KorAP::XML::TEI module is compiled.
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

# All helpers of KorAP::XML::TEI are imported in a single statement that
# is placed *after* the BEGIN block above. Importing the module earlier
# would resolve it against @INC without the local lib directory and could
# therefore fail or pick up a different, already installed version.
use KorAP::XML::TEI qw!remove_xml_comments replace_entities increase_auto_textsigle!;
Akron8b511f92020-07-09 17:28:08 +020021use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020022use KorAP::XML::TEI::Tokenizer::Conservative;
23use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron85717512020-07-08 11:19:19 +020024use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020025use KorAP::XML::TEI::Header;
Akroneb12e232021-02-25 13:49:50 +010026use KorAP::XML::TEI::Inline;
Peter Hardersd892a582020-02-12 15:45:22 +010027
# Version of the script and the banner shown by --help and --version
our $VERSION     = '2.7.1';
our $VERSION_MSG = join('', "\n", 'tei2korapxml - v', $VERSION, "\n");

# DEBUG is taken from the environment variable KORAPXMLTEI_DEBUG and
# defaults to 0. Set it to 1 for a little more debug output
# (no need to be parametrized).
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# The environment switch is still evaluated when the options are parsed,
# but the command line flag is the supported way.
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};
40
# Set to 1 later on, if inline tokens should not be stored
# in the structure file
my $inline_tokens_exclusive = 0;

# Set to 1 later on, if the inline dependency foundry is prefixed
# with an exclamation mark
my $inline_deps_exclusive = 0;

# Option values with their defaults
my $auto_textsigle       = '';
my $root_dir             = '.';
my $input_fname          = '';
my $output_fname         = '';
my $inline_tokens        = 'tokens#morpho';
my $inline_structures    = 'struct#structure';
my $skip_inline_tokens   = 0;
my $skip_inline_tags_str = '';
my $base_dir             = 'base';
my $data_file            = 'data';
my $header_file          = 'header';
my $tokens_file          = 'tokens';
my $xmlid_to_textsigle   = '';
my $log_level            = 'notice';

# Inline token annotations are skipped by default, unless the
# deprecated environment variable KORAPXMLTEI_INLINE is set
my $skip_inline_token_annotations = $ENV{KORAPXMLTEI_INLINE} ? 0 : 1;

# Option values without a default
my (
  $tokenizer_call,
  $tokenizer_korap,
  $tokenizer_intern,
  $no_tokenizer,
  $use_tokenizer_sentence_splits,
  $inline_dependencies,
  $required_version,
  $progress,
  $stdio
);

# Print the full usage information
my $show_help = sub {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  )
};

# Print the version information
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'auto-textsigle|A=s'              => \$auto_textsigle,
  'root|r=s'                        => \$root_dir,
  'input|i=s'                       => \$input_fname,
  'output|o=s'                      => \$output_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'no-tokenizer'                    => \$no_tokenizer,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'inline-dependencies=s'           => \$inline_dependencies,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'skip-inline-token-annotations!'  => \$skip_inline_token_annotations,
  'skip-inline-tags=s'              => \$skip_inline_tags_str,
  'base-foundry=s'                  => \$base_dir,
  'data-file=s'                     => \$data_file,
  'header-file=s'                   => \$header_file,
  'tokens-file=s'                   => \$tokens_file,
  'xmlid-to-textsigle|x=s'          => \$xmlid_to_textsigle,
  'log|l=s'                         => \$log_level,
  'required-version|rv=s'           => \$required_version,
  'progress|p'                      => \$progress,

  # A single dash on the command line
  ''                                => \$stdio,

  'help|h'                          => $show_help,
  'version|v'                       => $show_version
);
90
Akrond3e1d282021-02-24 14:51:27 +010091
# Establish logger (log output is written UTF-8 encoded to STDERR)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# Abort, if a certain version of the script is required
# and the running version differs
if ($required_version) {

  # Take the capture directly from the match, so a failed match yields
  # undef instead of relying on the state of $1
  my ($wanted) = $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;

  if (!defined $wanted || $wanted ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};
105
106
# Search pattern ($what) and replacement ($with) to convert xml:id
# attributes to text sigles, passed as "<from-regex>@<to>"
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
  ($what, $with) = split('@', $xmlid_to_textsigle);

  # A missing part (e.g. no '@' given) is treated as an empty string,
  # so neither the pattern compilation nor the later substitution has
  # to deal with undefined values
  $what //= '';
  $with //= '';

  $what = qr!$what!;
};
112
# The progress bar relies on the optional module Time::Progress.
# If it can't be loaded, the progress bar is switched off with a warning.
if ($progress && !eval { require Time::Progress; 1 }) {
  $log->warn('Time::Progress not installed. Progress bar disabled.');
  $progress = 0;
};
122
# Name of the tag (without attributes) that contains the primary text
my $_TEXT_BODY = 'text';

# Lookup table of inline tags to skip, built from the
# comma-separated list passed by --skip-inline-tags
my %skip_inline_tags = $skip_inline_tags_str
  ? map { ($_ => 1) } split(/\s*,\s*/, $skip_inline_tags_str)
  : ();
134
# Tokenizer used in addition to (or instead of) the internal tokenizers
my $ext_tok;

# External tokenization by calling a separate process
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The KorAP tokenizer is loaded on demand. A failure to load it is
  # reported as such instead of surfacing as a version mismatch with
  # an undefined version.
  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
    $log->error('KorAP-Tokenizer could not be loaded: ' . ($@ || 'unknown error'));
    exit(1);
  };

  # The tokenizer has to be released in the same version as this script
  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // '';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# Neither an internal tokenizer was chosen,
# nor was tokenization switched off explicitly
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};

# If sentence splits are requested from the tokenizer,
# inline <s> tags are skipped
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};
Akron0c41ab32020-09-29 07:33:33 +0200167
# Internal tokenization
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;


# Directory and file name for the inline structure information
# (everything except for inline tokens); the appended '#structure'
# supplies the file name, if only a directory was passed
my ($_structure_dir, $_structure_file) =
  split(/#/, $inline_structures . '#structure');

# Directory and file name for the inline token information;
# the appended '#morpho' supplies the file name, if only
# a directory was passed
my ($_tokens_dir, $_tokens_file) =
  split(/#/, $inline_tokens . '#morpho');

# A leading exclamation mark is removed from the directory name
# and marks the inline tokens as exclusive
if (substr($_tokens_dir, 0, 1) eq '!') {
  $_tokens_dir = substr($_tokens_dir, 1);
  $inline_tokens_exclusive = 1;
};


# Directory and file name for the inline dependency information
# (only if --inline-dependencies was passed)
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) =
    split(/#/, $inline_dependencies . '#dependency');

  # From here on the option only serves as a flag
  $inline_dependencies = 1;

  # A leading exclamation mark is removed from the directory name
  # and marks the inline dependencies as exclusive
  if ($_dep_dir && substr($_dep_dir, 0, 1) eq '!') {
    $_dep_dir = substr($_dep_dir, 1);
    $inline_deps_exclusive = 1;
  };
};
197
198
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);

# State shared by the processing loop:
#   $dir       - text directory (below $root_dir)
#   $input_enc - encoding of the input (default, unless declared)
#   $text_line - counter of text lines (needed for whitespace handling)
my ($dir, $input_enc, $text_line) = ('', 'UTF-8', 0);

# Escaped version of text id
my $text_id_esc;


# Input file handle
my $input_fh;

# A single dash was passed: read from STDIN
if ($stdio) {
  $input_fh = *STDIN;
}

# An input file was passed, either by --input
# or as a trailing argument
elsif ($input_fname ne '' || @ARGV) {
  $input_fname = shift @ARGV if $input_fname eq '';

  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# No input to process: show a short usage information
else {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
  exit;
};

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;


# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
  $skip_inline_tokens,
  \%skip_inline_tags,
  $inline_tokens_exclusive,
  $inline_dependencies
);
254
# Process the current input and afterwards all further input files
# passed as trailing arguments. Each input is read line by line:
# headers are written to the zip as they occur, the text body is
# collected in a buffer and converted at its closing tag.
do {

  # Progress reporter (only if requested and the file size is known)
  my $p;
  if ($progress && $input_fname ne '') {
    my $file_size = -s $input_fname;
    if ($file_size) {
      $p = Time::Progress->new(min => 0, max => $file_size);
      $log->notice("Reading input document $input_fname (Size: $file_size bytes)");
    }
  }
  elsif ($input_fname ne '') {
    $log->notice("Reading input document $input_fname");
  };

  # Counter of lines read in the outer loop
  # (the progress report is updated every 500 lines)
  my $i = 0;

 MAIN:
  while (<$input_fh>) {

    if ($p && ($i++ % 500 == 0)) {
      print STDERR $p->report("\r%20b %p ETA: %E", tell($input_fh));
    };

    # remove HTML (multi-line) comments (<!--...-->)
    $_ = remove_xml_comments($input_fh, $_);

    # Set input encoding.
    # The value is everything up to the matching closing quote.
    # (A backreference can't be used inside of a character class,
    # where \1 denotes the character with the code 1.)
    if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/s) {
      $input_enc = $2;
      next;
    };

    $_ = decode($input_enc, $_);
    $_ = replace_entities($_);

    # Start of text body
    if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
      my ($prefix, $suffix) = ($1, $2);

      # The opening tag has to be the only content of the line
      if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
        die $log->fatal("input line number $.: " .
          "line with opening text-body tag '${_TEXT_BODY}' " .
          "contains additional information ... => Aborting (line=$_)");
      };

      # Text body data extracted from input document ($input_fh),
      # further processed by the inline parser
      my $text_buffer = '';

      # Iterate over all lines in the text body
      while (<$input_fh>) {

        $_ = remove_xml_comments($input_fh, $_);
        $_ = decode($input_enc, $_);
        $_ = replace_entities($_);

        # End of text body
        if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

          # write data.xml, structure.xml and evtl. morpho.xml
          # and/or tokenization files

          my $before = substr($_, 0, $pos);
          my $after  = substr($_, length("</$_TEXT_BODY>") + $pos);

          # Besides the closing tag, the line may only contain
          # further tags (like </body>) in front of it
          my $before_check = $before;
          $before_check =~ s/<[^>]+>//g; # strip XML tags like </body>
          if (($before_check . $after) !~ /^\s*$/) {
            die $log->fatal("input line number $.: " .
              "line with closing text-body tag '${_TEXT_BODY}'" .
              " contains additional information ... => Aborting (line=$_)");
          };

          # Add any remaining content before </text> (e.g. </body>) to the buffer
          $before =~ s/^\s+//;
          $before =~ s/\s+$//;
          $text_buffer .= $before if $before ne '';

          # No text directory is known for this text body
          if ($dir eq '') {
            $log->warn(
              "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($inline->data->data, 0, 200)
            );
            next MAIN;
          };

          # Parse inline structure
          $inline->parse($text_id_esc, \$text_buffer);

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
          };

          my $data = $inline->data;

          # Write data.xml
          $data->to_zip(
            $zipper->new_stream("$dir/${data_file}.xml"),
            $text_id_esc
          );

          # Tokenize with external tokenizer
          if ($ext_tok) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
              $text_id_esc
            );

            if ($use_tokenizer_sentence_splits) {
              $ext_tok->sentencize_from_previous_input($inline->structures);
            };
          };

          # Tokenize with internal tokenizer
          if ($tokenizer_intern) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            )->reset;

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            )->reset;
          };

          # ~ write structures ~
          unless ($inline->structures->empty) {
            $inline->structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          unless ($skip_inline_tokens || $inline->tokens->empty) {
            $inline->tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
              $text_id_esc,
              # Either 0 = tokens without inline or
              #        1 = tokens with inline or
              #        4 = passed instead of 1, if the inline dependency
              #            foundry was marked as exclusive
              #            (NOTE(review): the meaning of 4 is defined by
              #            the serializer - confirm there)
              ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
            );
          };

          # ~ write dependencies ~
          unless ($inline->dependencies->empty) {
            $inline->dependencies->to_zip(
              $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
              $text_id_esc,
              3 # = dependency serialization
            );
          };

          # reinit.
          $dir = '';

          next MAIN;
        };


        # ~ whitespace handling ~

        # Fix whitespaces (see notes on whitespace fixing)

        # TODO:
        #   Maybe it's best, to keep the stripping of whitespace and
        #   to just remove the if-clause and to insert a blank by default
        #   (with possibly an option on how newlines in primary text should
        #   be handled (stripped or replaced by a whitespace)).

        # Remove consecutive whitespace at beginning and end (mostly one newline)
        s/^\s+//;
        s/\s+$//;

        # NOTE:
        #   this is only relevant, if a text consists of more than one line

        # TODO:
        #   find a better solution, or create a warning, if a text has more
        #   than one line ($text_line > 1)

        # TODO:
        #   do testing with 2 different corpora
        #   (one with only one-line texts, the other with several lines per text)

        # Check if the buffer currently ends inside an open XML tag
        # (last '<' is after last '>'), meaning this line is a continuation of
        # a multi-line element (e.g. attributes split across lines like <ref>).
        # A space must be prepended to avoid "attributes construct error" in the
        # XML parser when two attribute tokens are concatenated without separator.
        my $in_open_tag = ($text_buffer ne '' &&
                           rindex($text_buffer, '<') > rindex($text_buffer, '>'));

        # line contains at least one non-tag character
        if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

          # Increment counter for text lines
          $text_line++;

          # insert blank before 1st character
          # (for 2nd line and consecutive lines, or when continuing an open tag)
          $_ = ' ' . $_ if $text_line > 1 || $in_open_tag;
        }

        # Line is purely within an open tag (attribute continuation):
        # prepend a space so attributes are properly separated.
        elsif ($in_open_tag) {
          $_ = ' ' . $_;
        }

        # add line to buffer
        $text_buffer .= $_;
      };
    }

    # Start of a TEI element with an xml:id, that serves as text id
    elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
      my $leadin = $1;
      my $id     = $3;
      my $sigle  = $id;

      # Convert the id as requested by --xmlid-to-textsigle
      if ($what) {

        # The substitution operates on $sigle only (aliased as $_),
        # so the current input line stays available in $_ for the
        # diagnostics below.
        #
        # NOTE(review): This is a string eval on a value passed on the
        # command line. It is kept, as the replacement may refer to
        # captures ($1, $2, ...), but it must never be fed from an
        # untrusted source.
        for ($sigle) {
          eval "s|$what|$with|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
          $log->warn("Unable to convert text id `$id': $@") if $@;
        };

        $log->debug("Converted text id `$id' to sigle `$sigle'");
      };
      $sigle =~ s/\./-/g;

      my @parts = split(/[\/_]/, $sigle);
      if (@parts != 3) {
        die $log->fatal(
          "input line number $.: " .
          "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
          "=> Aborting (line=$_)");
      };

      $dir = join("/", @parts);
      $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
      $log->notice("$0: text_id=$text_id_esc");

      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
      };
    }

    # Start of header section
    elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
      my ($leadin, $content) = ($1, "$2\n");

      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
      };

      # Parse header (the remaining lines of the header
      # are read from the input handle)
      my $header = KorAP::XML::TEI::Header->new(
        $content,
        $input_enc,
        $text_id_esc // $auto_textsigle
      )->parse($input_fh);

      if ($auto_textsigle) {
        $auto_textsigle = increase_auto_textsigle($auto_textsigle);
        $log->debug("Auto-incremented text sigle to $auto_textsigle");
      };

      # Header was parseable
      if ($header) {

        # Write header to zip
        my $file = $header->dir . '/' . $header_file . '.xml';

        $log->debug("Writing file $file") if DEBUG;

        $header->to_zip($zipper->new_stream($file));

        # Header is for text level
        if ($header->type eq 'text') {

          # Remember dir and sigles
          $dir = $header->dir;
          $text_id_esc = $header->id_esc;

          # log output for seeing progression
          $log->notice("$0: text_id=$text_id_esc");

          # Reset counter for text lines
          # (needed for whitespace handling)
          $text_line = 0;
        };
      };
    };
  };

  $text_id_esc = $auto_textsigle if ($auto_textsigle);

  # Final progress report for this input
  if ($p) {
    print STDERR $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
  };

  # Continue with the next input file passed as a trailing argument.
  # A file that can't be opened is fatal (as for the first input file)
  # instead of silently ending the processing, and every input is read
  # in binary mode (as the first one).
  $input_fname = shift(@ARGV) // '';
  if ($input_fname ne '') {
    open($input_fh, '<', $input_fname)
      or die $log->fatal("File '$input_fname' could not be opened.");
    binmode $input_fh;
  };
} while ($input_fname ne '');

# Finish the zip output and release all resources
$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;
561
Peter Harders6f526a32020-06-29 21:44:41 +0200562
Akrond949e182020-02-14 12:23:57 +0100563__END__
564
565=pod
566
567=encoding utf8
568
569=head1 NAME
570
571tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
572
573=head1 SYNOPSIS
574
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200575 cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip
576 tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akrond949e182020-02-14 12:23:57 +0100577
578=head1 DESCRIPTION
579
Akronee434b12020-07-08 12:53:01 +0200580C<tei2korapxml> is a script to convert TEI P5 and
Akrond72baca2021-07-23 13:25:32 +0200581L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akronee434b12020-07-08 12:53:01 +0200582based documents to the
583L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders6f526a32020-06-29 21:44:41 +0200584
Akrond949e182020-02-14 12:23:57 +0100585This program is usually called from inside another script.
586
Akronee434b12020-07-08 12:53:01 +0200587=head1 FORMATS
588
589=head2 Input restrictions
590
591=over 2
592
593=item
594
Akronee434b12020-07-08 12:53:01 +0200595TEI P5 formatted input with certain restrictions:
596
597=over 4
598
599=item
600
Akrone48bec42023-01-05 12:18:45 +0100601B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
Akronee434b12020-07-08 12:53:01 +0200603
604=item
605
606B<optional>: corp-header with integrated corpsigle,
607doc-header with integrated docsigle
608
609=back
610
611=item
612
Akron0c41ab32020-09-29 07:33:33 +0200613All tokens inside the primary text may not be
newline separated, because newlines are removed
Akron0c41ab32020-09-29 07:33:33 +0200615(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akronee434b12020-07-08 12:53:01 +0200616into blanks between 2 tokens could lead to additional blanks,
617where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
Akron8a0c4bf2021-03-16 16:51:21 +0100619(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akronee434b12020-07-08 12:53:01 +0200620
Akron940ca6f2021-10-11 12:38:39 +0200621=item
622
623Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
624need to be defined in the same line as the header tag.
625
Akronee434b12020-07-08 12:53:01 +0200626=back
627
628=head2 Notes on the output
629
630=over 2
631
632=item
633
634zip file output (default on C<stdout>) with utf8 encoded entries
635(which together form the KorAP-XML format)
636
637=back
638
Akrond949e182020-02-14 12:23:57 +0100639=head1 INSTALLATION
640
Akrond26319b2023-01-12 15:34:41 +0100641C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietze83a4e92021-03-16 20:51:26 +0100642When these requirements are met, the preferred way to install the script is
Akrond949e182020-02-14 12:23:57 +0100643to use L<cpanm|App::cpanminus>.
644
645 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
646
647In case everything went well, the C<tei2korapxml> tool will
648be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200649
Akrond949e182020-02-14 12:23:57 +0100650Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
651
652=head1 OPTIONS
653
654=over 2
655
Akrona2cb2812021-10-30 10:29:08 +0200656=item B<--input|-i>
657
658The input file to process. If no specific input is defined and a single
659dash C<-> is passed as an argument, data is read from C<STDIN>.
660
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200661Instead of using C<-i> input files can also be defined as trailing arguments
662to the command:
663
664 tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
665
Akron132bdeb2024-06-06 14:28:56 +0200666=item B<--output|-o>
667
668The output zip file to be created. If no specific output is defined,
669data is written to C<STDOUT>.
Akrona2cb2812021-10-30 10:29:08 +0200670
Akron4e603a52020-07-27 14:23:49 +0200671=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100672
Akron4e603a52020-07-27 14:23:49 +0200673The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100674
675=item B<--help|-h>
676
677Print help information.
678
679=item B<--version|-v>
680
681Print version information.
682
Akrone48bec42023-01-05 12:18:45 +0100683=item B<--tokenizer-korap|-tk>
Akron2520a342022-03-29 18:18:05 +0200684
Akrone48bec42023-01-05 12:18:45 +0100685Use the standard KorAP/DeReKo tokenizer.
686
687=item B<--tokenizer-internal|-ti>
688
689Tokenize the data using two embedded tokenizers,
690that will take an I<aggressive> and a I<conservative>
691approach.
Akron2520a342022-03-29 18:18:05 +0200692
Akron4e603a52020-07-27 14:23:49 +0200693=item B<--tokenizer-call|-tc>
694
695Call an external tokenizer process, that will tokenize
Akron11484782021-11-03 20:12:14 +0100696from STDIN and outputs the offsets of all tokens.
697
698Texts are separated using C<\x04\n>. The external process
699should add a new line per text.
700
701If the L</--use-tokenizer-sentence-splits> option is activated,
702sentences are marked by offset as well in new lines.
703
704To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
706
707 $ cat corpus.i5.xml | tei2korapxml -s \
708 $ -tc 'datok tokenize \
709 $ -t ./tokenizer.matok \
710 $ -p --newline-after-eot --no-sentences \
711 $ --no-tokens --sentence-positions -' - \
712 $ > corpus.korapxml.zip
Akron4e603a52020-07-27 14:23:49 +0200713
Akronb93fabb2023-01-13 12:05:44 +0100714=item B<--no-tokenizer>
715
716Boolean flag indicating that no tokenizer should be used.
717This is meant to ensure that by default a final token layer always
718exists.
719If a separate tokenizer is chosen, this flag is ignored.
720
Akron75d63142021-02-23 18:40:56 +0100721=item B<--skip-inline-tokens>
722
723Boolean flag indicating that inline tokens should not
724be processed. Defaults to false (meaning inline tokens will be processed).
725
Akron692d17d2021-03-05 13:21:03 +0100726=item B<--skip-inline-token-annotations>
727
728Boolean flag indicating that inline token annotations should not
729be processed. Defaults to true (meaning inline token annotations
Akron6b1f26b2024-09-19 11:35:32 +0200730won't be processed). Can be negated with
731C<--no-skip-inline-token-annotations>.
Akron692d17d2021-03-05 13:21:03 +0100732
Akronca70a1d2021-02-25 16:21:31 +0100733=item B<--skip-inline-tags> <tags>
Akron54c3ff12021-02-25 11:33:37 +0100734
735Expects a comma-separated list of tags to be ignored when the structure
736is parsed. Content of these tags however will be processed.
737
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +0200738=item B<--auto-textsigle> <textsigle>
739
740Expects a text sigle that serves as a fallback if no text sigles
741are given in the input data.
742The auto text sigle will be incremented for each text processed.
743
744Example:
745
746 tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
747 < data.i5.xml > korapxml.zip
748
Marc Kupietza671ae52022-12-22 16:28:14 +0100749=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
750
Akrone48bec42023-01-05 12:18:45 +0100751Expects a regular replacement expression (separated by B<@> between the
Marc Kupietza671ae52022-12-22 16:28:14 +0100752search and the replacement) to convert text id attributes to text sigles
753with three parts (separated by B</>).
754
755Example:
756
757 tei2korapxml \
758 --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
759 -tk - < t/data/icc_german_sample.p5.xml
760
Akrone48bec42023-01-05 12:18:45 +0100761Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
762sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietza671ae52022-12-22 16:28:14 +0100763
Akron1a5271a2021-02-18 13:18:15 +0100764=item B<--inline-tokens> <foundry>#[<file>]
765
766Define the foundry and file (without extension)
767to store inline token information in.
Akron8a0c4bf2021-03-16 16:51:21 +0100768Unless C<--skip-inline-token-annotations> is set,
769this will contain annotations as well.
Akron1a5271a2021-02-18 13:18:15 +0100770Defaults to C<tokens> and C<morpho>.
771
Akrone2819a12021-10-12 15:52:55 +0200772The inline token data will also be stored in the
773inline structures file (see I<--inline-structures>),
774unless the inline token foundry is prepended
775by an B<!> exclamation mark, indicating that inline
776tokens are stored exclusively in the inline tokens
777file.
778
779Example:
780
Akron6b1f26b2024-09-19 11:35:32 +0200781 tei2korapxml --no-tokenizer --inline-tokens \
782 '!gingko#morpho' < data.i5.xml > korapxml.zip
783
784=item B<--inline-dependencies> <foundry>#[<file>]
785
786Define the foundry and file (without extension)
787to store inline dependency information in.
788Defaults to the layer of C<dependency> and
789will be ignored if not set (which means that dependency
790attributes will be stored in the inline tokens file,
791unless they are skipped).
792
793The dependency data will also be stored in the
794inline token file (see I<--inline-tokens>),
795unless the inline dependencies foundry is prepended
796by an B<!> exclamation mark, indicating that inline
797dependency data is stored exclusively in the inline
798dependencies file.
799
800Example:
801
802 tei2korapxml --no-tokenizer --inline-dependencies \
803 'gingko#dependency' < data.i5.xml > korapxml.zip
804
Akrone2819a12021-10-12 15:52:55 +0200805
Akrondd0be8f2021-02-18 19:29:41 +0100806=item B<--inline-structures> <foundry>#[<file>]
807
808Define the foundry and file (without extension)
809to store inline structure information in.
810Defaults to C<struct> and C<structures>.
811
Akron26a71522021-02-19 10:27:37 +0100812=item B<--base-foundry> <foundry>
813
814Define the base foundry to store newly generated
815token information in.
816Defaults to C<base>.
817
818=item B<--data-file> <file>
819
820Define the file (without extension)
821to store primary data information in.
822Defaults to C<data>.
823
824=item B<--header-file> <file>
825
826Define the file name (without extension)
827to store header information on
828the corpus, document, and text level in.
829Defaults to C<header>.
830
Marc Kupietz985da0c2021-02-15 19:29:50 +0100831=item B<--use-tokenizer-sentence-splits|-s>
832
833Replace existing sentence boundary information with, or add, the boundaries
Akron11484782021-11-03 20:12:14 +0100834provided by the tokenizer.
835Currently KorAP-tokenizer and certain external tokenizers support
836these boundaries.
Marc Kupietz985da0c2021-02-15 19:29:50 +0100837
Akron91705d72021-02-19 10:59:45 +0100838=item B<--tokens-file> <file>
839
840Define the file (without extension)
841to store generated token information in
842(either from the KorAP tokenizer or an externally called tokenizer).
843Defaults to C<tokens>.
844
Akron3378dfd2020-08-01 15:01:36 +0200845=item B<--log|-l>
846
847Loglevel for I<Log::Any>. Defaults to C<notice>.
848
Akrond949e182020-02-14 12:23:57 +0100849=back
850
Akronb3649472020-09-29 08:24:46 +0200851=head1 ENVIRONMENT VARIABLES
852
853=over 2
854
855=item B<KORAPXMLTEI_DEBUG>
856
857Activate minimal debugging.
858Defaults to C<false>.
859
Marc Kupietzd254f5c2025-04-16 10:37:08 +0200860=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
861
862Set the heap size for the tokenizer process.
863Defaults to C<512m>.
864
Akronb3649472020-09-29 08:24:46 +0200865=back
866
Akrond949e182020-02-14 12:23:57 +0100867=head1 COPYRIGHT AND LICENSE
868
Marc Kupietzb6fd6bc2025-04-16 12:47:26 +0200869Copyright (C) 2021-2025, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100870
871Author: Peter Harders
872
Akronaabd0952020-09-29 07:35:08 +0200873Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100874
875L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
876Corpus Analysis Platform at the
Akrond72baca2021-07-23 13:25:32 +0200877L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akrond949e182020-02-14 12:23:57 +0100878member of the
879L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
880
881This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100882L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100883
884=cut
Akronf8088e62021-02-18 16:18:59 +0100885
886# NOTES
887
Akronf8088e62021-02-18 16:18:59 +0100888## Notes on segfault prevention
889
Akron91577922021-02-19 10:32:54 +0100890Calling binmode on the input handle prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100891(see notes on 'PerlIO layers' in 'man XML::LibXML').
892Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
893See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
894See in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.