#!/usr/bin/env perl
use strict;
use warnings;

use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);

use File::Basename qw(dirname);

use Encode qw(decode);

use FindBin;

# Prefer the libraries shipped next to this script over installed ones.
# This has to happen before the first KorAP::XML::TEI module is loaded,
# otherwise an installed (possibly older) version would win.
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

use KorAP::XML::TEI qw!remove_xml_comments replace_entities increase_auto_textsigle!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
Peter Hardersd892a582020-02-12 15:45:22 +010027
# Version of this script
our $VERSION = '2.7.1';

# Banner printed together with the help and version output
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set KORAPXMLTEI_DEBUG to 1 for minimal more debug output
# (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# The environment switch is still honored (as the default of
# --skip-inline-token-annotations), but its use is discouraged
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};

# Inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;

# Inline dependencies won't be stored in the tokens file
my $inline_deps_exclusive = 0;
46
# Option values and their defaults
my $auto_textsigle                = '';
my $root_dir                      = '.';
my $input_fname                   = '';
my $output_fname                  = '';
my $tokenizer_call;
my $tokenizer_korap;
my $tokenizer_intern;
my $no_tokenizer;
my $use_tokenizer_sentence_splits;
my $inline_tokens                 = 'tokens#morpho';
my $inline_structures             = 'struct#structure';
my $inline_dependencies;
my $skip_inline_tokens            = 0;
my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1);
my $skip_inline_tags_str          = '';
my $base_dir                      = 'base';
my $data_file                     = 'data';
my $header_file                   = 'header';
my $tokens_file                   = 'tokens';
my $xmlid_to_textsigle            = '';
my $log_level                     = 'notice';
my $required_version;
my $progress;

# Set if a single dash was passed (read from STDIN)
my $stdio;

# Print the version banner followed by the requested parts of the POD
my $print_usage = sub {
  pod2usage(
    @_,
    -msg    => $VERSION_MSG,
    -output => '-'
  );
};

# Parse options from the command line
GetOptions(
  'auto-textsigle|A=s'              => \$auto_textsigle,
  'root|r=s'                        => \$root_dir,
  'input|i=s'                       => \$input_fname,
  'output|o=s'                      => \$output_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'no-tokenizer'                    => \$no_tokenizer,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'inline-dependencies=s'           => \$inline_dependencies,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'skip-inline-token-annotations!'  => \$skip_inline_token_annotations,
  'skip-inline-tags=s'              => \$skip_inline_tags_str,
  'base-foundry=s'                  => \$base_dir,
  'data-file=s'                     => \$data_file,
  'header-file=s'                   => \$header_file,
  'tokens-file=s'                   => \$tokens_file,
  'xmlid-to-textsigle|x=s'          => \$xmlid_to_textsigle,
  'log|l=s'                         => \$log_level,
  'required-version|rv=s'           => \$required_version,
  'progress|p'                      => \$progress,
  ''                                => \$stdio,
  'help|h' => sub {
    $print_usage->(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS'
    );
  },
  'version|v' => sub {
    $print_usage->(-verbose => 0);
  }
);
90
Akrond3e1d282021-02-24 14:51:27 +010091
# Establish logger (log messages go to STDERR as UTF-8)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# Abort unless this script has exactly the version demanded by
# --required-version (surrounding whitespace is tolerated)
if ($required_version) {

  # Capture in list context, so that a failed match yields undef
  # instead of a stale $1 from an earlier successful match
  my ($wanted) = $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;

  if (!defined $wanted || $wanted ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};
105
106
# Split the --xmlid-to-textsigle expression '<from-regex>@<to>' into the
# compiled search pattern ($what) and the replacement template ($with)
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
  ($what, $with) = split('@', $xmlid_to_textsigle);

  # Without an '@' there is no replacement part - make that explicit
  # instead of interpolating an undefined value later on
  unless (defined $with) {
    $log->warn(
      "--xmlid-to-textsigle '$xmlid_to_textsigle' has no replacement part " .
      "(expected '<from-regex>\@<to>'); matches will be removed"
    );
    $with = '';
  };

  $what = qr!$what!;
};
112
# The progress bar is optional: Time::Progress is only loaded on demand
# and the bar is switched off again if the module is missing
if ($progress && !eval { require Time::Progress; 1 }) {
  $log->warn('Time::Progress not installed. Progress bar disabled.');
  $progress = 0;
};
122
# Tag (without attributes) that contains the primary text
my $_TEXT_BODY = 'text';

# Lookup table of inline tags to skip,
# filled from the comma-separated --skip-inline-tags list
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
  $skip_inline_tags{$_} = 1 for split /\s*,\s*/, $skip_inline_tags_str;
};
134
# Tokenizer producing the base tokenization
# (stays undefined for --tokenizer-internal and --no-tokenizer)
my $ext_tok;

# External tokenization
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The KorAP tokenizer is loaded on demand only. Report a failure
  # directly, instead of running into a misleading version mismatch.
  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
    $log->error('KorAP-Tokenizer could not be loaded: ' . ($@ || 'unknown error'));
    exit(1);
  };

  # The tokenizer has to be of the very same version as this script
  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // '';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};

# Sentences are provided by the tokenizer, so inline <s> tags are skipped
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};
Akron0c41ab32020-09-29 07:33:33 +0200167
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;


# The --inline-* options have the form '<foundry>#[<file>]'. The default
# file name is appended before splitting, so it is used if no '#' is given.
# If the file part is given but empty ('<foundry>#'), the split yields an
# empty file name, which is replaced by the default as well.

# Name of the directory and the file containing all inline structure informations
# except for $_TOKENS_TAG information
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
$_structure_file = 'structure' if $_structure_file eq '';

# Name of the directory and the file containing all inline token informations
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
$_tokens_file = 'morpho' if $_tokens_file eq '';

# A leading '!' marks the tokens foundry as exclusive
if (index($_tokens_dir, '!') == 0) {
  $_tokens_dir = substr($_tokens_dir, 1);
  $inline_tokens_exclusive = 1;
};


# Name of the directory and the file containing all inline dependency
# informations (only if --inline-dependencies was passed)
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';
  $_dep_file = 'dependency' if $_dep_file eq '';

  # From here on the option is only used as a boolean flag
  $inline_dependencies = 1;

  # A leading '!' marks the dependency foundry as exclusive
  if ($_dep_dir && index($_dep_dir, '!') == 0) {
    $_dep_dir = substr($_dep_dir, 1);
    $inline_deps_exclusive = 1;
  };
};
197
198
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);

# State shared between the texts of the input:
#   $dir         - text directory (below $root_dir)
#   $text_id_esc - escaped version of text id
#   $input_enc   - default encoding of the text
#   $text_line   - text line (needed for whitespace handling)
my ($dir, $text_id_esc, $input_enc, $text_line) = ('', undef, 'UTF-8', 0);
213
Peter Harders6f526a32020-06-29 21:44:41 +0200214
# Input file handle
my $input_fh;

# Single dash was set: read from STDIN
if ($stdio) {
  $input_fh = *STDIN;
}

# Input flag or trailing file arguments were passed
elsif ($input_fname ne '' || @ARGV) {

  # Without --input the first trailing argument is the first input file
  $input_fname = shift @ARGV if $input_fname eq '';

  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}

# No input to process: show a short usage message
else {
  pod2usage(
    -msg      => $VERSION_MSG,
    -output   => '-',
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS'
  );
  exit;
};

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;


# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
  $skip_inline_tokens, \%skip_inline_tags,
  $inline_tokens_exclusive, $inline_dependencies
);
254
# Set if one of the trailing input files could not be opened
my $input_failed = 0;

# Process the current input handle and afterwards all input files that
# are left in @ARGV. Per text the header is written when it is read, all
# other files of the text are written when the closing text-body tag is met.
INPUT:
while (1) {

  # Optional progress bar (needs a named input file with a known size)
  my $p;
  if ($progress && $input_fname ne '') {
    my $file_size = -s $input_fname;
    if ($file_size) {
      $p = Time::Progress->new(min => 0, max => $file_size);
      $log->notice("Reading input document $input_fname (Size: $file_size bytes)");
    }
  } elsif ($input_fname ne '') {
    $log->notice("Reading input document $input_fname");
  };

  my $i = 0;
  MAIN:
  while (<$input_fh>) {

    # Refresh the progress bar every 500 lines
    if ($p && ($i++ % 500 == 0)) {
      print STDERR $p->report("\r%20b %p ETA: %E", tell($input_fh));
    };

    # remove HTML (multi-line) comments (<!--...-->)
    $_ = remove_xml_comments($input_fh, $_);

    # Set input encoding
    if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
      $input_enc = $2;
      next;
    };

    $_ = decode($input_enc, $_);
    $_ = replace_entities($_);

    # Start of text body
    if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
      my ($prefix, $suffix) = ($1, $2);

      if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
        die $log->fatal("input line number $.: " .
                          "line with opening text-body tag '${_TEXT_BODY}' " .
                          "contains additional information ... => Aborting (line=$_)");
      };

      # Text body data extracted from input document ($input_fh),
      # further processed by XML::LibXML::Reader
      my $text_buffer = '';

      # Iterate over all lines in the text body
      while (<$input_fh>) {

        $_ = remove_xml_comments($input_fh, $_);
        $_ = decode($input_enc, $_);
        $_ = replace_entities($_);

        # End of text body
        if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

          # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

          my $before = substr($_, 0, $pos);
          my $after  = substr($_, length("</$_TEXT_BODY>") + $pos);

          # Closing tags (like </body>) are allowed in front of the
          # closing text-body tag, any other content is not
          my $before_check = $before;
          $before_check =~ s/<[^>]+>//g; # strip XML tags like </body>
          if (($before_check . $after) !~ /^\s*$/) {
            die $log->fatal("input line number $.: " .
                              "line with closing text-body tag '${_TEXT_BODY}'" .
                              " contains additional information ... => Aborting (line=$_)");
          };

          # Add any remaining content before </text> (e.g. </body>) to the buffer
          $before =~ s/^\s+//;
          $before =~ s/\s+$//;
          $text_buffer .= $before if $before ne '';

          if ($dir eq '') {
            $log->warn(
              "Maybe empty textSigle => skipping this text ...\n" .
                'data=' . substr($inline->data->data, 0, 200)
              );
            next MAIN;
          };

          # Parse inline structure
          $inline->parse($text_id_esc, \$text_buffer);

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
          };

          my $data = $inline->data;

          # Write data.xml
          $data->to_zip(
            $zipper->new_stream("$dir/${data_file}.xml"),
            $text_id_esc
          );

          # Tokenize with external tokenizer
          if ($ext_tok) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
              $text_id_esc
            );

            if ($use_tokenizer_sentence_splits) {
              $ext_tok->sentencize_from_previous_input($inline->structures);
            };
          };

          # Tokenize with internal tokenizer
          if ($tokenizer_intern) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            )->reset;

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            )->reset;
          };

          # ~ write structures ~
          unless ($inline->structures->empty) {
            $inline->structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          unless ($skip_inline_tokens || $inline->tokens->empty) {
            $inline->tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
              $text_id_esc,
              # Either 0 = tokens without inline or
              #        1 = tokens with inline
              #        (4 is passed instead of 1, if the dependency
              #        foundry is exclusive)
              ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
            );
          };

          # ~ write dependencies ~
          unless ($inline->dependencies->empty) {
            $inline->dependencies->to_zip(
              $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
              $text_id_esc,
              3 # = dependency serialization
            );
          };


          # reinit.
          $dir = '';

          next MAIN;
        };


        # ~ whitespace handling ~

        # Fix whitespaces (see notes on whitespace fixing)

        # TODO:
        #   Maybe it's best, to keep the stripping of whitespace and
        #   to just remove the if-clause and to insert a blank by default
        #   (with possibly an option on how newlines in primary text should
        #   be handled (stripped or replaced by a whitespace)).

        # Remove consecutive whitespace at beginning and end (mostly one newline)
        s/^\s+//;
        s/\s+$//;

        # NOTE:
        #   this is only relevant, if a text consists of more than one line

        # TODO:
        #   find a better solution, or create a warning, if a text has more
        #   than one line ($text_line > 1)

        # TODO:
        #   do testing with 2 different corpora
        #   (one with only one-line texts, the other with several lines per text)

        # line contains at least one non-tag character
        if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

          # Increment counter for text lines
          $text_line++;

          # insert blank before 1st character
          # (for 2nd line and consecutive lines)
          $_ = ' ' . $_ if $text_line > 1;
        }

        # add line to buffer
        $text_buffer .= $_;
      };
    }

    # TEI element with an xml:id, that is used as (the base of) the text sigle
    elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
      my $leadin = $1;
      my $id     = $3;
      my $sigle  = $id;

      if ($what) {

        # Apply --xmlid-to-textsigle, e.g.
        #   s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
        # The replacement may refer to capture groups ($1, $2, ...), which
        # is why it has to be compiled by a string eval.
        # NOTE(review): the replacement is code taken from the command line -
        #   never feed this option from untrusted input.
        # The substitution works on a copy, so $_ still holds the current
        # input line for the error messages below.
        my $replacement = $with // '';
        eval "\$sigle =~ s|\$what|$replacement|; 1"
          or $log->warn("Unable to convert text id `$id': $@");
        $log->debug("Converted text id `$id' to sigle `$sigle'");
      };
      $sigle =~ s/\./-/g;

      my @parts = split(/[\/_]/, $sigle);
      if (@parts != 3) {
        die $log->fatal(
          "input line number $.: " .
            "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
            "=> Aborting (line=$_)");
      };

      $dir = join("/", @parts);
      $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
      $log->notice("$0: text_id=$text_id_esc");

      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      };
    }

    # Start of header section
    elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
      my $leadin  = $1;
      my $content = "$2\n";

      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      };

      # Parse header
      my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
      if ($auto_textsigle) {
        $auto_textsigle = increase_auto_textsigle($auto_textsigle);
        $log->debug("Auto-incremented text sigle to $auto_textsigle");
      };

      # Header was parseable
      if ($header) {

        # Write header to zip
        my $file = $header->dir . '/' . $header_file . '.xml';

        $log->debug("Writing file $file") if DEBUG;

        $header->to_zip($zipper->new_stream($file));

        # Header is for text level
        if ($header->type eq 'text') {

          # Remember dir and sigles
          $dir         = $header->dir;
          $text_id_esc = $header->id_esc;

          # log output for seeing progression
          $log->notice("$0: text_id=$text_id_esc");

          # Reset counter for text lines
          # (needed for whitespace handling)
          $text_line = 0;
        };
      };
    };
  };
  $text_id_esc = $auto_textsigle if ($auto_textsigle);

  # Final state of the progress bar for this input
  if ($p) {
    print STDERR $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
  };

  # Continue with the next trailing input file (if any)
  $input_fname = shift(@ARGV);
  last INPUT unless $input_fname;

  # Do not end silently, if an input file is missing: report it,
  # finalize the archive written so far and signal the failure
  unless (open($input_fh, '<', $input_fname)) {
    $log->error("File '$input_fname' could not be opened.");
    $input_failed = 1;
    last INPUT;
  };

  # Read raw bytes from every input file, not only from the first one
  binmode $input_fh;
};

$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;

exit(1) if $input_failed;
547
Peter Harders6f526a32020-06-29 21:44:41 +0200548
Akrond949e182020-02-14 12:23:57 +0100549__END__
550
551=pod
552
553=encoding utf8
554
555=head1 NAME
556
557tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
558
559=head1 SYNOPSIS
560
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200561 cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip
562 tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akrond949e182020-02-14 12:23:57 +0100563
564=head1 DESCRIPTION
565
Akronee434b12020-07-08 12:53:01 +0200566C<tei2korapxml> is a script to convert TEI P5 and
Akrond72baca2021-07-23 13:25:32 +0200567L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akronee434b12020-07-08 12:53:01 +0200568based documents to the
569L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders6f526a32020-06-29 21:44:41 +0200570
Akrond949e182020-02-14 12:23:57 +0100571This program is usually called from inside another script.
572
Akronee434b12020-07-08 12:53:01 +0200573=head1 FORMATS
574
575=head2 Input restrictions
576
577=over 2
578
579=item
580
Akronee434b12020-07-08 12:53:01 +0200581TEI P5 formatted input with certain restrictions:
582
583=over 4
584
585=item
586
B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
Akronee434b12020-07-08 12:53:01 +0200589
590=item
591
592B<optional>: corp-header with integrated corpsigle,
593doc-header with integrated docsigle
594
595=back
596
597=item
598
Tokens inside the primary text must not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akronee434b12020-07-08 12:53:01 +0200606
Akron940ca6f2021-10-11 12:38:39 +0200607=item
608
609Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
610need to be defined in the same line as the header tag.
611
Akronee434b12020-07-08 12:53:01 +0200612=back
613
614=head2 Notes on the output
615
616=over 2
617
618=item
619
620zip file output (default on C<stdout>) with utf8 encoded entries
621(which together form the KorAP-XML format)
622
623=back
624
Akrond949e182020-02-14 12:23:57 +0100625=head1 INSTALLATION
626
Akrond26319b2023-01-12 15:34:41 +0100627C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietze83a4e92021-03-16 20:51:26 +0100628When these requirements are met, the preferred way to install the script is
Akrond949e182020-02-14 12:23:57 +0100629to use L<cpanm|App::cpanminus>.
630
631 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
632
633In case everything went well, the C<tei2korapxml> tool will
634be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200635
Akrond949e182020-02-14 12:23:57 +0100636Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
637
638=head1 OPTIONS
639
640=over 2
641
Akrona2cb2812021-10-30 10:29:08 +0200642=item B<--input|-i>
643
644The input file to process. If no specific input is defined and a single
645dash C<-> is passed as an argument, data is read from C<STDIN>.
646
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200647Instead of using C<-i> input files can also be defined as trailing arguments
648to the command:
649
650 tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
651
Akron132bdeb2024-06-06 14:28:56 +0200652=item B<--output|-o>
653
654The output zip file to be created. If no specific output is defined,
655data is written to C<STDOUT>.
Akrona2cb2812021-10-30 10:29:08 +0200656
Akron4e603a52020-07-27 14:23:49 +0200657=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100658
Akron4e603a52020-07-27 14:23:49 +0200659The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100660
661=item B<--help|-h>
662
663Print help information.
664
665=item B<--version|-v>
666
667Print version information.
668
Akrone48bec42023-01-05 12:18:45 +0100669=item B<--tokenizer-korap|-tk>
Akron2520a342022-03-29 18:18:05 +0200670
Akrone48bec42023-01-05 12:18:45 +0100671Use the standard KorAP/DeReKo tokenizer.
672
673=item B<--tokenizer-internal|-ti>
674
675Tokenize the data using two embedded tokenizers,
676that will take an I<aggressive> and a I<conservative>
677approach.
Akron2520a342022-03-29 18:18:05 +0200678
Akron4e603a52020-07-27 14:23:49 +0200679=item B<--tokenizer-call|-tc>
680
681Call an external tokenizer process, that will tokenize
Akron11484782021-11-03 20:12:14 +0100682from STDIN and outputs the offsets of all tokens.
683
684Texts are separated using C<\x04\n>. The external process
685should add a new line per text.
686
687If the L</--use-tokenizer-sentence-splits> option is activated,
688sentences are marked by offset as well in new lines.
689
To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:

  $ cat corpus.i5.xml | tei2korapxml -s \
      -tc 'datok tokenize \
      -t ./tokenizer.matok \
      -p --newline-after-eot --no-sentences \
      --no-tokens --sentence-positions -' - \
      > corpus.korapxml.zip
Akron4e603a52020-07-27 14:23:49 +0200699
Akronb93fabb2023-01-13 12:05:44 +0100700=item B<--no-tokenizer>
701
702Boolean flag indicating that no tokenizer should be used.
703This is meant to ensure that by default a final token layer always
704exists.
705If a separate tokenizer is chosen, this flag is ignored.
706
Akron75d63142021-02-23 18:40:56 +0100707=item B<--skip-inline-tokens>
708
709Boolean flag indicating that inline tokens should not
710be processed. Defaults to false (meaning inline tokens will be processed).
711
Akron692d17d2021-03-05 13:21:03 +0100712=item B<--skip-inline-token-annotations>
713
714Boolean flag indicating that inline token annotations should not
715be processed. Defaults to true (meaning inline token annotations
Akron6b1f26b2024-09-19 11:35:32 +0200716won't be processed). Can be negated with
717C<--no-skip-inline-token-annotations>.
Akron692d17d2021-03-05 13:21:03 +0100718
Akronca70a1d2021-02-25 16:21:31 +0100719=item B<--skip-inline-tags> <tags>
Akron54c3ff12021-02-25 11:33:37 +0100720
721Expects a comma-separated list of tags to be ignored when the structure
722is parsed. Content of these tags however will be processed.
723
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +0200724=item B<--auto-textsigle> <textsigle>
725
Expects a text sigle that serves as a fallback if no text sigles
are given in the input data.
The auto text sigle will be incremented for each text processed.
729
730Example:
731
732 tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
733 < data.i5.xml > korapxml.zip
734
Marc Kupietza671ae52022-12-22 16:28:14 +0100735=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
736
Akrone48bec42023-01-05 12:18:45 +0100737Expects a regular replacement expression (separated by B<@> between the
Marc Kupietza671ae52022-12-22 16:28:14 +0100738search and the replacement) to convert text id attributes to text sigles
739with three parts (separated by B</>).
740
741Example:
742
743 tei2korapxml \
744 --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
745 -tk - < t/data/icc_german_sample.p5.xml
746
Akrone48bec42023-01-05 12:18:45 +0100747Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
748sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietza671ae52022-12-22 16:28:14 +0100749
Akron1a5271a2021-02-18 13:18:15 +0100750=item B<--inline-tokens> <foundry>#[<file>]
751
752Define the foundry and file (without extension)
753to store inline token information in.
Akron8a0c4bf2021-03-16 16:51:21 +0100754Unless C<--skip-inline-token-annotations> is set,
755this will contain annotations as well.
Akron1a5271a2021-02-18 13:18:15 +0100756Defaults to C<tokens> and C<morpho>.
757
Akrone2819a12021-10-12 15:52:55 +0200758The inline token data will also be stored in the
759inline structures file (see I<--inline-structures>),
760unless the inline token foundry is prefixed
761with an exclamation mark (B<!>), indicating that inline
762tokens are stored exclusively in the inline tokens
763file.
764
765Example:
766
Akron6b1f26b2024-09-19 11:35:32 +0200767 tei2korapxml --no-tokenizer --inline-tokens \
768 '!gingko#morpho' < data.i5.xml > korapxml.zip
769
770=item B<--inline-dependencies> <foundry>#[<file>]
771
772Define the foundry and file (without extension)
773to store inline dependency information in.
774Defaults to the layer of C<dependency> and
775will be ignored if not set (which means that dependency
776attributes will be stored in the inline tokens file,
777unless they are skipped).
778
779The dependency data will also be stored in the
780inline token file (see I<--inline-tokens>),
781unless the inline dependencies foundry is prefixed
782with an exclamation mark (B<!>), indicating that inline
783dependency data is stored exclusively in the inline
784dependencies file.
785
786Example:
787
788 tei2korapxml --no-tokenizer --inline-dependencies \
789 'gingko#dependency' < data.i5.xml > korapxml.zip
790
Akrone2819a12021-10-12 15:52:55 +0200791
Akrondd0be8f2021-02-18 19:29:41 +0100792=item B<--inline-structures> <foundry>#[<file>]
793
794Define the foundry and file (without extension)
795to store inline structure information in.
796Defaults to C<struct> and C<structures>.
797
Akron26a71522021-02-19 10:27:37 +0100798=item B<--base-foundry> <foundry>
799
800Define the base foundry to store newly generated
801token information in.
802Defaults to C<base>.
803
804=item B<--data-file> <file>
805
806Define the file (without extension)
807to store primary data information in.
808Defaults to C<data>.
809
810=item B<--header-file> <file>
811
812Define the file name (without extension)
813to store header information on
814the corpus, document, and text level in.
815Defaults to C<header>.
816
Marc Kupietz985da0c2021-02-15 19:29:50 +0100817=item B<--use-tokenizer-sentence-splits|-s>
818
819Use the sentence boundary information provided by the tokenizer,
Akron11484782021-11-03 20:12:14 +0100820replacing existing sentence boundaries or adding new ones.
821Currently KorAP-tokenizer and certain external tokenizers support
822these boundaries.
Marc Kupietz985da0c2021-02-15 19:29:50 +0100823
Akron91705d72021-02-19 10:59:45 +0100824=item B<--tokens-file> <file>
825
826Define the file (without extension)
827to store generated token information in
828(either from the KorAP tokenizer or an externally called tokenizer).
829Defaults to C<tokens>.
830
Akron3378dfd2020-08-01 15:01:36 +0200831=item B<--log|-l>
832
833Log level for I<Log::Any>. Defaults to C<notice>.
834
Akrond949e182020-02-14 12:23:57 +0100835=back
836
Akronb3649472020-09-29 08:24:46 +0200837=head1 ENVIRONMENT VARIABLES
838
839=over 2
840
841=item B<KORAPXMLTEI_DEBUG>
842
843Activate minimal debugging.
844Defaults to C<false>.
845
Marc Kupietzd254f5c2025-04-16 10:37:08 +0200846=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
847
848Set the heap size for the tokenizer process.
849Defaults to C<512m>.
850
Akronb3649472020-09-29 08:24:46 +0200851=back
852
Akrond949e182020-02-14 12:23:57 +0100853=head1 COPYRIGHT AND LICENSE
854
Marc Kupietzb6fd6bc2025-04-16 12:47:26 +0200855Copyright (C) 2021-2025, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100856
857Author: Peter Harders
858
Akronaabd0952020-09-29 07:35:08 +0200859Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100860
861L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
862Corpus Analysis Platform at the
Akrond72baca2021-07-23 13:25:32 +0200863L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akrond949e182020-02-14 12:23:57 +0100864member of the
865L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
866
867This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100868L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100869
870=cut
Akronf8088e62021-02-18 16:18:59 +0100871
872# NOTES
873
Akronf8088e62021-02-18 16:18:59 +0100874## Notes on segfault prevention
875
Akron91577922021-02-19 10:32:54 +0100876Calling binmode on the input handle prevents 'XML::LibXML::Reader' from segfaulting inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100877(see the notes on 'PerlIO layers' in 'man XML::LibXML').
878Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on the input is more granular.
879See perluniintro: You can switch encodings on an already opened stream by using "binmode()".
880See perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.