blob: 207078242618fc47f2831725e75265a2f3822c56 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +02009use KorAP::XML::TEI qw(increase_auto_textsigle);
Peter Harders6f526a32020-06-29 21:44:41 +020010
11use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010012
Akroneaa96232020-10-15 17:06:15 +020013use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010014
Akron4f67cd42020-07-02 12:27:58 +020015use FindBin;
16BEGIN {
17 unshift @INC, "$FindBin::Bin/../lib";
18};
19
Marc Kupietz8a954e52021-02-16 22:03:07 +010020use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020021use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020022use KorAP::XML::TEI::Tokenizer::Conservative;
23use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron85717512020-07-08 11:19:19 +020024use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020025use KorAP::XML::TEI::Header;
Akroneb12e232021-02-25 13:49:50 +010026use KorAP::XML::TEI::Inline;
Peter Hardersd892a582020-02-12 15:45:22 +010027
# Script version and the banner printed by --help and --version
our $VERSION     = '2.7.1';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Debug switch, fixed at compile time from the environment.
# Set KORAPXMLTEI_DEBUG to 1 for a little more debug output
# (deliberately not exposed as a command line option).
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# The environment switch still works but has been superseded by an option
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};

# Exclusivity flags, both off by default:
# - inline tokens won't be stored in the structure file
# - inline dependencies won't be stored in the tokens file
my ($inline_tokens_exclusive, $inline_deps_exclusive) = (0, 0);
46
# Parse options from the command line.
# All option targets are file-scoped lexicals that the rest of the
# script reads; they are declared up front together with their defaults.

# Options that carry a default value
my $auto_textsigle       = '';
my $root_dir             = '.';
my $input_fname          = '';
my $output_fname         = '';
my $inline_tokens        = 'tokens#morpho';
my $inline_structures    = 'struct#structure';
my $skip_inline_tokens   = 0;
my $skip_inline_tags_str = '';
my $base_dir             = 'base';
my $data_file            = 'data';
my $header_file          = 'header';
my $tokens_file          = 'tokens';
my $xmlid_to_textsigle   = '';
my $log_level            = 'notice';

# The deprecated environment switch flips the default of this option
my $skip_inline_token_annotations = $ENV{KORAPXMLTEI_INLINE} ? 0 : 1;

# Options that stay undefined unless passed
my (
  $tokenizer_call,
  $tokenizer_korap,
  $tokenizer_intern,
  $no_tokenizer,
  $use_tokenizer_sentence_splits,
  $inline_dependencies,
  $required_version,
  $progress,
  $stdio
);

# --help: print the main POD sections on STDOUT
my $show_help = sub {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  )
};

# --version: print the version banner on STDOUT
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

GetOptions(
  'auto-textsigle|A=s'              => \$auto_textsigle,
  'root|r=s'                        => \$root_dir,
  'input|i=s'                       => \$input_fname,
  'output|o=s'                      => \$output_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'no-tokenizer'                    => \$no_tokenizer,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'inline-dependencies=s'           => \$inline_dependencies,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'skip-inline-token-annotations!'  => \$skip_inline_token_annotations,
  'skip-inline-tags=s'              => \$skip_inline_tags_str,
  'base-foundry=s'                  => \$base_dir,
  'data-file=s'                     => \$data_file,
  'header-file=s'                   => \$header_file,
  'tokens-file=s'                   => \$tokens_file,
  'xmlid-to-textsigle|x=s'          => \$xmlid_to_textsigle,
  'log|l=s'                         => \$log_level,
  'required-version|rv=s'           => \$required_version,
  'progress|p'                      => \$progress,

  # The empty option name catches a lone dash (read from STDIN)
  ''                                => \$stdio,
  'help|h'                          => $show_help,
  'version|v'                       => $show_version
);
90
Akrond3e1d282021-02-24 14:51:27 +010091
# Establish logger (messages go to STDERR as UTF-8)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# Abort unless the version demanded via --required-version is exactly
# the version of this script (surrounding whitespace is tolerated).
if ($required_version) {

  # Take the capture from the match result in list context instead of
  # reading $1 afterwards: $1 keeps its old value when a match fails.
  my ($wanted_version) =
    $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;

  if (!defined $wanted_version || $wanted_version ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};
105
106
# Search pattern ($what) and replacement ($with) used to turn xml:id
# values into text sigles; both stay undefined unless
# --xmlid-to-textsigle was passed as <from-regex>@<to-sigle>.
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
  ($what, $with) = split(/@/, $xmlid_to_textsigle);

  # Without a replacement part, $with would be undefined and raise an
  # uninitialized-value warning for every converted text. Keep the
  # resulting behavior (matches are replaced by nothing), but say so once.
  unless (defined $with) {
    $log->warn(
      "--xmlid-to-textsigle '$xmlid_to_textsigle' has no replacement part " .
      "after '\@' - matches will be replaced by an empty string"
    );
    $with = '';
  };

  # An option value consisting of separators only yields no pattern at all
  $what //= '';

  $what = qr!$what!;
};
112
# Handle the progress bar is written to (opened only with --progress)
my $progress_fh;

if ($progress) {

  # Load the progress module and open the terminal device directly.
  # Any failure in here just disables the progress bar.
  my $progress_ready = eval {
    require Time::Progress;

    my $tty = 'CON';
    $tty = '/dev/tty' unless $^O eq 'MSWin32';

    open($progress_fh, '>', $tty) or die "Cannot open $tty";
    $progress_fh->autoflush(1);
    1;
  };

  unless ($progress_ready) {

    # Report the error message without the trailing " at FILE line N."
    $log->warn('Progress bar disabled: ' . ($@ =~ s/ at .*//sr));
    $progress = 0;
  };
};
127
# Name of the tag (without attributes) that contains the primary text
my $_TEXT_BODY = 'text';

# Inline tags to skip, as a lookup table keyed by tag name.
# Filled from the comma-separated value of --skip-inline-tags.
my %skip_inline_tags;
if ($skip_inline_tags_str) {
  $skip_inline_tags{$_} = 1 for split /\s*,\s*/, $skip_inline_tags_str;
};
139
# External tokenizer object; stays undefined when only the internal
# tokenizers or no tokenizer at all are requested.
my $ext_tok;

# External tokenization (--tokenizer-call)
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization (--tokenizer-korap)
elsif ($tokenizer_korap) {

  # The module is loaded on demand. A failure to load is reported as
  # such, instead of surfacing later as an undefined version.
  eval {
    require KorAP::XML::TEI::Tokenizer::KorAP;
    1;
  } or do {
    my $reason = $@ || 'unknown error';
    $log->error("KorAP-Tokenizer could not be loaded: $reason");
    exit(1);
  };

  # Tokenizer and script are expected to carry the same version
  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // 'unknown';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};

# When sentence splits are requested from a tokenizer, inline <s> tags
# are added to the tags to skip.
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};
Akron0c41ab32020-09-29 07:33:33 +0200172
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;


# Directory and file name for all inline structure information
# (given as <dir>#<file>; the file part falls back to 'structure')
my ($_structure_dir, $_structure_file) =
  split '#', "$inline_structures#structure";

# Directory and file name for all inline token information
# (given as <dir>#<file>; the file part falls back to 'morpho')
my ($_tokens_dir, $_tokens_file) =
  split '#', "$inline_tokens#morpho";

# A leading '!' is stripped from the directory and switches on
# exclusive storage of inline tokens
if ($_tokens_dir =~ s/^!//) {
  $inline_tokens_exclusive = 1;
};


# Directory and file name for inline dependency information
# (only set with --inline-dependencies; file falls back to 'dependency')
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = split '#', "$inline_dependencies#dependency";

  # From here on the option value only serves as a boolean flag
  $inline_dependencies = 1;

  # Same '!' convention as for inline tokens
  if ($_dep_dir && $_dep_dir =~ s/^!//) {
    $inline_deps_exclusive = 1;
  };
};
202
203
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);

# State shared between the texts of the input:
my $dir         = '';       # text directory (below $root_dir)
my $text_id_esc;            # escaped version of the text id
my $input_enc   = 'UTF-8';  # default encoding of the text
my $text_line   = 0;        # text line (needed for whitespace handling)


# Input file handle
my $input_fh;

# A single dash was passed: read from STDIN
if ($stdio) {
  $input_fh = *STDIN;
}

# Neither --input nor trailing file arguments: print usage and stop
elsif ($input_fname eq '' && !@ARGV) {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
  exit;
}

# Read from --input or, without it, from the first trailing argument
else {
  $input_fname = shift @ARGV if $input_fname eq '';

  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
};

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;


# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
  $skip_inline_tokens,
  \%skip_inline_tags,
  $inline_tokens_exclusive,
  $inline_dependencies
);
259
# Main conversion loop.
#
# Processes the already opened input handle first and then every
# remaining file name in @ARGV. Each input is read line by line:
# - an XML declaration sets the input encoding,
# - a <TEI xml:id="..."> line or an (ids|tei)Header sets the current
#   text directory and text id,
# - the text body (<text> ... </text>) is buffered, parsed for inline
#   markup and written to the zip together with the requested
#   tokenizations.
INPUT:
while (1) {

  # Progress bar (only with --progress, a named input file and a
  # non-zero file size)
  my $p;
  if ($progress && $input_fname ne '') {
    my $file_size = -s $input_fname;
    if ($file_size) {
      $p = Time::Progress->new(min => 0, max => $file_size);
      $log->notice("Reading input document $input_fname (Size: $file_size bytes)");
    }
  } elsif ($input_fname ne '') {
    $log->notice("Reading input document $input_fname");
  };

  my $i = 0;
  MAIN:
  while (<$input_fh>) {

    # Refresh the progress bar every 500 lines
    if ($p && ($i++ % 500 == 0)) {
      print {$progress_fh} $p->report("\r%20b %p ETA: %E", tell($input_fh));
    };

    # remove HTML (multi-line) comments (<!--...-->)
    $_ = remove_xml_comments($input_fh, $_);

    # Set input encoding.
    # The value runs up to the matching closing quote. (A backreference
    # cannot be used inside a character class - [^\1] would only exclude
    # the control character \x01 - hence the negative lookahead.)
    if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])((?:(?!\1).)+)\1/s) {
      $input_enc = $2;
      next;
    };

    $_ = decode($input_enc, $_);
    $_ = replace_entities($_);

    # Start of text body
    if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
      my ($prefix, $suffix) = ($1, $2);

      # The opening tag has to stand on a line of its own
      if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
        die $log->fatal("input line number $.: " .
                          "line with opening text-body tag '${_TEXT_BODY}' " .
                          "contains additional information ... => Aborting (line=$_)");
      };

      # Text body data extracted from input document ($input_fh),
      # further processed by XML::LibXML::Reader
      my $text_buffer = '';

      # Iterate over all lines in the text body
      while (<$input_fh>) {

        $_ = remove_xml_comments($input_fh, $_);
        $_ = decode($input_enc, $_);
        $_ = replace_entities($_);

        # End of text body
        if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

          # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files

          my $before = substr($_, 0, $pos);
          my $after = substr($_, length("</$_TEXT_BODY>") + $pos);

          # Apart from other tags (like </body>) nothing may share the
          # line with the closing tag
          my $before_check = $before;
          $before_check =~ s/<[^>]+>//g; # strip XML tags like </body>
          if (($before_check . $after) !~ /^\s*$/) {
            die $log->fatal("input line number $.: " .
                              "line with closing text-body tag '${_TEXT_BODY}'" .
                              " contains additional information ... => Aborting (line=$_)");
          };

          # Add any remaining content before </text> (e.g. </body>) to the buffer
          $before =~ s/^\s+//;
          $before =~ s/\s+$//;
          $text_buffer .= $before if $before ne '';

          # Without a text directory there is nowhere to write to
          if ($dir eq '') {
            $log->warn(
              "Maybe empty textSigle => skipping this text ...\n" .
                'data=' . substr($inline->data->data, 0, 200)
              );
            next MAIN;
          };

          # Parse inline structure
          $inline->parse($text_id_esc, \$text_buffer);

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
          };

          my $data = $inline->data;

          # Write data.xml
          $data->to_zip(
            $zipper->new_stream("$dir/${data_file}.xml"),
            $text_id_esc
          );

          # Tokenize with external tokenizer
          if ($ext_tok) {

            # Tokenize and output
            $ext_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
              $text_id_esc
            );

            if ($use_tokenizer_sentence_splits) {
              $ext_tok->sentencize_from_previous_input($inline->structures);
            };
          };

          # Tokenize with internal tokenizer
          if ($tokenizer_intern) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            )->reset;

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            )->reset;
          };

          # ~ write structures ~
          unless ($inline->structures->empty) {
            $inline->structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          unless ($skip_inline_tokens || $inline->tokens->empty) {
            $inline->tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
              $text_id_esc,
              # Either 0 = tokens without inline or
              #        1 = tokens with inline
              # (4 is passed instead of 1 when inline dependencies
              # are stored exclusively)
              ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
            );
          };

          # ~ write dependencies ~
          unless ($inline->dependencies->empty) {
            $inline->dependencies->to_zip(
              $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
              $text_id_esc,
              3 # = dependency serialization
            );
          };


          # reinit.
          $dir = '';

          next MAIN;
        };


        # ~ whitespace handling ~

        # Fix whitespaces (see notes on whitespace fixing)

        # TODO:
        #   Maybe it's best, to keep the stripping of whitespace and
        #   to just remove the if-clause and to insert a blank by default
        #   (with possibly an option on how newlines in primary text should
        #   be handled (stripped or replaced by a whitespace)).

        # Remove consecutive whitespace at beginning and end (mostly one newline)
        s/^\s+//;
        s/\s+$//;

        # NOTE:
        #   this is only relevant, if a text consists of more than one line

        # TODO:
        #   find a better solution, or create a warning, if a text has more
        #   than one line ($text_line > 1)

        # TODO:
        #   do testing with 2 different corpora
        #   (one with only one-line texts, the other with several lines per text)

        # Check if the buffer currently ends inside an open XML tag
        # (last '<' is after last '>'), meaning this line is a continuation of
        # a multi-line element (e.g. attributes split across lines like <ref>).
        # A space must be prepended to avoid "attributes construct error" in the
        # XML parser when two attribute tokens are concatenated without separator.
        my $in_open_tag = ($text_buffer ne '' &&
                             rindex($text_buffer, '<') > rindex($text_buffer, '>'));

        # line contains at least one non-tag character
        if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

          # Increment counter for text lines
          $text_line++;

          # insert blank before 1st character
          # (for 2nd line and consecutive lines, or when continuing an open tag)
          $_ = ' ' . $_ if $text_line > 1 || $in_open_tag;
        }

        # Line is purely within an open tag (attribute continuation):
        # prepend a space so attributes are properly separated.
        elsif ($in_open_tag) {
          $_ = ' ' . $_;
        }

        # add line to buffer
        $text_buffer .= $_;
      };
    }

    # Text id given as xml:id attribute of the TEI element
    elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
      my $leadin = $1;
      my $id = $3;
      my $sigle = $id;

      # Convert the id with the --xmlid-to-textsigle rule, e.g.
      #   s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
      if ($what) {

        # NOTE(review): the replacement comes from the command line and
        # has to be compiled as code so that $1, $2, ... are expanded;
        # this string eval must never be fed with untrusted input.
        #
        # The pattern is referenced as variable rather than pasted into
        # the code, so it may contain any character (including '|').
        # The substitution works on $sigle, so $_ keeps the input line
        # for the "line=" diagnostics below.
        eval "\$sigle =~ s{\$what}{$with}; 1"
          or $log->warn("Unable to convert text id `$id' with --xmlid-to-textsigle: $@");

        $log->debug("Converted text id `$id' to sigle `$sigle'");
      };
      $sigle =~ s/\./-/g;

      # A sigle consists of corpus, document and text part
      my @parts = split(/[\/_]/, $sigle);
      if (@parts != 3) {
        die $log->fatal(
          "input line number $.: " .
            "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
            "=> Aborting (line=$_)");
      };

      $dir = join("/", @parts);
      $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
      $log->notice("$0: text_id=$text_id_esc");

      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      };
    }

    # Start of header section
    elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
      my ($leadin, $content) = ($1, "$2\n");

      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
            'line with opening header tag is not in expected format ... ' .
            "=> Aborting (line=$_)");
      };

      # Parse header
      my $header = KorAP::XML::TEI::Header->new(
        $content,
        $input_enc,
        $text_id_esc // $auto_textsigle
      )->parse($input_fh);

      if ($auto_textsigle) {
        $auto_textsigle = increase_auto_textsigle($auto_textsigle);
        $log->debug("Auto-incremented text sigle to $auto_textsigle");
      };

      # Header was parseable
      if ($header) {

        # Write header to zip
        my $file = $header->dir . '/' . $header_file . '.xml';

        $log->debug("Writing file $file") if DEBUG;

        $header->to_zip($zipper->new_stream($file));

        # Header is for text level
        if ($header->type eq 'text') {

          # Remember dir and sigles
          $dir = $header->dir;
          $text_id_esc = $header->id_esc;

          # log output for seeing progression
          $log->notice("$0: text_id=$text_id_esc");

          # Reset counter for text lines
          # (needed for whitespace handling)
          $text_line = 0;
        };
      };
    };
  };
  $text_id_esc = $auto_textsigle if ($auto_textsigle);

  # Final state of the progress bar for this input
  if ($p) {
    print {$progress_fh} $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
  };

  # Advance to the next trailing input file, if there is one.
  # A file that cannot be opened aborts the run just like the first
  # input file does, instead of silently ending the conversion, and
  # every input handle is put into binary mode, not only the first.
  $input_fname = shift(@ARGV);
  last INPUT unless defined $input_fname && $input_fname ne '';

  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
  binmode $input_fh;
};

$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;
566
Peter Harders6f526a32020-06-29 21:44:41 +0200567
Akrond949e182020-02-14 12:23:57 +0100568__END__
569
570=pod
571
572=encoding utf8
573
574=head1 NAME
575
576tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
577
578=head1 SYNOPSIS
579
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200580 cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip
581 tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akrond949e182020-02-14 12:23:57 +0100582
583=head1 DESCRIPTION
584
Akronee434b12020-07-08 12:53:01 +0200585C<tei2korapxml> is a script to convert TEI P5 and
Akrond72baca2021-07-23 13:25:32 +0200586L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akronee434b12020-07-08 12:53:01 +0200587based documents to the
588L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders6f526a32020-06-29 21:44:41 +0200589
Akrond949e182020-02-14 12:23:57 +0100590This program is usually called from inside another script.
591
Akronee434b12020-07-08 12:53:01 +0200592=head1 FORMATS
593
594=head2 Input restrictions
595
596=over 2
597
598=item
599
Akronee434b12020-07-08 12:53:01 +0200600TEI P5 formatted input with certain restrictions:
601
602=over 4
603
604=item
605
Akrone48bec42023-01-05 12:18:45 +0100606B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
Akronee434b12020-07-08 12:53:01 +0200608
609=item
610
611B<optional>: corp-header with integrated corpsigle,
612doc-header with integrated docsigle
613
614=back
615
616=item
617
Akron0c41ab32020-09-29 07:33:33 +0200618All tokens inside the primary text may not be
newline separated, because newlines are removed
Akron0c41ab32020-09-29 07:33:33 +0200620(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akronee434b12020-07-08 12:53:01 +0200621into blanks between 2 tokens could lead to additional blanks,
622where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
Akron8a0c4bf2021-03-16 16:51:21 +0100624(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akronee434b12020-07-08 12:53:01 +0200625
Akron940ca6f2021-10-11 12:38:39 +0200626=item
627
628Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
629need to be defined in the same line as the header tag.
630
Akronee434b12020-07-08 12:53:01 +0200631=back
632
633=head2 Notes on the output
634
635=over 2
636
637=item
638
639zip file output (default on C<stdout>) with utf8 encoded entries
640(which together form the KorAP-XML format)
641
642=back
643
Akrond949e182020-02-14 12:23:57 +0100644=head1 INSTALLATION
645
Akrond26319b2023-01-12 15:34:41 +0100646C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietze83a4e92021-03-16 20:51:26 +0100647When these requirements are met, the preferred way to install the script is
Akrond949e182020-02-14 12:23:57 +0100648to use L<cpanm|App::cpanminus>.
649
650 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
651
652In case everything went well, the C<tei2korapxml> tool will
653be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200654
Akrond949e182020-02-14 12:23:57 +0100655Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
656
657=head1 OPTIONS
658
659=over 2
660
Akrona2cb2812021-10-30 10:29:08 +0200661=item B<--input|-i>
662
663The input file to process. If no specific input is defined and a single
664dash C<-> is passed as an argument, data is read from C<STDIN>.
665
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200666Instead of using C<-i> input files can also be defined as trailing arguments
667to the command:
668
669 tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
670
Akron132bdeb2024-06-06 14:28:56 +0200671=item B<--output|-o>
672
673The output zip file to be created. If no specific output is defined,
674data is written to C<STDOUT>.
Akrona2cb2812021-10-30 10:29:08 +0200675
Akron4e603a52020-07-27 14:23:49 +0200676=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100677
Akron4e603a52020-07-27 14:23:49 +0200678The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100679
680=item B<--help|-h>
681
682Print help information.
683
684=item B<--version|-v>
685
686Print version information.
687
Akrone48bec42023-01-05 12:18:45 +0100688=item B<--tokenizer-korap|-tk>
Akron2520a342022-03-29 18:18:05 +0200689
Akrone48bec42023-01-05 12:18:45 +0100690Use the standard KorAP/DeReKo tokenizer.
691
692=item B<--tokenizer-internal|-ti>
693
694Tokenize the data using two embedded tokenizers,
695that will take an I<aggressive> and a I<conservative>
696approach.
Akron2520a342022-03-29 18:18:05 +0200697
Akron4e603a52020-07-27 14:23:49 +0200698=item B<--tokenizer-call|-tc>
699
700Call an external tokenizer process, that will tokenize
Akron11484782021-11-03 20:12:14 +0100701from STDIN and outputs the offsets of all tokens.
702
703Texts are separated using C<\x04\n>. The external process
704should add a new line per text.
705
706If the L</--use-tokenizer-sentence-splits> option is activated,
707sentences are marked by offset as well in new lines.
708
709To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
711
712 $ cat corpus.i5.xml | tei2korapxml -s \
713 $ -tc 'datok tokenize \
714 $ -t ./tokenizer.matok \
715 $ -p --newline-after-eot --no-sentences \
716 $ --no-tokens --sentence-positions -' - \
717 $ > corpus.korapxml.zip
Akron4e603a52020-07-27 14:23:49 +0200718
Akronb93fabb2023-01-13 12:05:44 +0100719=item B<--no-tokenizer>
720
721Boolean flag indicating that no tokenizer should be used.
722This is meant to ensure that by default a final token layer always
723exists.
724If a separate tokenizer is chosen, this flag is ignored.
725
Akron75d63142021-02-23 18:40:56 +0100726=item B<--skip-inline-tokens>
727
728Boolean flag indicating that inline tokens should not
729be processed. Defaults to false (meaning inline tokens will be processed).
730
Akron692d17d2021-03-05 13:21:03 +0100731=item B<--skip-inline-token-annotations>
732
733Boolean flag indicating that inline token annotations should not
734be processed. Defaults to true (meaning inline token annotations
Akron6b1f26b2024-09-19 11:35:32 +0200735won't be processed). Can be negated with
736C<--no-skip-inline-token-annotations>.
Akron692d17d2021-03-05 13:21:03 +0100737
Akronca70a1d2021-02-25 16:21:31 +0100738=item B<--skip-inline-tags> <tags>
Akron54c3ff12021-02-25 11:33:37 +0100739
740Expects a comma-separated list of tags to be ignored when the structure
741is parsed. Content of these tags however will be processed.
742
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +0200743=item B<--auto-textsigle> <textsigle>
744
745Expects a text sigle that serves as a fallback if no text sigles
746are given in the input data.
747The auto text sigle will be incremented for each text processed.
748
749Example:
750
751 tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
752 < data.i5.xml > korapxml.zip
753
Marc Kupietza671ae52022-12-22 16:28:14 +0100754=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
755
Akrone48bec42023-01-05 12:18:45 +0100756Expects a regular replacement expression (separated by B<@> between the
Marc Kupietza671ae52022-12-22 16:28:14 +0100757search and the replacement) to convert text id attributes to text sigles
758with three parts (separated by B</>).
759
760Example:
761
762 tei2korapxml \
763 --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
764 -tk - < t/data/icc_german_sample.p5.xml
765
Akrone48bec42023-01-05 12:18:45 +0100766Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
767sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietza671ae52022-12-22 16:28:14 +0100768
Akron1a5271a2021-02-18 13:18:15 +0100769=item B<--inline-tokens> <foundry>#[<file>]
770
771Define the foundry and file (without extension)
772to store inline token information in.
Akron8a0c4bf2021-03-16 16:51:21 +0100773Unless C<--skip-inline-token-annotations> is set,
774this will contain annotations as well.
Akron1a5271a2021-02-18 13:18:15 +0100775Defaults to C<tokens> and C<morpho>.
776
Akrone2819a12021-10-12 15:52:55 +0200777The inline token data will also be stored in the
778inline structures file (see I<--inline-structures>),
779unless the inline token foundry is prepended
780by an B<!> exclamation mark, indicating that inline
781tokens are stored exclusively in the inline tokens
782file.
783
784Example:
785
Akron6b1f26b2024-09-19 11:35:32 +0200786 tei2korapxml --no-tokenizer --inline-tokens \
787 '!gingko#morpho' < data.i5.xml > korapxml.zip
788
789=item B<--inline-dependencies> <foundry>#[<file>]
790
791Define the foundry and file (without extension)
792to store inline dependency information in.
793Defaults to the layer of C<dependency> and
794will be ignored if not set (which means, dependency
795attributes will be stored in the inline tokens file,
796if not skipped).
797
798The dependency data will also be stored in the
799inline token file (see I<--inline-tokens>),
800unless the inline dependencies foundry is prepended
801by an B<!> exclamation mark, indicating that inline
802dependency data is stored exclusively in the inline
803dependencies file.
804
805Example:
806
807 tei2korapxml --no-tokenizer --inline-dependencies \
808 'gingko#dependency' < data.i5.xml > korapxml.zip
809
Akrone2819a12021-10-12 15:52:55 +0200810
Akrondd0be8f2021-02-18 19:29:41 +0100811=item B<--inline-structures> <foundry>#[<file>]
812
813Define the foundry and file (without extension)
814to store inline structure information in.
815Defaults to C<struct> and C<structures>.
816
Akron26a71522021-02-19 10:27:37 +0100817=item B<--base-foundry> <foundry>
818
819Define the base foundry to store newly generated
820token information in.
821Defaults to C<base>.
822
823=item B<--data-file> <file>
824
825Define the file (without extension)
826to store primary data information in.
827Defaults to C<data>.
828
829=item B<--header-file> <file>
830
831Define the file name (without extension)
832to store header information on
833the corpus, document, and text level in.
834Defaults to C<header>.
835
Marc Kupietz985da0c2021-02-15 19:29:50 +0100836=item B<--use-tokenizer-sentence-splits|-s>
837
838Replace existing sentence boundary information with, or add, the boundaries
Akron11484782021-11-03 20:12:14 +0100839provided by the tokenizer.
840Currently KorAP-tokenizer and certain external tokenizers support
841these boundaries.
Marc Kupietz985da0c2021-02-15 19:29:50 +0100842
Akron91705d72021-02-19 10:59:45 +0100843=item B<--tokens-file> <file>
844
845Define the file (without extension)
846to store generated token information in
847(either from the KorAP tokenizer or an externally called tokenizer).
848Defaults to C<tokens>.
849
Akron3378dfd2020-08-01 15:01:36 +0200850=item B<--log|-l>
851
852Loglevel for I<Log::Any>. Defaults to C<notice>.
853
Akrond949e182020-02-14 12:23:57 +0100854=back
855
Akronb3649472020-09-29 08:24:46 +0200856=head1 ENVIRONMENT VARIABLES
857
858=over 2
859
860=item B<KORAPXMLTEI_DEBUG>
861
862Activate minimal debugging.
863Defaults to C<false>.
864
Marc Kupietzd254f5c2025-04-16 10:37:08 +0200865=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
866
867Set the heap size for the tokenizer process.
868Defaults to C<512m>.
869
Akronb3649472020-09-29 08:24:46 +0200870=back
871
Akrond949e182020-02-14 12:23:57 +0100872=head1 COPYRIGHT AND LICENSE
873
Marc Kupietzb6fd6bc2025-04-16 12:47:26 +0200874Copyright (C) 2021-2025, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100875
876Author: Peter Harders
877
Akronaabd0952020-09-29 07:35:08 +0200878Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100879
880L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
881Corpus Analysis Platform at the
Akrond72baca2021-07-23 13:25:32 +0200882L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akrond949e182020-02-14 12:23:57 +0100883member of the
884L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
885
886This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100887L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100888
889=cut
Akronf8088e62021-02-18 16:18:59 +0100890
891# NOTES
892
Akronf8088e62021-02-18 16:18:59 +0100893## Notes on segfault prevention
894
Akron91577922021-02-19 10:32:54 +0100895binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100896(see notes on 'PerlIO layers' in 'man XML::LibXML'),
897removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
898see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
899see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.