blob: 418408e2b8253ea0badd5192da8c1527bce9fd55 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Akron4f67cd42020-07-02 12:27:58 +020014use FindBin;
15BEGIN {
16 unshift @INC, "$FindBin::Bin/../lib";
17};
18
Marc Kupietz8a954e52021-02-16 22:03:07 +010019use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020020use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020021use KorAP::XML::TEI::Tokenizer::Conservative;
22use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron85717512020-07-08 11:19:19 +020023use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020024use KorAP::XML::TEI::Header;
Akroneb12e232021-02-25 13:49:50 +010025use KorAP::XML::TEI::Inline;
Peter Hardersd892a582020-02-12 15:45:22 +010026
# Version of this script; compared against --required-version and the
# version of the KorAP tokenizer wrapper.
our $VERSION = '2.6.0';

# Banner shown together with the usage, help and version output
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set the environment variable KORAPXMLTEI_DEBUG to a true value for
# minimal additional debug output (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# The environment switch is still respected (it changes the default of
# --skip-inline-token-annotations), but users are pointed to the option.
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};
39
# Inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;

# Inline dependencies won't be stored in the tokens file
my $inline_deps_exclusive = 0;

# Parse options from the command line.
# All option variables are declared in place so that their defaults are
# visible next to the option name. GetOptions returns false if it detected
# unknown or malformed options; in that case the usage is printed to STDERR
# (STDOUT may carry the zip stream) and the script exits with status 2
# instead of silently converting with unintended settings.
GetOptions(
  'root|r=s'                        => \(my $root_dir = '.'),
  'input|i=s'                       => \(my $input_fname = ''),
  'output|o=s'                      => \(my $output_fname = ''),
  'tokenizer-call|tc=s'             => \(my $tokenizer_call),
  'tokenizer-korap|tk'              => \(my $tokenizer_korap),
  'tokenizer-internal|ti'           => \(my $tokenizer_intern),
  'no-tokenizer'                    => \(my $no_tokenizer),
  'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
  'inline-tokens=s'                 => \(my $inline_tokens = 'tokens#morpho'),
  'inline-structures=s'             => \(my $inline_structures = 'struct#structure'),
  'inline-dependencies=s'           => \(my $inline_dependencies),
  'skip-inline-tokens'              => \(my $skip_inline_tokens = 0),

  # Negatable; the deprecated KORAPXMLTEI_INLINE switch only changes the default
  'skip-inline-token-annotations!'  => \(
    my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
  'skip-inline-tags=s'              => \(my $skip_inline_tags_str = ''),
  'base-foundry=s'                  => \(my $base_dir = 'base'),
  'data-file=s'                     => \(my $data_file = 'data'),
  'header-file=s'                   => \(my $header_file = 'header'),
  'tokens-file=s'                   => \(my $tokens_file = 'tokens'),
  'xmlid-to-textsigle|x=s'          => \(my $xmlid_to_textsigle = ''),
  'log|l=s'                         => \(my $log_level = 'notice'),
  'required-version|rv=s'           => \(my $required_version),

  # A lone dash: read the input from STDIN
  ''                                => \(my $stdio),
  'help|h' => sub {
    pod2usage(
      -verbose  => 99,
      -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
      -msg      => $VERSION_MSG,
      -output   => '-'
    )
  },
  'version|v' => sub {
    pod2usage(
      -verbose => 0,
      -msg     => $VERSION_MSG,
      -output  => '-'
    );
  }
) or pod2usage(
  -verbose => 0,
  -msg     => $VERSION_MSG,
  -exitval => 2,
  -output  => \*STDERR
);
87
Akrond3e1d282021-02-24 14:51:27 +010088
# Establish logger (log messages go to STDERR, encoded as UTF-8)
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# Abort if the caller requires another version of this script.
if ($required_version) {

  # Capture through a list assignment: the capture variables ($1, ...)
  # keep their previous value when a match fails, so they must not be
  # read without checking that the match succeeded.
  my ($wanted) = $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;

  if (!defined $wanted || $wanted ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};
102
103
# Search pattern ($what) and replacement ($with) used to convert xml:id
# attributes of <TEI> elements to text sigles (--xmlid-to-textsigle).
# $what stays undefined if the option was not passed.
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
  ($what, $with) = split('@', $xmlid_to_textsigle);

  # split drops trailing empty fields, so an empty (or missing) part
  # arrives as undef; normalize to the empty string to avoid
  # "uninitialized value" warnings when the parts are used later.
  $what //= '';
  $with //= '';

  $what = qr!$what!;
};
109
# Name of the tag (without attributes) that encloses the primary text
my $_TEXT_BODY = 'text';

# Optional: set of inline tags to be skipped, built from the
# comma-separated value of --skip-inline-tags (tag name => 1)
my %skip_inline_tags = $skip_inline_tags_str
  ? (map { ($_ => 1) } split(/\s*,\s*/, $skip_inline_tags_str))
  : ();
121
# Tokenizer producing the base token layer (external process or KorAP
# tokenizer); stays undefined if none of them was requested.
my $ext_tok;

# External tokenization
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The KorAP tokenizer wrapper is only loaded on demand. Report a
  # failing load as such - otherwise the problem would surface as a
  # misleading version mismatch with an empty version.
  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
    $log->error('KorAP-Tokenizer could not be loaded: ' . ($@ || 'unknown error'));
    exit(1);
  };

  # The wrapper has to be released in lockstep with this script
  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // 'unknown';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# Neither a tokenizer nor --no-tokenizer was chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};

# Sentence boundaries come from the tokenizer, so inline <s> tags are skipped
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};

# Internal tokenization (both tokenizers are applied with --tokenizer-internal)
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akrond3e1d282021-02-24 14:51:27 +0100158
Peter Harders41c35622020-07-12 01:16:22 +0200159
# Split an option value of the form '<foundry>#[<file>]' into its parts.
#
# Parameters:
#   $spec         - the option value, e.g. 'struct#structure', 'struct#'
#                   or just 'struct'
#   $default_file - file name (without extension) used if the value does
#                   not contain a non-empty file part
#
# Returns the list ($foundry, $file). A trailing '#' without a file name
# falls back to the default instead of producing an empty file name.
sub _split_foundry_spec {
  my ($spec, $default_file) = @_;

  my ($foundry, $file) = split '#', ($spec // '');
  $foundry //= '';
  $file = $default_file unless defined $file && length $file;

  return ($foundry, $file);
}

# Name of the directory and the file containing all inline structure information
# except for the inline token information
my ($_structure_dir, $_structure_file) = _split_foundry_spec($inline_structures, 'structure');

# Name of the directory and the file containing all inline token information
my ($_tokens_dir, $_tokens_file) = _split_foundry_spec($inline_tokens, 'morpho');

# A leading '!' marks the inline tokens as exclusive,
# i.e. they won't be stored in the structure file as well
if (index($_tokens_dir, '!') == 0) {
  $_tokens_dir = substr($_tokens_dir, 1);
  $inline_tokens_exclusive = 1;
};


# Name of the directory and the file containing all inline dependency
# information (only set if --inline-dependencies was passed)
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = _split_foundry_spec($inline_dependencies, 'dependency');

  # From here on the option value only serves as a flag for the inline parser
  $inline_dependencies = 1;

  # A leading '!' marks the dependencies as exclusive,
  # i.e. they won't be stored in the tokens file as well
  if (index($_dep_dir, '!') == 0) {
    $_dep_dir = substr($_dep_dir, 1);
    $inline_deps_exclusive = 1;
  };
};
184
185
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);

# State shared by the main loop:
#   $dir         - text directory (below $root_dir)
#   $text_id_esc - escaped version of the text id
#   $input_enc   - encoding of the input (default, may be overridden
#                  by the XML declaration)
#   $text_line   - counter of text lines (needed for whitespace handling)
my $dir = '';
my $text_id_esc;
my $input_enc = 'UTF-8';
my $text_line = 0;


# No input to process: neither a single dash nor --input was passed
unless ($stdio || $input_fname ne '') {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
  exit;
};

# Input file handle
my $input_fh;

# Single dash was set: read from STDIN
if ($stdio) {
  $input_fh = *STDIN;
}

# Input flag was passed: read from the given file
else {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
};

# Read raw bytes (decoding happens line by line in the main loop);
# prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;


# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
  $skip_inline_tokens,
  \%skip_inline_tags,
  $inline_tokens_exclusive,
  $inline_dependencies
);
239
240
# Reading input document line by line. Three kinds of lines trigger
# processing: the opening tag of a text body, a <TEI> element carrying an
# xml:id, and the opening tag of a header (idsHeader or teiHeader).
MAIN: while (<$input_fh>) {

  # Remove (multi-line) XML comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding from the XML declaration.
  # NOTE: a backreference can't be used inside a character class
  # ("[^\1]" excludes the character \x01), so the value is matched
  # non-greedy up to the closing quote instead.
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/s) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
    my ($prefix, $suffix) = ($1, $2);

    # The opening tag has to be on a line of its own
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
                        "line with opening text-body tag '${_TEXT_BODY}' " .
                        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by the inline parser
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

        # Write data.xml, structure.xml and possibly morpho.xml
        # and/or tokenization files

        # The closing tag has to be on a line of its own
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
                            "line with closing text-body tag '${_TEXT_BODY}'".
                            " contains additional information ... => Aborting (line=$_)");
        };

        # No text header (or id) was seen for this text body
        if ($dir eq '') {

          # Show the beginning of the skipped text. The inline parser
          # has not seen this text yet, so its data would still refer
          # to the previously parsed text.
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($text_buffer, 0, 200)
            );
          next MAIN;
        };

        # Parse inline structure
        $inline->parse($text_id_esc, \$text_buffer);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        my $data = $inline->data;

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($inline->structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        unless ($inline->structures->empty) {
          $inline->structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $inline->tokens->empty) {
          $inline->tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            # Serialization mode:
            #   0 = tokens without inline annotations
            #   1 = tokens with inline annotations
            #   4 = used if dependencies are stored exclusively in their
            #       own file (presumably annotations without dependency
            #       information - TODO confirm against to_zip)
            ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
          );
        };

        # ~ write dependencies ~
        unless ($inline->dependencies->empty) {
          $inline->dependencies->to_zip(
            $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
            $text_id_esc,
            3 # = dependency serialization
          );
        };


        # reinit.
        $dir = '';

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one non-tag character
      if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };
  }

  # <TEI> element with an xml:id, from which the text sigle is derived
  elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
    my $leadin = $1;
    my $id     = $3;
    my $sigle  = $id;

    # Convert the id with the expression passed by --xmlid-to-textsigle,
    # e.g. s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@
    if ($what) {

      # The replacement has to be compiled as code, so that references
      # like $1 are expanded. The pattern is referenced as a variable
      # instead of being pasted into the code, so that characters like
      # '|' or '$' in the pattern can't break the expression. The
      # substitution works on a copy, so $_ still holds the input line.
      # NOTE(security): $with is evaluated as part of a Perl replacement
      # and must only come from a trusted command line.
      unless (eval "\$sigle =~ s|\$what|$with|; 1") {
        die $log->fatal(
          "input line number $.: " .
            "conversion of text id `$id' with --xmlid-to-textsigle failed: $@ " .
            "=> Aborting (line=$_)");
      };

      $log->debug("Converted text id `$id' to sigle `$sigle'");
    };
    $sigle =~ s/\./-/g;

    my @parts = split(/[\/_]/, $sigle);
    if (@parts != 3) {
      die $log->fatal(
        "input line number $.: " .
          "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
          "=> Aborting (line=$_)");
    };

    # Remember dir and sigle
    $dir = join("/", @parts);
    $text_id_esc = "$parts[0]/$parts[1].$parts[2]";

    # log output for seeing progression
    $log->notice("$0: text_id=$text_id_esc");

    # The opening tag has to be the first information in the line
    if ($leadin !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };
  }

  # Start of header section
  elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
    my ($leadin, $content) = ($1, "$2\n");

    # The opening tag has to be the first information in the line
    if ($leadin !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
    };

    # Parse header (reads the remaining header lines from the input)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir         = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};

# Finish the zip archive, stop the tokenizer (if any) and release the input
$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;
498
Peter Harders6f526a32020-06-29 21:44:41 +0200499
Akrond949e182020-02-14 12:23:57 +0100500__END__
501
502=pod
503
504=encoding utf8
505
506=head1 NAME
507
508tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
509
510=head1 SYNOPSIS
511
Akrona2cb2812021-10-30 10:29:08 +0200512 cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
Akrond949e182020-02-14 12:23:57 +0100513
514=head1 DESCRIPTION
515
Akronee434b12020-07-08 12:53:01 +0200516C<tei2korapxml> is a script to convert TEI P5 and
Akrond72baca2021-07-23 13:25:32 +0200517L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akronee434b12020-07-08 12:53:01 +0200518based documents to the
519L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders6f526a32020-06-29 21:44:41 +0200520
Akrond949e182020-02-14 12:23:57 +0100521This program is usually called from inside another script.
522
Akronee434b12020-07-08 12:53:01 +0200523=head1 FORMATS
524
525=head2 Input restrictions
526
527=over 2
528
529=item
530
Akronee434b12020-07-08 12:53:01 +0200531TEI P5 formatted input with certain restrictions:
532
533=over 4
534
535=item
536
Akrone48bec42023-01-05 12:18:45 +0100537B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
Akronee434b12020-07-08 12:53:01 +0200539
540=item
541
542B<optional>: corp-header with integrated corpsigle,
543doc-header with integrated docsigle
544
545=back
546
547=item
548
The tokens inside the primary text must not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between two tokens could introduce additional blanks
where there should be none (e.g. punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
See also the code section C<~ whitespace handling ~> in C<script/tei2korapxml>.
Akronee434b12020-07-08 12:53:01 +0200556
Akron940ca6f2021-10-11 12:38:39 +0200557=item
558
559Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
560need to be defined in the same line as the header tag.
561
Akronee434b12020-07-08 12:53:01 +0200562=back
563
564=head2 Notes on the output
565
566=over 2
567
568=item
569
570zip file output (default on C<stdout>) with utf8 encoded entries
571(which together form the KorAP-XML format)
572
573=back
574
Akrond949e182020-02-14 12:23:57 +0100575=head1 INSTALLATION
576
Akrond26319b2023-01-12 15:34:41 +0100577C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietze83a4e92021-03-16 20:51:26 +0100578When these requirements are met, the preferred way to install the script is
Akrond949e182020-02-14 12:23:57 +0100579to use L<cpanm|App::cpanminus>.
580
581 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
582
583In case everything went well, the C<tei2korapxml> tool will
584be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200585
Akrond949e182020-02-14 12:23:57 +0100586Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
587
588=head1 OPTIONS
589
590=over 2
591
Akrona2cb2812021-10-30 10:29:08 +0200592=item B<--input|-i>
593
594The input file to process. If no specific input is defined and a single
595dash C<-> is passed as an argument, data is read from C<STDIN>.
596
Akron132bdeb2024-06-06 14:28:56 +0200597=item B<--output|-o>
598
599The output zip file to be created. If no specific output is defined,
600data is written to C<STDOUT>.
Akrona2cb2812021-10-30 10:29:08 +0200601
Akron4e603a52020-07-27 14:23:49 +0200602=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100603
Akron4e603a52020-07-27 14:23:49 +0200604The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100605
606=item B<--help|-h>
607
608Print help information.
609
610=item B<--version|-v>
611
612Print version information.
613
Akrone48bec42023-01-05 12:18:45 +0100614=item B<--tokenizer-korap|-tk>
Akron2520a342022-03-29 18:18:05 +0200615
Akrone48bec42023-01-05 12:18:45 +0100616Use the standard KorAP/DeReKo tokenizer.
617
618=item B<--tokenizer-internal|-ti>
619
620Tokenize the data using two embedded tokenizers,
621that will take an I<aggressive> and a I<conservative>
622approach.
Akron2520a342022-03-29 18:18:05 +0200623
Akron4e603a52020-07-27 14:23:49 +0200624=item B<--tokenizer-call|-tc>
625
Call an external tokenizer process that reads the text
from STDIN and outputs the offsets of all tokens.
628
629Texts are separated using C<\x04\n>. The external process
630should add a new line per text.
631
632If the L</--use-tokenizer-sentence-splits> option is activated,
633sentences are marked by offset as well in new lines.
634
To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:

  $ cat corpus.i5.xml | tei2korapxml -s \
      -tc 'datok tokenize \
      -t ./tokenizer.matok \
      -p --newline-after-eot --no-sentences \
      --no-tokens --sentence-positions -' - \
      > corpus.korapxml.zip
Akron4e603a52020-07-27 14:23:49 +0200644
Akronb93fabb2023-01-13 12:05:44 +0100645=item B<--no-tokenizer>
646
647Boolean flag indicating that no tokenizer should be used.
648This is meant to ensure that by default a final token layer always
649exists.
650If a separate tokenizer is chosen, this flag is ignored.
651
Akron75d63142021-02-23 18:40:56 +0100652=item B<--skip-inline-tokens>
653
654Boolean flag indicating that inline tokens should not
655be processed. Defaults to false (meaning inline tokens will be processed).
656
Akron692d17d2021-03-05 13:21:03 +0100657=item B<--skip-inline-token-annotations>
658
659Boolean flag indicating that inline token annotations should not
660be processed. Defaults to true (meaning inline token annotations
Akron6b1f26b2024-09-19 11:35:32 +0200661won't be processed). Can be negated with
662C<--no-skip-inline-token-annotations>.
Akron692d17d2021-03-05 13:21:03 +0100663
Akronca70a1d2021-02-25 16:21:31 +0100664=item B<--skip-inline-tags> <tags>
Akron54c3ff12021-02-25 11:33:37 +0100665
666Expects a comma-separated list of tags to be ignored when the structure
667is parsed. Content of these tags however will be processed.
668
Marc Kupietza671ae52022-12-22 16:28:14 +0100669=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
670
Akrone48bec42023-01-05 12:18:45 +0100671Expects a regular replacement expression (separated by B<@> between the
Marc Kupietza671ae52022-12-22 16:28:14 +0100672search and the replacement) to convert text id attributes to text sigles
673with three parts (separated by B</>).
674
675Example:
676
677 tei2korapxml \
678 --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
679 -tk - < t/data/icc_german_sample.p5.xml
680
Akrone48bec42023-01-05 12:18:45 +0100681Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
682sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietza671ae52022-12-22 16:28:14 +0100683
Akron1a5271a2021-02-18 13:18:15 +0100684=item B<--inline-tokens> <foundry>#[<file>]
685
686Define the foundry and file (without extension)
687to store inline token information in.
Akron8a0c4bf2021-03-16 16:51:21 +0100688Unless C<--skip-inline-token-annotations> is set,
689this will contain annotations as well.
Akron1a5271a2021-02-18 13:18:15 +0100690Defaults to C<tokens> and C<morpho>.
691
Akrone2819a12021-10-12 15:52:55 +0200692The inline token data will also be stored in the
693inline structures file (see I<--inline-structures>),
694unless the inline token foundry is prepended
695by an B<!> exclamation mark, indicating that inline
696tokens are stored exclusively in the inline tokens
697file.
698
699Example:
700
Akron6b1f26b2024-09-19 11:35:32 +0200701 tei2korapxml --no-tokenizer --inline-tokens \
702 '!gingko#morpho' < data.i5.xml > korapxml.zip
703
704=item B<--inline-dependencies> <foundry>#[<file>]
705
706Define the foundry and file (without extension)
707to store inline dependency information in.
708Defaults to the layer of C<dependency> and
709will be ignored if not set (which means, dependency
710attributes will be stored in the inline tokens file,
711if not skipped).
712
713The dependency data will also be stored in the
714inline token file (see I<--inline-tokens>),
715unless the inline dependencies foundry is prepended
716by an B<!> exclamation mark, indicating that inline
717dependency data is stored exclusively in the inline
718dependencies file.
719
720Example:
721
722 tei2korapxml --no-tokenizer --inline-dependencies \
723 'gingko#dependency' < data.i5.xml > korapxml.zip
724
Akrone2819a12021-10-12 15:52:55 +0200725
Akrondd0be8f2021-02-18 19:29:41 +0100726=item B<--inline-structures> <foundry>#[<file>]
727
728Define the foundry and file (without extension)
729to store inline structure information in.
Defaults to C<struct> and C<structure>.
731
Akron26a71522021-02-19 10:27:37 +0100732=item B<--base-foundry> <foundry>
733
734Define the base foundry to store newly generated
735token information in.
736Defaults to C<base>.
737
738=item B<--data-file> <file>
739
740Define the file (without extension)
741to store primary data information in.
742Defaults to C<data>.
743
744=item B<--header-file> <file>
745
746Define the file name (without extension)
747to store header information on
748the corpus, document, and text level in.
749Defaults to C<header>.
750
Marc Kupietz985da0c2021-02-15 19:29:50 +0100751=item B<--use-tokenizer-sentence-splits|-s>
752
753Replace existing with, or add new, sentence boundary information
Akron11484782021-11-03 20:12:14 +0100754provided by the tokenizer.
755Currently KorAP-tokenizer and certain external tokenizers support
756these boundaries.
Marc Kupietz985da0c2021-02-15 19:29:50 +0100757
Akron91705d72021-02-19 10:59:45 +0100758=item B<--tokens-file> <file>
759
760Define the file (without extension)
761to store generated token information in
762(either from the KorAP tokenizer or an externally called tokenizer).
763Defaults to C<tokens>.
764
Akron3378dfd2020-08-01 15:01:36 +0200765=item B<--log|-l>
766
767Loglevel for I<Log::Any>. Defaults to C<notice>.
768
Akrond949e182020-02-14 12:23:57 +0100769=back
770
Akronb3649472020-09-29 08:24:46 +0200771=head1 ENVIRONMENT VARIABLES
772
773=over 2
774
775=item B<KORAPXMLTEI_DEBUG>
776
777Activate minimal debugging.
778Defaults to C<false>.
779
Akronb3649472020-09-29 08:24:46 +0200780=back
781
Akrond949e182020-02-14 12:23:57 +0100782=head1 COPYRIGHT AND LICENSE
783
Akron6b1f26b2024-09-19 11:35:32 +0200784Copyright (C) 2021-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100785
786Author: Peter Harders
787
Akronaabd0952020-09-29 07:35:08 +0200788Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100789
790L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
791Corpus Analysis Platform at the
Akrond72baca2021-07-23 13:25:32 +0200792L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akrond949e182020-02-14 12:23:57 +0100793member of the
794L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
795
796This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100797L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100798
799=cut
Akronf8088e62021-02-18 16:18:59 +0100800
801# NOTES
802
Akronf8088e62021-02-18 16:18:59 +0100803## Notes on segfault prevention
804
binmode on the input handle prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
(see notes on 'PerlIO layers' in 'man XML::LibXML').
Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on the input is more granular.
See perluniintro: "You can switch encodings on an already opened stream by using binmode()".
See perlfunc: "If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data."