blob: 86f7527ab4c31885871216b5d968b98c45405577 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);

use Encode qw(decode);

use FindBin;

# Put the lib directory next to this script's directory first on @INC,
# so the KorAP::XML::TEI modules loaded below are searched there first.
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

# All imports from KorAP::XML::TEI happen after the @INC adjustment.
# Loading the module before the BEGIN block would resolve it against
# the unmodified @INC (and fail, if it is only available in ../lib).
use KorAP::XML::TEI qw!remove_xml_comments replace_entities increase_auto_textsigle!;
Akron8b511f92020-07-09 17:28:08 +020021use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020022use KorAP::XML::TEI::Tokenizer::Conservative;
23use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron85717512020-07-08 11:19:19 +020024use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020025use KorAP::XML::TEI::Header;
Akroneb12e232021-02-25 13:49:50 +010026use KorAP::XML::TEI::Inline;
Peter Hardersd892a582020-02-12 15:45:22 +010027
Akron132bdeb2024-06-06 14:28:56 +020028our $VERSION = '2.6.0';
Peter Harders6f526a32020-06-29 21:44:41 +020029
Akrond949e182020-02-14 12:23:57 +010030our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
31
Akron33db4ec2021-02-24 12:52:21 +010032use constant {
33 # Set to 1 for minimal more debug output (no need to be parametrized)
Akroneb12e232021-02-25 13:49:50 +010034 DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
Akron33db4ec2021-02-24 12:52:21 +010035};
Peter Hardersd892a582020-02-12 15:45:22 +010036
Akron692d17d2021-03-05 13:21:03 +010037if ($ENV{KORAPXMLTEI_INLINE}) {
38 warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
39};
40
Akrone2819a12021-10-12 15:52:55 +020041# Inline tokens won't be stored in the structure file
42my $inline_tokens_exclusive = 0;
43
Akron6b1f26b2024-09-19 11:35:32 +020044# Inline dependencies won't be stored in the tokens file
45my $inline_deps_exclusive = 0;
46
Peter Harders6f526a32020-06-29 21:44:41 +020047# Parse options from the command line
Peter Hardersd892a582020-02-12 15:45:22 +010048GetOptions(
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +020049 'auto-textsigle|A=s' => \(my $auto_textsigle = ''),
Akrond3e1d282021-02-24 14:51:27 +010050 'root|r=s' => \(my $root_dir = '.'),
51 'input|i=s' => \(my $input_fname = ''),
Akron132bdeb2024-06-06 14:28:56 +020052 'output|o=s' => \(my $output_fname = ''),
Akron75d63142021-02-23 18:40:56 +010053 'tokenizer-call|tc=s' => \(my $tokenizer_call),
54 'tokenizer-korap|tk' => \(my $tokenizer_korap),
Akrond53913c2021-02-24 09:50:13 +010055 'tokenizer-internal|ti' => \(my $tokenizer_intern),
Akronb93fabb2023-01-13 12:05:44 +010056 'no-tokenizer' => \(my $no_tokenizer),
Akron75d63142021-02-23 18:40:56 +010057 'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
58 'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
59 'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
Akron6b1f26b2024-09-19 11:35:32 +020060 'inline-dependencies=s' => \(my $inline_dependencies),
Akron75d63142021-02-23 18:40:56 +010061 'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
Akron6b1f26b2024-09-19 11:35:32 +020062 'skip-inline-token-annotations!' => \(
Akron692d17d2021-03-05 13:21:03 +010063 my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
Akron54c3ff12021-02-25 11:33:37 +010064 'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
Akrond3e1d282021-02-24 14:51:27 +010065 'base-foundry=s' => \(my $base_dir = 'base'),
66 'data-file=s' => \(my $data_file = 'data'),
Akrond53913c2021-02-24 09:50:13 +010067 'header-file=s' => \(my $header_file = 'header'),
68 'tokens-file=s' => \(my $tokens_file = 'tokens'),
Marc Kupietza671ae52022-12-22 16:28:14 +010069 'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
Akrond3e1d282021-02-24 14:51:27 +010070 'log|l=s' => \(my $log_level = 'notice'),
Akron2520a342022-03-29 18:18:05 +020071 'required-version|rv=s' => \(my $required_version),
Akrona2cb2812021-10-30 10:29:08 +020072 '' => \(my $stdio),
Akron75d63142021-02-23 18:40:56 +010073 'help|h' => sub {
Akrond949e182020-02-14 12:23:57 +010074 pod2usage(
75 -verbose => 99,
76 -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
77 -msg => $VERSION_MSG,
78 -output => '-'
79 )
80 },
81 'version|v' => sub {
82 pod2usage(
83 -verbose => 0,
84 -msg => $VERSION_MSG,
85 -output => '-'
Akrond3e1d282021-02-24 14:51:27 +010086 );
Akrond949e182020-02-14 12:23:57 +010087 }
Peter Hardersd892a582020-02-12 15:45:22 +010088);
89
Akrond3e1d282021-02-24 14:51:27 +010090
Akronb87c58d2021-02-23 17:23:30 +010091# Establish logger
Akron33db4ec2021-02-24 12:52:21 +010092binmode(STDERR, ':encoding(UTF-8)');
Akron3378dfd2020-08-01 15:01:36 +020093Log::Any::Adapter->set('Stderr', log_level => $log_level);
Akronb3649472020-09-29 08:24:46 +020094$log->notice('Debugging is activated') if DEBUG;
95
Akrond3e1d282021-02-24 14:51:27 +010096
# Abort early, if a specific version of this script is required
# (--required-version) that differs from the running version
if ($required_version) {

  # Capture in list context: a failed match yields undef here, while $1
  # would keep the value of the last successful match in this scope
  my ($wanted) = $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;

  if (!$wanted || $wanted ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};
104
105
# Split the --xmlid-to-textsigle argument at '@' into the search
# pattern ($what) and the replacement ($with)
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
  ($what, $with) = split('@', $xmlid_to_textsigle);

  # Without an '@' there is no replacement part. Treat it as empty,
  # so no undefined value is interpolated when the replacement is applied
  $with //= '';

  $what = qr!$what!;
};
111
Akron0529e512021-02-22 09:55:35 +0100112# tag (without attributes), which contains the primary text
113my $_TEXT_BODY = 'text';
Akron0c41ab32020-09-29 07:33:33 +0200114# optional
Akron09e0b2c2020-07-28 15:57:01 +0200115
# Names of inline tags to be skipped, taken from the
# comma-separated value of --skip-inline-tags
my %skip_inline_tags;
if ($skip_inline_tags_str) {
  $skip_inline_tags{$_} = 1 for split(/\s*,\s*/, $skip_inline_tags_str);
};
123
# Tokenizer object for the base tokenization
# (stays undefined, if neither an external call nor the
# KorAP tokenizer is requested)
my $ext_tok;

# External tokenization
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The KorAP tokenizer is only loaded on demand. A failed load is
  # reported as such, instead of surfacing as a version mismatch
  # with an undefined version
  unless (eval { require KorAP::XML::TEI::Tokenizer::KorAP; 1 }) {
    $log->error('KorAP-Tokenizer could not be loaded: ' . ($@ || 'unknown error'));
    exit(1);
  };

  # The tokenizer has to be of the same version as this script
  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // 'undefined';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};
Peter Harders6f526a32020-06-29 21:44:41 +0200152
Akron6b1f26b2024-09-19 11:35:32 +0200153if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
Akron11484782021-11-03 20:12:14 +0100154 $skip_inline_tags{s} = 1;
155};
Akron0c41ab32020-09-29 07:33:33 +0200156
Akrond3e1d282021-02-24 14:51:27 +0100157# Internal tokenization
Akronb87c58d2021-02-23 17:23:30 +0100158my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
159my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
Akrond3e1d282021-02-24 14:51:27 +0100160
Peter Harders41c35622020-07-12 01:16:22 +0200161
# Name of the directory and the file containing all inline structure informations
# except for $_TOKENS_TAG information.
# The default file name is appended before splitting, so it is used
# whenever the option value contains no '#'.
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';

# A value like 'foundry#' leaves an empty file part after the split.
# Fall back to the default then, so no '.xml' entry without a name is written
$_structure_file = 'structure' if !defined $_structure_file || $_structure_file eq '';

# Name of the directory and the file containing all inline token informations
# i.e. tokens of the $_TOKENS_TAG, if $_TOKENS_PROC is set
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
$_tokens_file = 'morpho' if !defined $_tokens_file || $_tokens_file eq '';

# A leading '!' marks the inline tokens as exclusive
# (they won't be stored in the structure file)
if (index($_tokens_dir, '!') == 0) {
  $_tokens_dir = substr($_tokens_dir, 1);
  $inline_tokens_exclusive = 1;
};


# Name of the directory and the file containing inline dependency
# informations (only set, if --inline-dependencies was passed)
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';
  $_dep_file = 'dependency' if !defined $_dep_file || $_dep_file eq '';

  # From here on the option only serves as a boolean flag
  $inline_dependencies = 1;

  # A leading '!' marks the inline dependencies as exclusive
  # (they won't be stored in the tokens file)
  if ($_dep_dir && index($_dep_dir, '!') == 0) {
    $_dep_dir = substr($_dep_dir, 1);
    $inline_deps_exclusive = 1;
  };
};
186
187
Akronb87c58d2021-02-23 17:23:30 +0100188# Initialize zipper
Akron132bdeb2024-06-06 14:28:56 +0200189my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
Akron09e0b2c2020-07-28 15:57:01 +0200190
Akronbc899192021-02-24 12:14:47 +0100191# text directory (below $root_dir)
192my $dir = '';
Akron09e0b2c2020-07-28 15:57:01 +0200193
Akronbc899192021-02-24 12:14:47 +0100194# Escaped version of text id
195my $text_id_esc;
Peter Harders6f526a32020-06-29 21:44:41 +0200196
Akrond53913c2021-02-24 09:50:13 +0100197# Default encoding of the text
198my $input_enc = 'UTF-8';
199
Akrond53913c2021-02-24 09:50:13 +0100200# text line (needed for whitespace handling)
201my $text_line = 0;
202
Peter Harders6f526a32020-06-29 21:44:41 +0200203
# Handle the input document is read from
my $input_fh;

# Neither a single dash nor an input file was passed:
# there is nothing to process, so print the usage and leave
if (!$stdio && $input_fname eq '') {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
  exit;
};

# Single dash was set - read from standard input
if ($stdio) {
  $input_fh = *STDIN;
}

# Input flag was passed - read from the given file
else {
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
};

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
Peter Harders6f526a32020-06-29 21:44:41 +0200232
Peter Harders6f526a32020-06-29 21:44:41 +0200233
Akroneb12e232021-02-25 13:49:50 +0100234# Create inline parser object
235my $inline = KorAP::XML::TEI::Inline->new(
236 $skip_inline_tokens,
Akrone2819a12021-10-12 15:52:55 +0200237 \%skip_inline_tags,
Akron6b1f26b2024-09-19 11:35:32 +0200238 $inline_tokens_exclusive,
239 $inline_dependencies
Akroneb12e232021-02-25 13:49:50 +0100240);
241
242
Akrond53913c2021-02-24 09:50:13 +0100243# Reading input document
Akrond3e1d282021-02-24 14:51:27 +0100244MAIN: while (<$input_fh>) {
Akron347be812020-09-29 07:52:52 +0200245
Akrond53913c2021-02-24 09:50:13 +0100246 # remove HTML (multi-line) comments (<!--...-->)
Akrond3e1d282021-02-24 14:51:27 +0100247 $_ = remove_xml_comments($input_fh, $_);
Akron347be812020-09-29 07:52:52 +0200248
Akroneaa96232020-10-15 17:06:15 +0200249 # Set input encoding
Akrond53913c2021-02-24 09:50:13 +0100250 if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
Akroneaa96232020-10-15 17:06:15 +0200251 $input_enc = $2;
252 next;
253 };
254
255 $_ = decode($input_enc, $_);
Marc Kupietz8a954e52021-02-16 22:03:07 +0100256 $_ = replace_entities($_);
Akroneaa96232020-10-15 17:06:15 +0200257
Akrond3e1d282021-02-24 14:51:27 +0100258 # Start of text body
259 if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
Akrond53913c2021-02-24 09:50:13 +0100260 my $suffix = $2;
Akron347be812020-09-29 07:52:52 +0200261
Akrond53913c2021-02-24 09:50:13 +0100262 if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
Akron347be812020-09-29 07:52:52 +0200263 die $log->fatal("input line number $.: " .
264 "line with opening text-body tag '${_TEXT_BODY}' " .
265 "contains additional information ... => Aborting (line=$_)");
Akron0bb7e722020-09-29 07:48:33 +0200266 };
Peter Harders6f526a32020-06-29 21:44:41 +0200267
Akrond53913c2021-02-24 09:50:13 +0100268 # Text body data extracted from input document ($input_fh),
269 # further processed by XML::LibXML::Reader
270 my $text_buffer = '';
Peter Harders90157342020-07-01 21:05:14 +0200271
Akron347be812020-09-29 07:52:52 +0200272 # Iterate over all lines in the text body
273 while (<$input_fh>) {
Peter Harders90157342020-07-01 21:05:14 +0200274
Akrond3e1d282021-02-24 14:51:27 +0100275 $_ = remove_xml_comments($input_fh, $_);
Akroneaa96232020-10-15 17:06:15 +0200276 $_ = decode($input_enc, $_);
Marc Kupietz8a954e52021-02-16 22:03:07 +0100277 $_ = replace_entities($_);
Peter Harders6f526a32020-06-29 21:44:41 +0200278
Akrond53913c2021-02-24 09:50:13 +0100279 # End of text body
Akron72f4a882023-03-02 09:48:14 +0100280 if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {
Peter Harders6f526a32020-06-29 21:44:41 +0200281
Akron91705d72021-02-19 10:59:45 +0100282 # write data.xml, structure.xml and evtl. morpho.xml and/or tokenization files
Peter Harders6f526a32020-06-29 21:44:41 +0200283
Akrond53913c2021-02-24 09:50:13 +0100284 if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
Akron347be812020-09-29 07:52:52 +0200285 die $log->fatal("input line number $.: " .
286 "line with closing text-body tag '${_TEXT_BODY}'".
287 " contains additional information ... => Aborting (line=$_)");
288 };
Peter Harders6f526a32020-06-29 21:44:41 +0200289
Akrondafaa7a2021-02-19 15:17:58 +0100290 if ($dir eq '') {
Akrond53913c2021-02-24 09:50:13 +0100291 $log->warn(
292 "Maybe empty textSigle => skipping this text ...\n" .
Akroneb12e232021-02-25 13:49:50 +0100293 'data=' . substr($inline->data->data, 0, 200)
Akrond53913c2021-02-24 09:50:13 +0100294 );
Akrondafaa7a2021-02-19 15:17:58 +0100295 next MAIN;
296 };
Peter Harders6f526a32020-06-29 21:44:41 +0200297
Akroneb12e232021-02-25 13:49:50 +0100298 # Parse inline structure
299 $inline->parse($text_id_esc, \$text_buffer);
Akrondafaa7a2021-02-19 15:17:58 +0100300
301 if (DEBUG) {
Akrond53913c2021-02-24 09:50:13 +0100302 $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
Akrondafaa7a2021-02-19 15:17:58 +0100303 };
304
Akroneb12e232021-02-25 13:49:50 +0100305 my $data = $inline->data;
306
Akrond53913c2021-02-24 09:50:13 +0100307 # Write data.xml
Akrondafaa7a2021-02-19 15:17:58 +0100308 $data->to_zip(
Akrond53913c2021-02-24 09:50:13 +0100309 $zipper->new_stream("$dir/${data_file}.xml"),
Akrondafaa7a2021-02-19 15:17:58 +0100310 $text_id_esc
311 );
312
Akrond53913c2021-02-24 09:50:13 +0100313 # Tokenize with external tokenizer
Akron9df4a242021-02-19 15:31:16 +0100314 if ($ext_tok) {
Akrondafaa7a2021-02-19 15:17:58 +0100315
316 # Tokenize and output
317 $ext_tok->tokenize($data->data)->to_zip(
Akrond53913c2021-02-24 09:50:13 +0100318 $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
Akrondafaa7a2021-02-19 15:17:58 +0100319 $text_id_esc
Akrond20898f2021-02-19 15:52:17 +0100320 );
Akrond53ab4b2021-02-24 09:56:12 +0100321
322 if ($use_tokenizer_sentence_splits) {
Akroneb12e232021-02-25 13:49:50 +0100323 $ext_tok->sentencize_from_previous_input($inline->structures);
Akrond53ab4b2021-02-24 09:56:12 +0100324 };
Akrondafaa7a2021-02-19 15:17:58 +0100325 };
Peter Harders6f526a32020-06-29 21:44:41 +0200326
Akrond53913c2021-02-24 09:50:13 +0100327 # Tokenize with internal tokenizer
328 if ($tokenizer_intern) {
Peter Harders6f526a32020-06-29 21:44:41 +0200329
Akrondafaa7a2021-02-19 15:17:58 +0100330 # Tokenize and output
331 $cons_tok->tokenize($data->data)->to_zip(
Akrond53913c2021-02-24 09:50:13 +0100332 $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
Akron347be812020-09-29 07:52:52 +0200333 $text_id_esc
Akroncc27d792021-02-24 12:32:20 +0100334 )->reset;
Akron598d1a72020-08-02 17:33:31 +0200335
Akrondafaa7a2021-02-19 15:17:58 +0100336 $aggr_tok->tokenize($data->data)->to_zip(
Akrond53913c2021-02-24 09:50:13 +0100337 $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
Akrondafaa7a2021-02-19 15:17:58 +0100338 $text_id_esc
Akroncc27d792021-02-24 12:32:20 +0100339 )->reset;
Akrondafaa7a2021-02-19 15:17:58 +0100340 };
Akrona10ad592020-08-03 11:20:23 +0200341
Akrondafaa7a2021-02-19 15:17:58 +0100342 # ~ write structures ~
Akron6b1f26b2024-09-19 11:35:32 +0200343 unless ($inline->structures->empty) {
Akroneb12e232021-02-25 13:49:50 +0100344 $inline->structures->to_zip(
Akronb87c58d2021-02-23 17:23:30 +0100345 $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
Akrondafaa7a2021-02-19 15:17:58 +0100346 $text_id_esc,
347 2 # = structure serialization
Akroneb12e232021-02-25 13:49:50 +0100348 );
Akrondafaa7a2021-02-19 15:17:58 +0100349 };
350
351 # ~ write tokens ~
Akroneb12e232021-02-25 13:49:50 +0100352 unless ($skip_inline_tokens || $inline->tokens->empty) {
353 $inline->tokens->to_zip(
Akronb87c58d2021-02-23 17:23:30 +0100354 $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
Akrondafaa7a2021-02-19 15:17:58 +0100355 $text_id_esc,
Akron6b1f26b2024-09-19 11:35:32 +0200356 # Either 0 = tokens without inline or
357 # 1 = tokens with inline
358 # !$skip_inline_token_annotations
359 ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
Akroneb12e232021-02-25 13:49:50 +0100360 );
Akrondafaa7a2021-02-19 15:17:58 +0100361 };
362
Akron6b1f26b2024-09-19 11:35:32 +0200363 # ~ write dependencies ~
364 unless ($inline->dependencies->empty) {
365 $inline->dependencies->to_zip(
366 $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
367 $text_id_esc,
368 3 # = dependency serialization
369 );
370 };
371
372
Akrondafaa7a2021-02-19 15:17:58 +0100373 # reinit.
374 $dir = '';
375
Akron347be812020-09-29 07:52:52 +0200376 next MAIN;
Akron598d1a72020-08-02 17:33:31 +0200377 };
378
Peter Harders6f526a32020-06-29 21:44:41 +0200379
Akron347be812020-09-29 07:52:52 +0200380 # ~ whitespace handling ~
Peter Harders6f526a32020-06-29 21:44:41 +0200381
Akronf8088e62021-02-18 16:18:59 +0100382 # Fix whitespaces (see notes on whitespace fixing)
Peter Hardersd892a582020-02-12 15:45:22 +0100383
Akrond53913c2021-02-24 09:50:13 +0100384 # TODO:
385 # Maybe it's best, to keep the stripping of whitespace and
386 # to just remove the if-clause and to insert a blank by default
387 # (with possibly an option on how newlines in primary text should
388 # be handled (stripped or replaced by a whitespace)).
Akronf8088e62021-02-18 16:18:59 +0100389
390 # Remove consecutive whitespace at beginning and end (mostly one newline)
391 s/^\s+//; s/\s+$//;
Akronf57ed812020-07-27 10:37:52 +0200392
Akrond53913c2021-02-24 09:50:13 +0100393 # NOTE:
394 # this is only relevant, if a text consists of more than one line
Akronf57ed812020-07-27 10:37:52 +0200395
Akrond53913c2021-02-24 09:50:13 +0100396 # TODO:
397 # find a better solution, or create a warning, if a text has more
398 # than one line ($text_line > 1)
Akronf57ed812020-07-27 10:37:52 +0200399
Akrond53913c2021-02-24 09:50:13 +0100400 # TODO:
401 # do testing with 2 different corpora
402 # (one with only one-line texts, the other with several lines per text)
403
Akronec503252023-04-24 18:03:17 +0200404 # line contains at least one non-tag character
405 if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {
Akrond53913c2021-02-24 09:50:13 +0100406
407 # Increment counter for text lines
408 $text_line++;
409
410 # insert blank before 1st character
Akron6e2b1252021-02-24 12:41:15 +0100411 # (for 2nd line and consecutive lines)
412 $_ = ' ' . $_ if $text_line > 1;
Akron347be812020-09-29 07:52:52 +0200413 }
Akronf57ed812020-07-27 10:37:52 +0200414
Akron347be812020-09-29 07:52:52 +0200415 # add line to buffer
Akrond53913c2021-02-24 09:50:13 +0100416 $text_buffer .= $_;
Akron347be812020-09-29 07:52:52 +0200417 };
Akrond3e1d282021-02-24 14:51:27 +0100418 }
Akronf57ed812020-07-27 10:37:52 +0200419
Marc Kupietza671ae52022-12-22 16:28:14 +0100420 elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
421 my $leadin = $1;
422 my $id = $3;
423 my $sigle = $3;
Akronf57ed812020-07-27 10:37:52 +0200424
Marc Kupietza671ae52022-12-22 16:28:14 +0100425 if ($what) {
426 $_ = $id;
427 eval "s|$what|$with|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
428 $sigle = $_;
429 $log->debug("Converted text id `$id' to sigle `$sigle'");
430 };
431 $sigle =~ s/\./-/g;
432
433 my @parts = split(/[\/_]/, $sigle);
434 if (@parts != 3) {
435 die $log->fatal(
436 "input line number $.: " .
437 "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
438 "=> Aborting (line=$_)");
439 };
440
441 $dir = join("/", @parts);
442 $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
443 $log->notice("$0: text_id=$text_id_esc");
444
445 if ($leadin !~ /^\s*$/) {
446 die $log->fatal(
447 "input line number $.: " .
448 'line with opening header tag is not in expected format ... ' .
449 "=> Aborting (line=$_)");
450 };
451 }
452
453 # Start of header section
454 elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
Akron347be812020-09-29 07:52:52 +0200455 my $content = "$2\n";
Akronf57ed812020-07-27 10:37:52 +0200456
Akrond20898f2021-02-19 15:52:17 +0100457 if ($1 !~ /^\s*$/) {
Akrond53913c2021-02-24 09:50:13 +0100458 die $log->fatal(
459 "input line number $.: " .
460 'line with opening header tag is not in expected format ... ' .
461 "=> Aborting (line=$_)");
Akron347be812020-09-29 07:52:52 +0200462 };
463
464 # Parse header
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +0200465 my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
466 if ($auto_textsigle) {
467 $auto_textsigle = increase_auto_textsigle($auto_textsigle);
468 $log->debug("Auto-incremented text sigle to $auto_textsigle");
469 };
Akron347be812020-09-29 07:52:52 +0200470 # Header was parseable
471 if ($header) {
472
473 # Write header to zip
Akrond53913c2021-02-24 09:50:13 +0100474 my $file = $header->dir . '/' . $header_file . '.xml';
Akron347be812020-09-29 07:52:52 +0200475
Akronb3649472020-09-29 08:24:46 +0200476 $log->debug("Writing file $file") if DEBUG;
Akron347be812020-09-29 07:52:52 +0200477
478 $header->to_zip($zipper->new_stream($file));
479
480 # Header is for text level
481 if ($header->type eq 'text') {
482
483 # Remember dir and sigles
484 $dir = $header->dir;
Akron347be812020-09-29 07:52:52 +0200485 $text_id_esc = $header->id_esc;
486
487 # log output for seeing progression
Akronbc899192021-02-24 12:14:47 +0100488 $log->notice("$0: text_id=$text_id_esc");
Akron347be812020-09-29 07:52:52 +0200489
Akrond53913c2021-02-24 09:50:13 +0100490 # Reset counter for text lines
491 # (needed for whitespace handling)
492 $text_line = 0;
493 };
494 };
495 };
496};
Peter Hardersd892a582020-02-12 15:45:22 +0100497
Akron347be812020-09-29 07:52:52 +0200498$zipper->close;
Peter Harders6f526a32020-06-29 21:44:41 +0200499
Akron9df4a242021-02-19 15:31:16 +0100500$ext_tok->close if $ext_tok;
Peter Hardersd892a582020-02-12 15:45:22 +0100501
Akrond53913c2021-02-24 09:50:13 +0100502close $input_fh;
503
Peter Harders6f526a32020-06-29 21:44:41 +0200504
Akrond949e182020-02-14 12:23:57 +0100505__END__
506
507=pod
508
509=encoding utf8
510
511=head1 NAME
512
513tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
514
515=head1 SYNOPSIS
516
Akrona2cb2812021-10-30 10:29:08 +0200517 cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
Akrond949e182020-02-14 12:23:57 +0100518
519=head1 DESCRIPTION
520
Akronee434b12020-07-08 12:53:01 +0200521C<tei2korapxml> is a script to convert TEI P5 and
Akrond72baca2021-07-23 13:25:32 +0200522L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akronee434b12020-07-08 12:53:01 +0200523based documents to the
524L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders6f526a32020-06-29 21:44:41 +0200525
Akrond949e182020-02-14 12:23:57 +0100526This program is usually called from inside another script.
527
Akronee434b12020-07-08 12:53:01 +0200528=head1 FORMATS
529
530=head2 Input restrictions
531
532=over 2
533
534=item
535
Akronee434b12020-07-08 12:53:01 +0200536TEI P5 formatted input with certain restrictions:
537
538=over 4
539
540=item
541
Akrone48bec42023-01-05 12:18:45 +0100542B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
Akronee434b12020-07-08 12:53:01 +0200544
545=item
546
547B<optional>: corp-header with integrated corpsigle,
548doc-header with integrated docsigle
549
550=back
551
552=item
553
Akron0c41ab32020-09-29 07:33:33 +0200554All tokens inside the primary text may not be
newline separated, because newlines are removed
Akron0c41ab32020-09-29 07:33:33 +0200556(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
Akronee434b12020-07-08 12:53:01 +0200557into blanks between 2 tokens could lead to additional blanks,
558where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
Akron8a0c4bf2021-03-16 16:51:21 +0100560(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akronee434b12020-07-08 12:53:01 +0200561
Akron940ca6f2021-10-11 12:38:39 +0200562=item
563
564Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
565need to be defined in the same line as the header tag.
566
Akronee434b12020-07-08 12:53:01 +0200567=back
568
569=head2 Notes on the output
570
571=over 2
572
573=item
574
575zip file output (default on C<stdout>) with utf8 encoded entries
576(which together form the KorAP-XML format)
577
578=back
579
Akrond949e182020-02-14 12:23:57 +0100580=head1 INSTALLATION
581
Akrond26319b2023-01-12 15:34:41 +0100582C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietze83a4e92021-03-16 20:51:26 +0100583When these requirements are met, the preferred way to install the script is
Akrond949e182020-02-14 12:23:57 +0100584to use L<cpanm|App::cpanminus>.
585
586 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
587
588In case everything went well, the C<tei2korapxml> tool will
589be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200590
Akrond949e182020-02-14 12:23:57 +0100591Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
592
593=head1 OPTIONS
594
595=over 2
596
Akrona2cb2812021-10-30 10:29:08 +0200597=item B<--input|-i>
598
599The input file to process. If no specific input is defined and a single
600dash C<-> is passed as an argument, data is read from C<STDIN>.
601
Akron132bdeb2024-06-06 14:28:56 +0200602=item B<--output|-o>
603
604The output zip file to be created. If no specific output is defined,
605data is written to C<STDOUT>.
Akrona2cb2812021-10-30 10:29:08 +0200606
Akron4e603a52020-07-27 14:23:49 +0200607=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100608
Akron4e603a52020-07-27 14:23:49 +0200609The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100610
611=item B<--help|-h>
612
613Print help information.
614
615=item B<--version|-v>
616
617Print version information.
618
Akrone48bec42023-01-05 12:18:45 +0100619=item B<--tokenizer-korap|-tk>
Akron2520a342022-03-29 18:18:05 +0200620
Akrone48bec42023-01-05 12:18:45 +0100621Use the standard KorAP/DeReKo tokenizer.
622
623=item B<--tokenizer-internal|-ti>
624
625Tokenize the data using two embedded tokenizers,
626that will take an I<aggressive> and a I<conservative>
627approach.
Akron2520a342022-03-29 18:18:05 +0200628
Akron4e603a52020-07-27 14:23:49 +0200629=item B<--tokenizer-call|-tc>
630
631Call an external tokenizer process, that will tokenize
Akron11484782021-11-03 20:12:14 +0100632from STDIN and outputs the offsets of all tokens.
633
634Texts are separated using C<\x04\n>. The external process
635should add a new line per text.
636
637If the L</--use-tokenizer-sentence-splits> option is activated,
638sentences are marked by offset as well in new lines.
639
640To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
642
643 $ cat corpus.i5.xml | tei2korapxml -s \
644 $ -tc 'datok tokenize \
645 $ -t ./tokenizer.matok \
646 $ -p --newline-after-eot --no-sentences \
647 $ --no-tokens --sentence-positions -' - \
648 $ > corpus.korapxml.zip
Akron4e603a52020-07-27 14:23:49 +0200649
Akronb93fabb2023-01-13 12:05:44 +0100650=item B<--no-tokenizer>
651
652Boolean flag indicating that no tokenizer should be used.
653This is meant to ensure that by default a final token layer always
654exists.
655If a separate tokenizer is chosen, this flag is ignored.
656
Akron75d63142021-02-23 18:40:56 +0100657=item B<--skip-inline-tokens>
658
659Boolean flag indicating that inline tokens should not
660be processed. Defaults to false (meaning inline tokens will be processed).
661
Akron692d17d2021-03-05 13:21:03 +0100662=item B<--skip-inline-token-annotations>
663
664Boolean flag indicating that inline token annotations should not
665be processed. Defaults to true (meaning inline token annotations
Akron6b1f26b2024-09-19 11:35:32 +0200666won't be processed). Can be negated with
667C<--no-skip-inline-token-annotations>.
Akron692d17d2021-03-05 13:21:03 +0100668
Akronca70a1d2021-02-25 16:21:31 +0100669=item B<--skip-inline-tags> <tags>
Akron54c3ff12021-02-25 11:33:37 +0100670
671Expects a comma-separated list of tags to be ignored when the structure
672is parsed. Content of these tags however will be processed.
673
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +0200674=item B<--auto-textsigle> <textsigle>
675
Expects a text sigle that serves as a fallback if no text sigles
677are given in the input data.
678The auto text sigle will be incremented for each text processed.
679
680Example:
681
682 tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
683 < data.i5.xml > korapxml.zip
684
Marc Kupietza671ae52022-12-22 16:28:14 +0100685=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
686
Akrone48bec42023-01-05 12:18:45 +0100687Expects a regular replacement expression (separated by B<@> between the
Marc Kupietza671ae52022-12-22 16:28:14 +0100688search and the replacement) to convert text id attributes to text sigles
689with three parts (separated by B</>).
690
691Example:
692
693 tei2korapxml \
694 --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
695 -tk - < t/data/icc_german_sample.p5.xml
696
Akrone48bec42023-01-05 12:18:45 +0100697Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
698sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietza671ae52022-12-22 16:28:14 +0100699
Akron1a5271a2021-02-18 13:18:15 +0100700=item B<--inline-tokens> <foundry>#[<file>]
701
702Define the foundry and file (without extension)
703to store inline token information in.
Akron8a0c4bf2021-03-16 16:51:21 +0100704Unless C<--skip-inline-token-annotations> is set,
705this will contain annotations as well.
Akron1a5271a2021-02-18 13:18:15 +0100706Defaults to C<tokens> and C<morpho>.
707
Akrone2819a12021-10-12 15:52:55 +0200708The inline token data will also be stored in the
709inline structures file (see I<--inline-structures>),
710unless the inline token foundry is prepended
711by an B<!> exclamation mark, indicating that inline
712tokens are stored exclusively in the inline tokens
713file.
714
715Example:
716
Akron6b1f26b2024-09-19 11:35:32 +0200717 tei2korapxml --no-tokenizer --inline-tokens \
718 '!gingko#morpho' < data.i5.xml > korapxml.zip
719
720=item B<--inline-dependencies> <foundry>#[<file>]
721
722Define the foundry and file (without extension)
723to store inline dependency information in.
724Defaults to the layer of C<dependency> and
725will be ignored if not set (which means, dependency
726attributes will be stored in the inline tokens file,
727if not skipped).
728
729The dependency data will also be stored in the
730inline token file (see I<--inline-tokens>),
731unless the inline dependencies foundry is prepended
732by an B<!> exclamation mark, indicating that inline
733dependency data is stored exclusively in the inline
734dependencies file.
735
736Example:
737
738 tei2korapxml --no-tokenizer --inline-dependencies \
739 'gingko#dependency' < data.i5.xml > korapxml.zip
740
Akrone2819a12021-10-12 15:52:55 +0200741
Akrondd0be8f2021-02-18 19:29:41 +0100742=item B<--inline-structures> <foundry>#[<file>]
743
744Define the foundry and file (without extension)
745to store inline structure information in.
746Defaults to C<struct> and C<structures>.
747
Akron26a71522021-02-19 10:27:37 +0100748=item B<--base-foundry> <foundry>
749
750Define the base foundry to store newly generated
751token information in.
752Defaults to C<base>.
753
754=item B<--data-file> <file>
755
756Define the file (without extension)
757to store primary data information in.
758Defaults to C<data>.
759
760=item B<--header-file> <file>
761
762Define the file name (without extension)
763to store header information on
764the corpus, document, and text level in.
765Defaults to C<header>.
766
Marc Kupietz985da0c2021-02-15 19:29:50 +0100767=item B<--use-tokenizer-sentence-splits|-s>
768
769Replace existing sentence boundary information with (or, if none exists, add)
Akron11484782021-11-03 20:12:14 +0100770the sentence boundary information provided by the tokenizer.
771Currently KorAP-tokenizer and certain external tokenizers support
772these boundaries.
Marc Kupietz985da0c2021-02-15 19:29:50 +0100773
Akron91705d72021-02-19 10:59:45 +0100774=item B<--tokens-file> <file>
775
776Define the file (without extension)
777to store generated token information in
778(either from the KorAP tokenizer or an externally called tokenizer).
779Defaults to C<tokens>.
780
Akron3378dfd2020-08-01 15:01:36 +0200781=item B<--log|-l>
782
783Loglevel for I<Log::Any>. Defaults to C<notice>.
784
Akrond949e182020-02-14 12:23:57 +0100785=back
786
Akronb3649472020-09-29 08:24:46 +0200787=head1 ENVIRONMENT VARIABLES
788
789=over 2
790
791=item B<KORAPXMLTEI_DEBUG>
792
793Activate minimal debugging.
794Defaults to C<false>.
795
Akronb3649472020-09-29 08:24:46 +0200796=back
797
Akrond949e182020-02-14 12:23:57 +0100798=head1 COPYRIGHT AND LICENSE
799
Akron6b1f26b2024-09-19 11:35:32 +0200800Copyright (C) 2021-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100801
802Author: Peter Harders
803
Akronaabd0952020-09-29 07:35:08 +0200804Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100805
806L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
807Corpus Analysis Platform at the
Akrond72baca2021-07-23 13:25:32 +0200808L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akrond949e182020-02-14 12:23:57 +0100809member of the
810L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
811
812This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100813L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100814
815=cut
Akronf8088e62021-02-18 16:18:59 +0100816
817# NOTES
818
Akronf8088e62021-02-18 16:18:59 +0100819## Notes on segfault prevention
820
Akron91577922021-02-19 10:32:54 +0100821binmode on the input filehandle prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100822(see notes on 'PerlIO layers' in 'man XML::LibXML').
823Removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular.
824See in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
825see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.