blob: f8a26c2afd1ce6c5b65634fd64247caa3421521b [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);

use Encode qw(decode encode);

use FindBin;
BEGIN {
  unshift @INC, "$FindBin::Bin/../lib";
};

# The project library is loaded only after the local lib directory has been
# prepended to @INC. "use" statements are executed at compile time in source
# order, so loading KorAP::XML::TEI before the BEGIN block would pick up an
# installed copy (or fail) instead of the library next to this script.
use KorAP::XML::TEI qw(increase_auto_textsigle);
use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020021use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020022use KorAP::XML::TEI::Tokenizer::Conservative;
23use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron85717512020-07-08 11:19:19 +020024use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020025use KorAP::XML::TEI::Header;
Akroneb12e232021-02-25 13:49:50 +010026use KorAP::XML::TEI::Inline;
Peter Hardersd892a582020-02-12 15:45:22 +010027
our $VERSION = '2.7.2';

# Banner printed together with the usage and version information
our $VERSION_MSG = "\n" . 'tei2korapxml - v' . $VERSION . "\n";

# Minimal additional debug output, switched on by the environment
# variable KORAPXMLTEI_DEBUG (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# The environment switch was replaced by a command line option
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};
40
# Inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;

# Inline dependencies won't be stored in the tokens file
my $inline_deps_exclusive = 0;

# Option values, initialized with their defaults
my $auto_textsigle       = '';
my $root_dir             = '.';
my $input_fname          = '';
my $output_fname         = '';
my $inline_tokens        = 'tokens#morpho';
my $inline_structures    = 'struct#structure';
my $skip_inline_tokens   = 0;
my $skip_inline_tags_str = '';
my $base_dir             = 'base';
my $data_file            = 'data';
my $header_file          = 'header';
my $tokens_file          = 'tokens';
my $xmlid_to_textsigle   = '';
my $log_level            = 'notice';

# Inline token annotations are skipped unless the deprecated
# environment variable KORAPXMLTEI_INLINE is set
my $skip_inline_token_annotations = $ENV{KORAPXMLTEI_INLINE} ? 0 : 1;

# Option values without a default
my ($tokenizer_call, $tokenizer_korap, $tokenizer_intern, $no_tokenizer);
my ($use_tokenizer_sentence_splits, $inline_dependencies);
my ($required_version, $progress, $stdio);

# Handler for --help: print the extended usage information
my $show_help = sub {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  )
};

# Handler for --version: print the version banner only
my $show_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'auto-textsigle|A=s'              => \$auto_textsigle,
  'root|r=s'                        => \$root_dir,
  'input|i=s'                       => \$input_fname,
  'output|o=s'                      => \$output_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'no-tokenizer'                    => \$no_tokenizer,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'inline-dependencies=s'           => \$inline_dependencies,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'skip-inline-token-annotations!'  => \$skip_inline_token_annotations,
  'skip-inline-tags=s'              => \$skip_inline_tags_str,
  'base-foundry=s'                  => \$base_dir,
  'data-file=s'                     => \$data_file,
  'header-file=s'                   => \$header_file,
  'tokens-file=s'                   => \$tokens_file,
  'xmlid-to-textsigle|x=s'          => \$xmlid_to_textsigle,
  'log|l=s'                         => \$log_level,
  'required-version|rv=s'           => \$required_version,
  'progress|p'                      => \$progress,

  # A single dash on the command line
  ''                                => \$stdio,
  'help|h'                          => $show_help,
  'version|v'                       => $show_version
);
90
Akrond3e1d282021-02-24 14:51:27 +010091
# Establish logger
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;


# Abort if the caller requires a version that differs from this script.
# The version is taken from the list returned by the match instead of $1,
# because $1 keeps the value of an earlier successful match whenever
# the current match fails.
if ($required_version) {
  my ($wanted) = $required_version =~ /^\s*(\d+\.\d+\.\d+(?:-TRIAL)?)\s*$/;
  if (!defined $wanted || $wanted ne $VERSION) {
    $log->error("Required version $required_version mismatches version $VERSION");
    exit(1);
  };
};
105
106
# Split the --xmlid-to-textsigle expression "<regex>@<replacement>" into
# the compiled search pattern ($what) and the replacement string ($with)
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
  ($what, $with) = split('@', $xmlid_to_textsigle);

  # split() drops trailing empty fields, so an explicitly empty
  # replacement ("regex@") would otherwise leave $with undefined
  $with //= '';
  $what = qr!$what!;
};
112
# Handle the progress bar is written to (the terminal itself, so the
# bar does not end up in redirected output)
my $progress_fh;
if ($progress) {
  eval {
    require Time::Progress;
    my $tty = $^O eq 'MSWin32' ? 'CON' : '/dev/tty';

    # Report the reason given by the operating system as well
    open($progress_fh, '>', $tty)
      or die "Cannot open ${tty}: $!";
    $progress_fh->autoflush(1);
    1;
  } or do {

    # Missing module or terminal: continue without progress bar
    # (the trailing " at FILE line N" part of the message is removed)
    $log->warn('Progress bar disabled: ' . ($@ =~ s/ at .*//sr));
    $progress = 0;
  }
};
127
# Name of the tag (without attributes) enclosing the primary text
my $_TEXT_BODY = 'text';

# Lookup table of the inline tags to ignore, filled from
# the comma-separated list passed via --skip-inline-tags
my %skip_inline_tags = ();
%skip_inline_tags = map { $_ => 1 } split(/\s*,\s*/, $skip_inline_tags_str)
  if $skip_inline_tags_str;
139
# Choose the tokenizer that creates the base token layer:
# an external process, the KorAP tokenizer, or none of both.
# Without any tokenizer option the script stops with an error.

# External tokenization
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

# KorAP tokenization
elsif ($tokenizer_korap) {

  # The tokenizer is loaded on demand only. A failure is reported
  # directly instead of surfacing as a misleading version mismatch.
  eval {
    require KorAP::XML::TEI::Tokenizer::KorAP;
    1;
  } or do {
    my $err = $@ || 'unknown error';
    $err =~ s/\s+$//;
    $log->error("Unable to load KorAP-Tokenizer: $err");
    exit(1);
  };

  # The tokenizer has to be released in the same version as the script
  my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION // 'undefined';
  if ($korap_tok_ver ne $VERSION) {
    $log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
    exit(1);
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}

# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
  $log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
  exit(1);
};

# Sentence boundaries are taken from the tokenizer,
# so inline <s> elements are ignored
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};
Akron0c41ab32020-09-29 07:33:33 +0200172
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;


# Directory and file name for all inline structure information
# (given as "dir#file", the file name defaults to "structure")
my ($_structure_dir, $_structure_file) = split('#', "$inline_structures#structure");

# Directory and file name for all inline token information
# (given as "dir#file", the file name defaults to "morpho")
my ($_tokens_dir, $_tokens_file) = split('#', "$inline_tokens#morpho");

# A leading exclamation mark requests exclusive storage of the
# inline tokens and is not part of the directory name
$inline_tokens_exclusive = 1 if $_tokens_dir =~ s/^!//;


# Directory and file name for all inline dependency information
# (given as "dir#file", the file name defaults to "dependency")
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
  ($_dep_dir, $_dep_file) = split('#', "$inline_dependencies#dependency");

  # From here on the option only serves as a flag
  $inline_dependencies = 1;

  # Leading exclamation mark: see inline tokens above
  $inline_deps_exclusive = 1 if $_dep_dir && $_dep_dir =~ s/^!//;
};
202
203
Akronb87c58d2021-02-23 17:23:30 +0100204# Initialize zipper
Akron132bdeb2024-06-06 14:28:56 +0200205my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
Akron09e0b2c2020-07-28 15:57:01 +0200206
Akronbc899192021-02-24 12:14:47 +0100207# text directory (below $root_dir)
208my $dir = '';
Akron09e0b2c2020-07-28 15:57:01 +0200209
Akronbc899192021-02-24 12:14:47 +0100210# Escaped version of text id
211my $text_id_esc;
Peter Harders6f526a32020-06-29 21:44:41 +0200212
Akrond53913c2021-02-24 09:50:13 +0100213# Default encoding of the text
214my $input_enc = 'UTF-8';
215
Akrond53913c2021-02-24 09:50:13 +0100216# text line (needed for whitespace handling)
217my $text_line = 0;
218
Peter Harders6f526a32020-06-29 21:44:41 +0200219
Akrond53913c2021-02-24 09:50:13 +0100220# Input file handle (default: stdin)
Akrona2cb2812021-10-30 10:29:08 +0200221my $input_fh;
Peter Hardersd892a582020-02-12 15:45:22 +0100222
Akrona2cb2812021-10-30 10:29:08 +0200223# Single dash was set
224if ($stdio) {
225 $input_fh = *STDIN;
226}
Akrona2cb2812021-10-30 10:29:08 +0200227# Input flag was passed
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200228elsif (@ARGV || $input_fname ne '') {
229 unless ($input_fname ne '') {
230 $input_fname = shift @ARGV;
231 };
Akron347be812020-09-29 07:52:52 +0200232 unless (open($input_fh, '<', $input_fname)) {
233 die $log->fatal("File '$input_fname' could not be opened.");
234 };
Akrona2cb2812021-10-30 10:29:08 +0200235}
236
237# No input to process
238else {
239 pod2usage(
240 -verbose => 99,
241 -sections => 'NAME|SYNOPSIS',
242 -msg => $VERSION_MSG,
243 -output => '-'
244 );
245 exit;
Akrond53913c2021-02-24 09:50:13 +0100246};
Peter Harders6f526a32020-06-29 21:44:41 +0200247
Akronf8088e62021-02-18 16:18:59 +0100248# Prevents segfaulting (see notes on segfault prevention)
Akron347be812020-09-29 07:52:52 +0200249binmode $input_fh;
Peter Harders6f526a32020-06-29 21:44:41 +0200250
Peter Harders6f526a32020-06-29 21:44:41 +0200251
Akroneb12e232021-02-25 13:49:50 +0100252# Create inline parser object
253my $inline = KorAP::XML::TEI::Inline->new(
254 $skip_inline_tokens,
Akrone2819a12021-10-12 15:52:55 +0200255 \%skip_inline_tags,
Akron6b1f26b2024-09-19 11:35:32 +0200256 $inline_tokens_exclusive,
257 $inline_dependencies
Akroneb12e232021-02-25 13:49:50 +0100258);
259
# Main conversion loop.
#
# The input opened above is processed first, followed by every remaining
# file name in @ARGV. Each input is read line by line:
#   - the XML declaration sets the input encoding,
#   - header sections (idsHeader/teiHeader) are written to the archive and
#     define the directory and sigle of the following text,
#   - a <TEI xml:id="..."> element may define the text sigle alternatively,
#   - the lines of a text body are collected and, on the closing tag,
#     parsed and written as data, tokenization, structure, token and
#     dependency files.
#
# If a trailing input file can not be opened, the error is logged, the
# archive is finalized with everything converted so far and the script
# exits with status 1.
my $input_failed = 0;

while (1) {
  my $p;
  if ($progress && $input_fname ne '') {
    my $file_size = -s $input_fname;
    if ($file_size) {
      $p = Time::Progress->new(min => 0, max => $file_size);
      $log->notice("Reading input document $input_fname (Size: $file_size bytes)");
    }
  } elsif ($input_fname ne '') {
    $log->notice("Reading input document $input_fname");
  };

  my $i = 0;
  MAIN:
  while (<$input_fh>) {

    # Update the progress bar every 500 lines
    if ($p && ($i++ % 500 == 0)) {
      print $progress_fh $p->report("\r%20b %p ETA: %E", tell($input_fh));
    };

    # remove HTML (multi-line) comments (<!--...-->)
    $_ = remove_xml_comments($input_fh, $_);

    # Set input encoding
    # NOTE: inside the character class "\1" is no backreference; the value is
    # delimited by the non-greedy quantifier followed by the real backreference.
    if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
      $input_enc = $2;
      next;
    };

    $_ = decode($input_enc, $_);
    $_ = replace_entities($_);

    # Start of text body
    if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

      # Saved, because the following successful match resets the captures
      my $suffix = $2;

      if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
        die $log->fatal("input line number $.: " .
          "line with opening text-body tag '${_TEXT_BODY}' " .
          "contains additional information ... => Aborting (line=$_)");
      };

      # Text body data extracted from input document ($input_fh),
      # further processed by XML::LibXML::Reader
      my $text_buffer = '';

      # Iterate over all lines in the text body
      while (<$input_fh>) {

        $_ = remove_xml_comments($input_fh, $_);
        $_ = decode($input_enc, $_);
        $_ = replace_entities($_);

        # End of text body
        if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {

          # write data.xml, structure.xml and possibly morpho.xml and/or tokenization files

          my $before = substr($_, 0, $pos);
          my $after = substr($_, length("</$_TEXT_BODY>") + $pos);
          my $before_check = $before;
          $before_check =~ s/<[^>]+>//g; # strip XML tags like </body>
          if (($before_check . $after) !~ /^\s*$/) {
            die $log->fatal("input line number $.: " .
              "line with closing text-body tag '${_TEXT_BODY}'" .
              " contains additional information ... => Aborting (line=$_)");
          };

          # Add any remaining content before </text> (e.g. </body>) to the buffer
          $before =~ s/^\s+//;
          $before =~ s/\s+$//;
          $text_buffer .= $before if $before ne '';

          if ($dir eq '') {
            $log->warn(
              "Maybe empty textSigle => skipping this text ...\n" .
              'data=' . substr($inline->data->data, 0, 200)
            );
            next MAIN;
          };

          # Parse inline structure
          $inline->parse($text_id_esc, \$text_buffer);

          if (DEBUG) {
            $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
          };

          my $data = $inline->data;

          # Write data.xml
          $data->to_zip(
            $zipper->new_stream("$dir/${data_file}.xml"),
            $text_id_esc
          );

          # Tokenize with external tokenizer
          if ($ext_tok) {

            my $tokens_output = eval {
              $ext_tok->tokenize($data->data)->to_string($text_id_esc);
            };

            # A failing tokenizer only skips the token layer of this text
            if (my $err = $@) {
              $err =~ s/\s+$//;
              $log->error("Skipping external tokenization for '$text_id_esc': $err");
              $ext_tok->reset;
            }
            elsif (defined $tokens_output) {
              $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml")
                ->print(encode('UTF-8', $tokens_output));

              if ($use_tokenizer_sentence_splits) {
                $ext_tok->sentencize_from_previous_input($inline->structures);
              };
            };
          };

          # Tokenize with internal tokenizer
          if ($tokenizer_intern) {

            # Tokenize and output
            $cons_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
              $text_id_esc
            )->reset;

            $aggr_tok->tokenize($data->data)->to_zip(
              $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
              $text_id_esc
            )->reset;
          };

          # ~ write structures ~
          unless ($inline->structures->empty) {
            $inline->structures->to_zip(
              $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
              $text_id_esc,
              2 # = structure serialization
            );
          };

          # ~ write tokens ~
          unless ($skip_inline_tokens || $inline->tokens->empty) {
            $inline->tokens->to_zip(
              $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
              $text_id_esc,
              # Either 0 = tokens without inline or
              #        1 = tokens with inline
              # !$skip_inline_token_annotations
              ($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
            );
          };

          # ~ write dependencies ~
          unless ($inline->dependencies->empty) {
            $inline->dependencies->to_zip(
              $zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
              $text_id_esc,
              3 # = dependency serialization
            );
          };


          # reinit.
          $dir = '';

          next MAIN;
        };


        # ~ whitespace handling ~

        # Fix whitespaces (see notes on whitespace fixing)

        # TODO:
        #   Maybe it's best, to keep the stripping of whitespace and
        #   to just remove the if-clause and to insert a blank by default
        #   (with possibly an option on how newlines in primary text should
        #   be handled (stripped or replaced by a whitespace)).

        # Remove consecutive whitespace at beginning and end (mostly one newline)
        s/^\s+//;
        s/\s+$//;

        # NOTE:
        #   this is only relevant, if a text consists of more than one line

        # TODO:
        #   find a better solution, or create a warning, if a text has more
        #   than one line ($text_line > 1)

        # TODO:
        #   do testing with 2 different corpora
        #   (one with only one-line texts, the other with several lines per text)

        # Check if the buffer currently ends inside an open XML tag
        # (last '<' is after last '>'), meaning this line is a continuation of
        # a multi-line element (e.g. attributes split across lines like <ref>).
        # A space must be prepended to avoid "attributes construct error" in the
        # XML parser when two attribute tokens are concatenated without separator.
        my $in_open_tag = ($text_buffer ne '' &&
          rindex($text_buffer, '<') > rindex($text_buffer, '>'));

        # line contains at least one non-tag character
        if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {

          # Increment counter for text lines
          $text_line++;

          # insert blank before 1st character
          # (for 2nd line and consecutive lines, or when continuing an open tag)
          $_ = ' ' . $_ if $text_line > 1 || $in_open_tag;
        }

        # Line is purely within an open tag (attribute continuation):
        # prepend a space so attributes are properly separated.
        elsif ($in_open_tag) {
          $_ = ' ' . $_;
        }

        # add line to buffer
        $text_buffer .= $_;
      };
    }

    # TEI element with an xml:id that defines the text sigle
    elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
      my $leadin = $1;
      my $id = $3;
      my $sigle = $3;

      if ($what) {
        $_ = $id;

        # The replacement may contain capture variables ($1, $2, ...) and
        # therefore has to be evaluated as code. The expression is taken from
        # the command line (--xmlid-to-textsigle), never from the input data.
        eval "s|$what|$with|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;

        # Do not swallow errors of a malformed expression
        if (my $err = $@) {
          $err =~ s/\s+$//;
          $log->warn("Unable to convert text id `$id' to sigle: " . $err);
        };

        $sigle = $_;
        $log->debug("Converted text id `$id' to sigle `$sigle'");
      };
      $sigle =~ s/\./-/g;

      my @parts = split(/[\/_]/, $sigle);
      if (@parts != 3) {
        die $log->fatal(
          "input line number $.: " .
          "ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " " .
          "=> Aborting (line=$_)");
      };

      $dir = join("/", @parts);
      $text_id_esc = "$parts[0]/$parts[1].$parts[2]";
      $log->notice("$0: text_id=$text_id_esc");

      if ($leadin !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
      };
    }

    # Start of header section
    elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
      my $content = "$2\n";

      if ($1 !~ /^\s*$/) {
        die $log->fatal(
          "input line number $.: " .
          'line with opening header tag is not in expected format ... ' .
          "=> Aborting (line=$_)");
      };

      # Parse header
      my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc // $auto_textsigle)->parse($input_fh);
      if ($auto_textsigle) {
        $auto_textsigle = increase_auto_textsigle($auto_textsigle);
        $log->debug("Auto-incremented text sigle to $auto_textsigle");
      };

      # Header was parseable
      if ($header) {

        # Write header to zip
        my $file = $header->dir . '/' . $header_file . '.xml';

        $log->debug("Writing file $file") if DEBUG;

        $header->to_zip($zipper->new_stream($file));

        # Header is for text level
        if ($header->type eq 'text') {

          # Remember dir and sigles
          $dir = $header->dir;
          $text_id_esc = $header->id_esc;

          # log output for seeing progression
          $log->notice("$0: text_id=$text_id_esc");

          # Reset counter for text lines
          # (needed for whitespace handling)
          $text_line = 0;
        };
      };
    };
  };
  $text_id_esc = $auto_textsigle if ($auto_textsigle);

  # Final state of the progress bar for this input
  if ($p) {
    print $progress_fh $p->report("\r%20b %p ETA: %E\n", tell($input_fh));
  };

  # Continue with the next trailing input file (if any)
  $input_fname = shift(@ARGV) or last;

  unless (open($input_fh, '<', $input_fname)) {
    $log->error("File '$input_fname' could not be opened: $!");
    $input_failed = 1;
    last;
  };

  # Same treatment as for the first input handle
  binmode $input_fh;
};

$zipper->close;

$ext_tok->close if $ext_tok;

close $input_fh;

# Signal the incomplete conversion to the caller
exit(1) if $input_failed;
574
Peter Harders6f526a32020-06-29 21:44:41 +0200575
Akrond949e182020-02-14 12:23:57 +0100576__END__
577
578=pod
579
580=encoding utf8
581
582=head1 NAME
583
584tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
585
586=head1 SYNOPSIS
587
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200588 cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip
589 tei2korapxml -tk corpus.i5.xml > corpus.korapxml.zip
Akrond949e182020-02-14 12:23:57 +0100590
591=head1 DESCRIPTION
592
Akronee434b12020-07-08 12:53:01 +0200593C<tei2korapxml> is a script to convert TEI P5 and
Akrond72baca2021-07-23 13:25:32 +0200594L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akronee434b12020-07-08 12:53:01 +0200595based documents to the
596L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders6f526a32020-06-29 21:44:41 +0200597
Akrond949e182020-02-14 12:23:57 +0100598This program is usually called from inside another script.
599
Akronee434b12020-07-08 12:53:01 +0200600=head1 FORMATS
601
602=head2 Input restrictions
603
604=over 2
605
606=item
607
Akronee434b12020-07-08 12:53:01 +0200608TEI P5 formatted input with certain restrictions:
609
610=over 4
611
612=item
613
B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
Akronee434b12020-07-08 12:53:01 +0200616
617=item
618
619B<optional>: corp-header with integrated corpsigle,
620doc-header with integrated docsigle
621
622=back
623
624=item
625
Tokens inside the primary text must not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akronee434b12020-07-08 12:53:01 +0200633
Akron940ca6f2021-10-11 12:38:39 +0200634=item
635
636Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
637need to be defined in the same line as the header tag.
638
Akronee434b12020-07-08 12:53:01 +0200639=back
640
641=head2 Notes on the output
642
643=over 2
644
645=item
646
647zip file output (default on C<stdout>) with utf8 encoded entries
648(which together form the KorAP-XML format)
649
650=back
651
Akrond949e182020-02-14 12:23:57 +0100652=head1 INSTALLATION
653
Akrond26319b2023-01-12 15:34:41 +0100654C<tei2korapxml> requires C<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
Marc Kupietze83a4e92021-03-16 20:51:26 +0100655When these requirements are met, the preferred way to install the script is
Akrond949e182020-02-14 12:23:57 +0100656to use L<cpanm|App::cpanminus>.
657
658 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
659
660In case everything went well, the C<tei2korapxml> tool will
661be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200662
Akrond949e182020-02-14 12:23:57 +0100663Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
664
665=head1 OPTIONS
666
667=over 2
668
Akrona2cb2812021-10-30 10:29:08 +0200669=item B<--input|-i>
670
671The input file to process. If no specific input is defined and a single
672dash C<-> is passed as an argument, data is read from C<STDIN>.
673
Marc Kupietz5b3f1d82024-07-05 17:50:55 +0200674Instead of using C<-i> input files can also be defined as trailing arguments
675to the command:
676
677 tei2korapxml -tk corpus1.i5.xml corpus2.i5.xml
678
Akron132bdeb2024-06-06 14:28:56 +0200679=item B<--output|-o>
680
681The output zip file to be created. If no specific output is defined,
682data is written to C<STDOUT>.
Akrona2cb2812021-10-30 10:29:08 +0200683
Akron4e603a52020-07-27 14:23:49 +0200684=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100685
Akron4e603a52020-07-27 14:23:49 +0200686The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100687
688=item B<--help|-h>
689
690Print help information.
691
692=item B<--version|-v>
693
694Print version information.
695
Akrone48bec42023-01-05 12:18:45 +0100696=item B<--tokenizer-korap|-tk>
Akron2520a342022-03-29 18:18:05 +0200697
Akrone48bec42023-01-05 12:18:45 +0100698Use the standard KorAP/DeReKo tokenizer.
699
700=item B<--tokenizer-internal|-ti>
701
702Tokenize the data using two embedded tokenizers,
703that will take an I<aggressive> and a I<conservative>
704approach.
Akron2520a342022-03-29 18:18:05 +0200705
Akron4e603a52020-07-27 14:23:49 +0200706=item B<--tokenizer-call|-tc>
707
Call an external tokenizer process that will tokenize
from STDIN and output the offsets of all tokens.
710
711Texts are separated using C<\x04\n>. The external process
712should add a new line per text.
713
714If the L</--use-tokenizer-sentence-splits> option is activated,
715sentences are marked by offset as well in new lines.
716
To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
719
720 $ cat corpus.i5.xml | tei2korapxml -s \
721 $ -tc 'datok tokenize \
722 $ -t ./tokenizer.matok \
723 $ -p --newline-after-eot --no-sentences \
724 $ --no-tokens --sentence-positions -' - \
725 $ > corpus.korapxml.zip
Akron4e603a52020-07-27 14:23:49 +0200726
Akronb93fabb2023-01-13 12:05:44 +0100727=item B<--no-tokenizer>
728
729Boolean flag indicating that no tokenizer should be used.
730This is meant to ensure that by default a final token layer always
731exists.
732If a separate tokenizer is chosen, this flag is ignored.
733
Akron75d63142021-02-23 18:40:56 +0100734=item B<--skip-inline-tokens>
735
736Boolean flag indicating that inline tokens should not
737be processed. Defaults to false (meaning inline tokens will be processed).
738
Akron692d17d2021-03-05 13:21:03 +0100739=item B<--skip-inline-token-annotations>
740
741Boolean flag indicating that inline token annotations should not
742be processed. Defaults to true (meaning inline token annotations
Akron6b1f26b2024-09-19 11:35:32 +0200743won't be processed). Can be negated with
744C<--no-skip-inline-token-annotations>.
Akron692d17d2021-03-05 13:21:03 +0100745
Akronca70a1d2021-02-25 16:21:31 +0100746=item B<--skip-inline-tags> <tags>
Akron54c3ff12021-02-25 11:33:37 +0100747
748Expects a comma-separated list of tags to be ignored when the structure
749is parsed. The content of these tags, however, will still be processed.
750
Marc Kupietzfc3a0ee2024-07-05 16:58:16 +0200751=item B<--auto-textsigle> <textsigle>
752
753Expects a text sigle that serves as a fallback if no text sigles
754are given in the input data.
755The auto text sigle will be incremented for each text processed.
756
757Example:
758
759 tei2korapxml --auto-textsigle 'ICC/GER.00001' -s -tk - \
760 < data.i5.xml > korapxml.zip
761
Marc Kupietza671ae52022-12-22 16:28:14 +0100762=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
763
Akrone48bec42023-01-05 12:18:45 +0100764Expects a regular replacement expression (separated by B<@> between the
Marc Kupietza671ae52022-12-22 16:28:14 +0100765search and the replacement) to convert text id attributes to text sigles
766with three parts (separated by B</>).
767
768Example:
769
770 tei2korapxml \
771 --xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
772 -tk - < t/data/icc_german_sample.p5.xml
773
Akrone48bec42023-01-05 12:18:45 +0100774Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
775sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
Marc Kupietza671ae52022-12-22 16:28:14 +0100776
Akron1a5271a2021-02-18 13:18:15 +0100777=item B<--inline-tokens> <foundry>#[<file>]
778
779Define the foundry and file (without extension)
780to store inline token information in.
Akron8a0c4bf2021-03-16 16:51:21 +0100781Unless C<--skip-inline-token-annotations> is set,
782this will contain annotations as well.
Akron1a5271a2021-02-18 13:18:15 +0100783Defaults to C<tokens> and C<morpho>.
784
Akrone2819a12021-10-12 15:52:55 +0200785The inline token data will also be stored in the
786inline structures file (see I<--inline-structures>),
787unless the inline token foundry is prepended
788by an B<!> exclamation mark, indicating that inline
789tokens are stored exclusively in the inline tokens
790file.
791
792Example:
793
Akron6b1f26b2024-09-19 11:35:32 +0200794 tei2korapxml --no-tokenizer --inline-tokens \
795 '!gingko#morpho' < data.i5.xml > korapxml.zip
796
797=item B<--inline-dependencies> <foundry>#[<file>]
798
799Define the foundry and file (without extension)
800to store inline dependency information in.
801Defaults to the layer of C<dependency> and
802will be ignored if not set (which means, dependency
803attributes will be stored in the inline tokens file,
804if not skipped).
805
806The dependency data will also be stored in the
807inline token file (see I<--inline-tokens>),
808unless the inline dependencies foundry is prepended
809by an B<!> exclamation mark, indicating that inline
810dependency data is stored exclusively in the inline
811dependencies file.
812
813Example:
814
815 tei2korapxml --no-tokenizer --inline-dependencies \
816 'gingko#dependency' < data.i5.xml > korapxml.zip
817
Akrone2819a12021-10-12 15:52:55 +0200818
Akrondd0be8f2021-02-18 19:29:41 +0100819=item B<--inline-structures> <foundry>#[<file>]
820
821Define the foundry and file (without extension)
822to store inline structure information in.
823Defaults to C<struct> and C<structures>.
824
Akron26a71522021-02-19 10:27:37 +0100825=item B<--base-foundry> <foundry>
826
827Define the base foundry to store newly generated
828token information in.
829Defaults to C<base>.
830
831=item B<--data-file> <file>
832
833Define the file (without extension)
834to store primary data information in.
835Defaults to C<data>.
836
837=item B<--header-file> <file>
838
839Define the file name (without extension)
840to store header information on
841the corpus, document, and text level in.
842Defaults to C<header>.
843
Marc Kupietz985da0c2021-02-15 19:29:50 +0100844=item B<--use-tokenizer-sentence-splits|-s>
845
846Replace existing sentence boundary information with (or add) the boundaries
Akron11484782021-11-03 20:12:14 +0100847provided by the tokenizer.
848Currently KorAP-tokenizer and certain external tokenizers support
849these boundaries.
Marc Kupietz985da0c2021-02-15 19:29:50 +0100850
Akron91705d72021-02-19 10:59:45 +0100851=item B<--tokens-file> <file>
852
853Define the file (without extension)
854to store generated token information in
855(either from the KorAP tokenizer or an externally called tokenizer).
856Defaults to C<tokens>.
857
Akron3378dfd2020-08-01 15:01:36 +0200858=item B<--log|-l>
859
860Loglevel for I<Log::Any>. Defaults to C<notice>.
861
Akrond949e182020-02-14 12:23:57 +0100862=back
863
Akronb3649472020-09-29 08:24:46 +0200864=head1 ENVIRONMENT VARIABLES
865
866=over 2
867
868=item B<KORAPXMLTEI_DEBUG>
869
870Activate minimal debugging.
871Defaults to C<false>.
872
Marc Kupietzd254f5c2025-04-16 10:37:08 +0200873=item B<KORAPXMLTEI_TOKENIZER_HEAP_SIZE>
874
875Set the heap size for the tokenizer process.
876Defaults to C<512m>.
877
Akronb3649472020-09-29 08:24:46 +0200878=back
879
Akrond949e182020-02-14 12:23:57 +0100880=head1 COPYRIGHT AND LICENSE
881
Marc Kupietzb6fd6bc2025-04-16 12:47:26 +0200882Copyright (C) 2021-2025, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100883
884Author: Peter Harders
885
Akronaabd0952020-09-29 07:35:08 +0200886Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100887
888L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
889Corpus Analysis Platform at the
Akrond72baca2021-07-23 13:25:32 +0200890L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akrond949e182020-02-14 12:23:57 +0100891member of the
892L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
893
894This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100895L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100896
897=cut
Akronf8088e62021-02-18 16:18:59 +0100898
899# NOTES
900
Akronf8088e62021-02-18 16:18:59 +0100901## Notes on segfault prevention
902
Akron91577922021-02-19 10:32:54 +0100903binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100904(see notes on 'PerlIO layers' in 'man XML::LibXML'),
905removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
906see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
907see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.