blob: 7838bab44729c847046c08cee5177b1cc4395015 [file] [log] [blame]
Akron9cb13942020-02-14 07:39:54 +01001#!/usr/bin/env perl
Peter Hardersd892a582020-02-12 15:45:22 +01002use strict;
3use warnings;
Peter Harders6f526a32020-06-29 21:44:41 +02004
Akron3378dfd2020-08-01 15:01:36 +02005use Log::Any '$log';
6use Log::Any::Adapter;
Peter Harders6f526a32020-06-29 21:44:41 +02007use Pod::Usage;
8use Getopt::Long qw(GetOptions :config no_auto_abbrev);
9
10use File::Basename qw(dirname);
Peter Hardersd892a582020-02-12 15:45:22 +010011
Akroneaa96232020-10-15 17:06:15 +020012use Encode qw(decode);
Peter Hardersd892a582020-02-12 15:45:22 +010013
Akron4f67cd42020-07-02 12:27:58 +020014use FindBin;
15BEGIN {
16 unshift @INC, "$FindBin::Bin/../lib";
17};
18
Marc Kupietz8a954e52021-02-16 22:03:07 +010019use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
Akron8b511f92020-07-09 17:28:08 +020020use KorAP::XML::TEI::Tokenizer::External;
Akrond9627472020-07-09 16:53:09 +020021use KorAP::XML::TEI::Tokenizer::Conservative;
22use KorAP::XML::TEI::Tokenizer::Aggressive;
Akron85717512020-07-08 11:19:19 +020023use KorAP::XML::TEI::Zipper;
Akronf57ed812020-07-27 10:37:52 +020024use KorAP::XML::TEI::Header;
Akroneb12e232021-02-25 13:49:50 +010025use KorAP::XML::TEI::Inline;
Peter Hardersd892a582020-02-12 15:45:22 +010026
# The KorAP tokenizer is optional: try to load it, a failure to
# load it is ignored at this point.
eval {
  require KorAP::XML::TEI::Tokenizer::KorAP;
  1;
};

our $VERSION = '2.3.1';

# Banner printed together with the help and version output
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";

# Set KORAPXMLTEI_DEBUG to 1 for minimal more debug output
# (no need to be parametrized)
use constant DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0;

# The environment switch still works, but the command line option
# should be used instead
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations'
  if $ENV{KORAPXMLTEI_INLINE};

# If set, inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;
47
# Command line options with their defaults
my $root_dir          = '.';
my $input_fname       = '';
my $tokenizer_call;
my $tokenizer_korap;
my $tokenizer_intern;
my $use_tokenizer_sentence_splits;
my $inline_tokens     = 'tokens#morpho';
my $inline_structures = 'struct#structure';
my $skip_inline_tokens = 0;

# Token annotations are skipped by default, unless the deprecated
# environment variable KORAPXMLTEI_INLINE asks for them
my $skip_inline_token_annotations = $ENV{KORAPXMLTEI_INLINE} ? 0 : 1;

my $skip_inline_tags_str = '';
my $base_dir    = 'base';
my $data_file   = 'data';
my $header_file = 'header';
my $tokens_file = 'tokens';
my $log_level   = 'notice';

# Set when a single dash is passed as an argument
my $stdio;

# Callback for --help: print the main documentation sections
my $print_help = sub {
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
};

# Callback for --version: print the version banner with the short usage
my $print_version = sub {
  pod2usage(
    -verbose => 0,
    -msg     => $VERSION_MSG,
    -output  => '-'
  );
};

# Parse options from the command line
GetOptions(
  'root|r=s'                        => \$root_dir,
  'input|i=s'                       => \$input_fname,
  'tokenizer-call|tc=s'             => \$tokenizer_call,
  'tokenizer-korap|tk'              => \$tokenizer_korap,
  'tokenizer-internal|ti'           => \$tokenizer_intern,
  'use-tokenizer-sentence-splits|s' => \$use_tokenizer_sentence_splits,
  'inline-tokens=s'                 => \$inline_tokens,
  'inline-structures=s'             => \$inline_structures,
  'skip-inline-tokens'              => \$skip_inline_tokens,
  'skip-inline-token-annotations'   => \$skip_inline_token_annotations,
  'skip-inline-tags=s'              => \$skip_inline_tags_str,
  'base-foundry=s'                  => \$base_dir,
  'data-file=s'                     => \$data_file,
  'header-file=s'                   => \$header_file,
  'tokens-file=s'                   => \$tokens_file,
  'log|l=s'                         => \$log_level,
  ''                                => \$stdio,
  'help|h'                          => $print_help,
  'version|v'                       => $print_version
);
84
Akrond3e1d282021-02-24 14:51:27 +010085
# Establish logger: messages go to STDERR (UTF-8 encoded)
# at the requested log level
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
if (DEBUG) {
  $log->notice('Debugging is activated');
}

# Tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';

# Tag that opens a header section
# TODO: IDS-specific (and redundant)
my $_HEADER_TAG = 'idsHeader';

# Remember to skip certain inline tags
# (passed as a comma-separated list on the command line)
my %skip_inline_tags;
if ($skip_inline_tags_str) {
  $skip_inline_tags{$_} = 1 for split /\s*,\s*/, $skip_inline_tags_str;
}
106
# External tokenization
#
# At most one external tokenizer is set up:
# --tokenizer-call takes precedence over --tokenizer-korap.
my $ext_tok;
if ($tokenizer_call) {
  $ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
  $ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}

elsif ($tokenizer_korap) {

  # The KorAP tokenizer module is only loaded optionally at startup,
  # so it may be unavailable. Abort with a clear message instead of
  # an obscure "Can't locate object method" error.
  unless (KorAP::XML::TEI::Tokenizer::KorAP->can('new')) {
    die $log->fatal(
      'The KorAP tokenizer (KorAP::XML::TEI::Tokenizer::KorAP) ' .
      'could not be loaded.'
    );
  };

  $ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
};

# With tokenizer sentence splits requested,
# inline <s> tags are skipped
if ($use_tokenizer_sentence_splits) {
  $skip_inline_tags{s} = 1;
};
Akron0c41ab32020-09-29 07:33:33 +0200121
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;

# Name of the directory and the file containing all inline structure
# information, passed as "<dir>#<file>" (the file defaults to "structure")
my ($_structure_dir, $_structure_file) = split /#/, "${inline_structures}#structure";

# Name of the directory and the file containing all inline token
# information, passed as "<dir>#<file>" (the file defaults to "morpho")
my ($_tokens_dir, $_tokens_file) = split /#/, "${inline_tokens}#morpho";

# A leading exclamation mark on the tokens directory requests exclusive
# storage of inline tokens: strip the mark and remember the request
if ($_tokens_dir =~ s/^!//) {
  $inline_tokens_exclusive = 1;
}

# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);

# Text directory (below $root_dir),
# empty as long as no text header was read
my $dir = '';

# Escaped version of text id
my $text_id_esc;

# Default encoding of the text
my $input_enc = 'UTF-8';

# Counter of text lines (needed for whitespace handling)
my $text_line = 0;
154
Peter Harders6f526a32020-06-29 21:44:41 +0200155
# Input file handle
my $input_fh;

if ($stdio) {

  # Single dash was set: read from STDIN
  $input_fh = *STDIN;
}
elsif ($input_fname ne '') {

  # Input flag was passed: read from the given file
  open($input_fh, '<', $input_fname)
    or die $log->fatal("File '$input_fname' could not be opened.");
}
else {

  # No input to process: print a short usage and stop
  pod2usage(
    -verbose  => 99,
    -sections => 'NAME|SYNOPSIS',
    -msg      => $VERSION_MSG,
    -output   => '-'
  );
  exit;
}

# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;

# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
  $skip_inline_tokens, \%skip_inline_tags, $inline_tokens_exclusive
);
192
193
# Reading input document
#
# The input is processed line by line. Header sections are parsed and
# written to the zip archive; a text level header sets the directory and
# id for the following text body. The lines of a text body are collected
# in a buffer and, once the closing tag is found, parsed and written as
# data, tokenization, structure and token files.
MAIN: while (<$input_fh>) {

  # Remove XML (multi-line) comments (<!--...-->)
  $_ = remove_xml_comments($input_fh, $_);

  # Set input encoding from the XML declaration.
  # The value ends at the quote character that opened it. This is
  # expressed with a lookahead, because a backreference is not available
  # inside a bracketed character class (there, \1 denotes the
  # character \x01).
  if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])((?:(?!\1).)+)\1/) {
    $input_enc = $2;
    next;
  };

  $_ = decode($input_enc, $_);
  $_ = replace_entities($_);

  # Start of text body
  if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {

    # Save both captures, as the following matches reset them
    my ($prefix, $suffix) = ($1, $2);

    # The opening tag needs to be the only content of the line
    if ($prefix !~ /^\s*$/ || $suffix !~ /^\s*$/) {
      die $log->fatal("input line number $.: " .
        "line with opening text-body tag '${_TEXT_BODY}' " .
        "contains additional information ... => Aborting (line=$_)");
    };

    # Text body data extracted from input document ($input_fh),
    # further processed by XML::LibXML::Reader
    my $text_buffer = '';

    # Iterate over all lines in the text body
    while (<$input_fh>) {

      $_ = remove_xml_comments($input_fh, $_);
      $_ = decode($input_enc, $_);
      $_ = replace_entities($_);

      # End of text body
      if ((my $pos = index($_, '</' . $_TEXT_BODY)) >= 0) {

        # write data.xml, structure.xml and possibly morpho.xml
        # and/or tokenization files

        # The closing tag needs to be the only content of the line
        if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
          die $log->fatal("input line number $.: " .
            "line with closing text-body tag '${_TEXT_BODY}'".
            " contains additional information ... => Aborting (line=$_)");
        };

        # No text level header was read for this text body
        if ($dir eq '') {

          # NOTE(review): the collected text is not parsed at this point,
          # so the logged data presumably does not belong to the skipped
          # text - confirm against KorAP::XML::TEI::Inline
          $log->warn(
            "Maybe empty textSigle => skipping this text ...\n" .
            'data=' . substr($inline->data->data, 0, 200)
          );
          next MAIN;
        };

        # Parse inline structure
        $inline->parse($text_id_esc, \$text_buffer);

        if (DEBUG) {
          $log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
        };

        my $data = $inline->data;

        # Write data.xml
        $data->to_zip(
          $zipper->new_stream("$dir/${data_file}.xml"),
          $text_id_esc
        );

        # Tokenize with external tokenizer
        if ($ext_tok) {

          # Tokenize and output
          $ext_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
            $text_id_esc
          );

          if ($use_tokenizer_sentence_splits) {
            $ext_tok->sentencize_from_previous_input($inline->structures);
          };
        };

        # Tokenize with internal tokenizer
        if ($tokenizer_intern) {

          # Tokenize and output
          $cons_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
            $text_id_esc
          )->reset;

          $aggr_tok->tokenize($data->data)->to_zip(
            $zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
            $text_id_esc
          )->reset;
        };

        # ~ write structures ~
        if (!$inline->structures->empty) {
          $inline->structures->to_zip(
            $zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
            $text_id_esc,
            2 # = structure serialization
          );
        };

        # ~ write tokens ~
        unless ($skip_inline_tokens || $inline->tokens->empty) {
          $inline->tokens->to_zip(
            $zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
            $text_id_esc,
            # Either 0 = tokens without inline or 1 = tokens with inline
            !$skip_inline_token_annotations
          );
        };

        # reinit.
        $dir = '';

        next MAIN;
      };


      # ~ whitespace handling ~

      # Fix whitespaces (see notes on whitespace fixing)

      # TODO:
      #   Maybe it's best, to keep the stripping of whitespace and
      #   to just remove the if-clause and to insert a blank by default
      #   (with possibly an option on how newlines in primary text should
      #   be handled (stripped or replaced by a whitespace)).

      # Remove consecutive whitespace at beginning and end (mostly one newline)
      s/^\s+//; s/\s+$//;

      # NOTE:
      #   this is only relevant, if a text consists of more than one line

      # TODO:
      #   find a better solution, or create a warning, if a text has more
      #   than one line ($text_line > 1)

      # TODO:
      #   do testing with 2 different corpora
      #   (one with only one-line texts, the other with several lines per text)

      # line contains at least one tag with at least one character contents
      if (m/<[^>]+>[^<]/) {

        # Increment counter for text lines
        $text_line++;

        # insert blank before 1st character
        # (for 2nd line and consecutive lines)
        $_ = ' ' . $_ if $text_line > 1;
      }

      # add line to buffer
      $text_buffer .= $_;
    };
  }

  # Start of header section
  # (the type attribute has to be in the same line as the header tag)
  elsif (m#^(.*)(\<${_HEADER_TAG}[^>]*?type=["'].*)$#) {

    # Save both captures, as the following match resets them
    my ($prefix, $content) = ($1, "$2\n");

    # Nothing but whitespace may precede the header tag
    if ($prefix !~ /^\s*$/) {
      die $log->fatal(
        "input line number $.: " .
        'line with opening header tag is not in expected format ... ' .
        "=> Aborting (line=$_)");
    };

    # Parse header (reads the remaining header lines from the input)
    my $header = KorAP::XML::TEI::Header->new($content, $input_enc)->parse($input_fh);

    # Header was parseable
    if ($header) {

      # Write header to zip
      my $file = $header->dir . '/' . $header_file . '.xml';

      $log->debug("Writing file $file") if DEBUG;

      $header->to_zip($zipper->new_stream($file));

      # Header is for text level
      if ($header->type eq 'text') {

        # Remember dir and sigles
        $dir = $header->dir;
        $text_id_esc = $header->id_esc;

        # log output for seeing progression
        $log->notice("$0: text_id=$text_id_esc");

        # Reset counter for text lines
        # (needed for whitespace handling)
        $text_line = 0;
      };
    };
  };
};
Peter Hardersd892a582020-02-12 15:45:22 +0100401
# Close the zipper, the external tokenizer (if one was set up)
# and the input handle
$zipper->close;

if ($ext_tok) {
  $ext_tok->close;
}

close $input_fh;
407
Peter Harders6f526a32020-06-29 21:44:41 +0200408
Akrond949e182020-02-14 12:23:57 +0100409__END__
410
411=pod
412
413=encoding utf8
414
415=head1 NAME
416
417tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
418
419=head1 SYNOPSIS
420
Akrona2cb2812021-10-30 10:29:08 +0200421 cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
Akrond949e182020-02-14 12:23:57 +0100422
423=head1 DESCRIPTION
424
Akronee434b12020-07-08 12:53:01 +0200425C<tei2korapxml> is a script to convert TEI P5 and
Akrond72baca2021-07-23 13:25:32 +0200426L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
Akronee434b12020-07-08 12:53:01 +0200427based documents to the
428L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
Peter Harders6f526a32020-06-29 21:44:41 +0200429
Akrond949e182020-02-14 12:23:57 +0100430This program is usually called from inside another script.
431
Akronee434b12020-07-08 12:53:01 +0200432=head1 FORMATS
433
434=head2 Input restrictions
435
436=over 2
437
438=item
439
Akronee434b12020-07-08 12:53:01 +0200440TEI P5 formatted input with certain restrictions:
441
442=over 4
443
444=item
445
446B<mandatory>: text-header with integrated textsigle, text-body
447
448=item
449
450B<optional>: corp-header with integrated corpsigle,
451doc-header with integrated docsigle
452
453=back
454
455=item
456
Tokens inside the primary text must not be
newline separated, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and a conversion of newlines
into blanks between 2 tokens could lead to additional blanks,
where there should be none (e.g.: punctuation characters like C<,> or
C<.> should not be separated from their predecessor token).
(see also code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
Akronee434b12020-07-08 12:53:01 +0200464
Akron940ca6f2021-10-11 12:38:39 +0200465=item
466
467Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
468need to be defined in the same line as the header tag.
469
Akronee434b12020-07-08 12:53:01 +0200470=back
471
472=head2 Notes on the output
473
474=over 2
475
476=item
477
478zip file output (default on C<stdout>) with utf8 encoded entries
479(which together form the KorAP-XML format)
480
481=back
482
Akrond949e182020-02-14 12:23:57 +0100483=head1 INSTALLATION
484
Marc Kupietze83a4e92021-03-16 20:51:26 +0100485C<tei2korapxml> requires L<libxml2-dev> bindings and L<File::ShareDir::Install> to be installed.
486When these requirements are met, the preferred way to install the script is
Akrond949e182020-02-14 12:23:57 +0100487to use L<cpanm|App::cpanminus>.
488
489 $ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
490
491In case everything went well, the C<tei2korapxml> tool will
492be available on your command line immediately.
Peter Harders6f526a32020-06-29 21:44:41 +0200493
Akrond949e182020-02-14 12:23:57 +0100494Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
495
496=head1 OPTIONS
497
498=over 2
499
Akrona2cb2812021-10-30 10:29:08 +0200500=item B<--input|-i>
501
502The input file to process. If no specific input is defined and a single
503dash C<-> is passed as an argument, data is read from C<STDIN>.
504
505
Akron4e603a52020-07-27 14:23:49 +0200506=item B<--root|-r>
Akrond949e182020-02-14 12:23:57 +0100507
Akron4e603a52020-07-27 14:23:49 +0200508The root directory for output. Defaults to C<.>.
Akrond949e182020-02-14 12:23:57 +0100509
510=item B<--help|-h>
511
512Print help information.
513
514=item B<--version|-v>
515
516Print version information.
517
Akron4e603a52020-07-27 14:23:49 +0200518=item B<--tokenizer-call|-tc>
519
520Call an external tokenizer process, that will tokenize
Akron11484782021-11-03 20:12:14 +0100521from STDIN and outputs the offsets of all tokens.
522
523Texts are separated using C<\x04\n>. The external process
524should add a new line per text.
525
526If the L</--use-tokenizer-sentence-splits> option is activated,
527sentences are marked by offset as well in new lines.
528
To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
531
532 $ cat corpus.i5.xml | tei2korapxml -s \
533 $ -tc 'datok tokenize \
534 $ -t ./tokenizer.matok \
535 $ -p --newline-after-eot --no-sentences \
536 $ --no-tokens --sentence-positions -' - \
537 $ > corpus.korapxml.zip
Akron4e603a52020-07-27 14:23:49 +0200538
Marc Kupietz1e882fb2020-09-09 00:05:46 +0200539=item B<--tokenizer-korap|-tk>
540
541Use the standard KorAP/DeReKo tokenizer.
542
Akron6d7b8e42020-09-29 07:37:41 +0200543=item B<--tokenizer-internal|-ti>
Akron4e603a52020-07-27 14:23:49 +0200544
545Tokenize the data using two embedded tokenizers,
546that will take an I<Aggressive> and a I<conservative>
547approach.
548
Akron75d63142021-02-23 18:40:56 +0100549=item B<--skip-inline-tokens>
550
551Boolean flag indicating that inline tokens should not
552be processed. Defaults to false (meaning inline tokens will be processed).
553
Akron692d17d2021-03-05 13:21:03 +0100554=item B<--skip-inline-token-annotations>
555
556Boolean flag indicating that inline token annotations should not
557be processed. Defaults to true (meaning inline token annotations
558won't be processed).
559
Akronca70a1d2021-02-25 16:21:31 +0100560=item B<--skip-inline-tags> <tags>
Akron54c3ff12021-02-25 11:33:37 +0100561
562Expects a comma-separated list of tags to be ignored when the structure
563is parsed. Content of these tags however will be processed.
564
Akron1a5271a2021-02-18 13:18:15 +0100565=item B<--inline-tokens> <foundry>#[<file>]
566
567Define the foundry and file (without extension)
568to store inline token information in.
Akron8a0c4bf2021-03-16 16:51:21 +0100569Unless C<--skip-inline-token-annotations> is set,
570this will contain annotations as well.
Akron1a5271a2021-02-18 13:18:15 +0100571Defaults to C<tokens> and C<morpho>.
572
Akrone2819a12021-10-12 15:52:55 +0200573The inline token data will also be stored in the
574inline structures file (see I<--inline-structures>),
575unless the inline token foundry is prepended
576by an B<!> exclamation mark, indicating that inline
577tokens are stored exclusively in the inline tokens
578file.
579
580Example:
581
  tei2korapxml --inline-tokens '!gingko#morpho' - < data.i5.xml > korapxml.zip
583
Akrondd0be8f2021-02-18 19:29:41 +0100584=item B<--inline-structures> <foundry>#[<file>]
585
586Define the foundry and file (without extension)
587to store inline structure information in.
Defaults to C<struct> and C<structure>.
589
Akron26a71522021-02-19 10:27:37 +0100590=item B<--base-foundry> <foundry>
591
592Define the base foundry to store newly generated
593token information in.
594Defaults to C<base>.
595
596=item B<--data-file> <file>
597
598Define the file (without extension)
599to store primary data information in.
600Defaults to C<data>.
601
602=item B<--header-file> <file>
603
604Define the file name (without extension)
605to store header information on
606the corpus, document, and text level in.
607Defaults to C<header>.
608
Marc Kupietz985da0c2021-02-15 19:29:50 +0100609=item B<--use-tokenizer-sentence-splits|-s>
610
611Replace existing with, or add new, sentence boundary information
Akron11484782021-11-03 20:12:14 +0100612provided by the tokenizer.
613Currently KorAP-tokenizer and certain external tokenizers support
614these boundaries.
Marc Kupietz985da0c2021-02-15 19:29:50 +0100615
Akron91705d72021-02-19 10:59:45 +0100616=item B<--tokens-file> <file>
617
618Define the file (without extension)
619to store generated token information in
620(either from the KorAP tokenizer or an externally called tokenizer).
621Defaults to C<tokens>.
622
Akron3378dfd2020-08-01 15:01:36 +0200623=item B<--log|-l>
624
625Loglevel for I<Log::Any>. Defaults to C<notice>.
626
Akrond949e182020-02-14 12:23:57 +0100627=back
628
Akronb3649472020-09-29 08:24:46 +0200629=head1 ENVIRONMENT VARIABLES
630
631=over 2
632
633=item B<KORAPXMLTEI_DEBUG>
634
635Activate minimal debugging.
636Defaults to C<false>.
637
Akronb3649472020-09-29 08:24:46 +0200638=back
639
Akrond949e182020-02-14 12:23:57 +0100640=head1 COPYRIGHT AND LICENSE
641
Marc Kupietze955ecc2021-02-17 17:42:01 +0100642Copyright (C) 2021, L<IDS Mannheim|https://www.ids-mannheim.de/>
Akrond949e182020-02-14 12:23:57 +0100643
644Author: Peter Harders
645
Akronaabd0952020-09-29 07:35:08 +0200646Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
Akrond949e182020-02-14 12:23:57 +0100647
648L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
649Corpus Analysis Platform at the
Akrond72baca2021-07-23 13:25:32 +0200650L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
Akrond949e182020-02-14 12:23:57 +0100651member of the
652L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
653
654This program is free software published under the
Marc Kupietze955ecc2021-02-17 17:42:01 +0100655L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
Akrond949e182020-02-14 12:23:57 +0100656
657=cut
Akronf8088e62021-02-18 16:18:59 +0100658
659# NOTES
660
Akronf8088e62021-02-18 16:18:59 +0100661## Notes on segfault prevention
662
Akron91577922021-02-19 10:32:54 +0100663binmode on the input handler prevents segfaulting of 'XML::LibXML::Reader' inside the main loop
Akronf8088e62021-02-18 16:18:59 +0100664(see notes on 'PerlIO layers' in 'man XML::LibXML'),
665removing 'use open qw(:std :utf8)' would fix this problem too, but using binmode on input is more granular
see in perluniintro: You can switch encodings on an already opened stream by using "binmode()".
667see in perlfunc: If LAYER is omitted or specified as ":raw" the filehandle is made suitable for passing binary data.