#!/usr/bin/env perl
use strict;
use warnings;
use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);
use Encode qw(decode);
use FindBin;
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
our $VERSION = '2.6.0';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
use constant {
# Set to 1 for minimal additional debug output (no need to be parameterized)
DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
};
if ($ENV{KORAPXMLTEI_INLINE}) {
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
};
# Inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;
# Inline dependencies won't be stored in the tokens file
my $inline_deps_exclusive = 0;
# Parse options from the command line
GetOptions(
'root|r=s' => \(my $root_dir = '.'),
'input|i=s' => \(my $input_fname = ''),
'output|o=s' => \(my $output_fname = ''),
'tokenizer-call|tc=s' => \(my $tokenizer_call),
'tokenizer-korap|tk' => \(my $tokenizer_korap),
'tokenizer-internal|ti' => \(my $tokenizer_intern),
'no-tokenizer' => \(my $no_tokenizer),
'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
'inline-dependencies=s' => \(my $inline_dependencies),
'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
'skip-inline-token-annotations!' => \(
my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
'base-foundry=s' => \(my $base_dir = 'base'),
'data-file=s' => \(my $data_file = 'data'),
'header-file=s' => \(my $header_file = 'header'),
'tokens-file=s' => \(my $tokens_file = 'tokens'),
'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
'log|l=s' => \(my $log_level = 'notice'),
'required-version|rv=s' => \(my $required_version),
'' => \(my $stdio),
'help|h' => sub {
pod2usage(
-verbose => 99,
-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
-msg => $VERSION_MSG,
-output => '-'
)
},
'version|v' => sub {
pod2usage(
-verbose => 0,
-msg => $VERSION_MSG,
-output => '-'
);
}
);
# Establish logger
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;
if ($required_version) {
$required_version =~ /^\s*(\d+\.\d+\.\d+(-TRIAL)?)\s*$/;
if (!$1 || $1 ne $VERSION) {
$log->error("Required version $required_version mismatches version $VERSION");
exit(1);
};
};
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
($what, $with) = split('@', $xmlid_to_textsigle);
$what = qr!$what!;
};
# tag (without attributes), which contains the primary text
my $_TEXT_BODY = 'text';
# optional
# Remember to skip certain inline tags
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
foreach (split /\s*,\s*/, $skip_inline_tags_str) {
$skip_inline_tags{$_} = 1;
};
};
# External tokenization
my $ext_tok;
if ($tokenizer_call) {
$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}
# KorAP tokenization
elsif ($tokenizer_korap) {
unless (eval {
require KorAP::XML::TEI::Tokenizer::KorAP;
1;
}) {
$log->error("KorAP::XML::TEI::Tokenizer::KorAP cannot be loaded: $@");
exit(1);
};
my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;
if ($korap_tok_ver ne $VERSION) {
$log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
exit(1);
};
$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}
# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
$log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
exit(1);
};
if (!$no_tokenizer && $use_tokenizer_sentence_splits) {
$skip_inline_tags{s} = 1;
};
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
# Name of the directory and the file containing all inline structure information
# (except for inline token information)
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
# Name of the directory and the file containing all inline token information,
# i.e. tokens annotated in the input document (unless skipped)
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
if (index($_tokens_dir, '!') == 0) {
$_tokens_dir = substr($_tokens_dir, 1);
$inline_tokens_exclusive = 1;
};
my ($_dep_dir, $_dep_file);
if ($inline_dependencies) {
($_dep_dir, $_dep_file) = split '#', $inline_dependencies . '#dependency';
$inline_dependencies = 1;
if ($_dep_dir && index($_dep_dir, '!') == 0) {
$_dep_dir = substr($_dep_dir, 1);
$inline_deps_exclusive = 1;
};
};
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir, $output_fname);
# text directory (below $root_dir)
my $dir = '';
# Escaped version of text id
my $text_id_esc;
# Default encoding of the text
my $input_enc = 'UTF-8';
# text line (needed for whitespace handling)
my $text_line = 0;
# Input file handle (default: stdin)
my $input_fh;
# Single dash was set
if ($stdio) {
$input_fh = *STDIN;
}
# Input flag was passed
elsif ($input_fname ne '') {
unless (open($input_fh, '<', $input_fname)) {
die $log->fatal("File '$input_fname' could not be opened.");
};
}
# No input to process
else {
pod2usage(
-verbose => 99,
-sections => 'NAME|SYNOPSIS',
-msg => $VERSION_MSG,
-output => '-'
);
exit;
};
# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
$skip_inline_tokens,
\%skip_inline_tags,
$inline_tokens_exclusive,
$inline_dependencies
);
# Reading input document
MAIN: while (<$input_fh>) {
# remove HTML (multi-line) comments (<!--...-->)
$_ = remove_xml_comments($input_fh, $_);
# Set input encoding
if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])(.+?)\1/) {
$input_enc = $2;
next;
};
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
# Start of text body
if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
my $suffix = $2;
if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with opening text-body tag '${_TEXT_BODY}' " .
"contains additional information ... => Aborting (line=$_)");
};
# Text body data extracted from input document ($input_fh),
# further processed by XML::LibXML::Reader
my $text_buffer = '';
# Iterate over all lines in the text body
while (<$input_fh>) {
$_ = remove_xml_comments($input_fh, $_);
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
# End of text body
if ((my $pos = index($_, "</$_TEXT_BODY>")) >= 0) {
# write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with closing text-body tag '${_TEXT_BODY}'".
" contains additional information ... => Aborting (line=$_)");
};
if ($dir eq '') {
$log->warn(
"Maybe empty textSigle => skipping this text ...\n" .
'data=' . substr($inline->data->data, 0, 200)
);
next MAIN;
};
# Parse inline structure
$inline->parse($text_id_esc, \$text_buffer);
if (DEBUG) {
$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
};
my $data = $inline->data;
# Write data.xml
$data->to_zip(
$zipper->new_stream("$dir/${data_file}.xml"),
$text_id_esc
);
# Tokenize with external tokenizer
if ($ext_tok) {
# Tokenize and output
$ext_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
$text_id_esc
);
if ($use_tokenizer_sentence_splits) {
$ext_tok->sentencize_from_previous_input($inline->structures);
};
};
# Tokenize with internal tokenizer
if ($tokenizer_intern) {
# Tokenize and output
$cons_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
$text_id_esc
)->reset;
$aggr_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
$text_id_esc
)->reset;
};
# ~ write structures ~
unless ($inline->structures->empty) {
$inline->structures->to_zip(
$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
$text_id_esc,
2 # = structure serialization
);
};
# ~ write tokens ~
unless ($skip_inline_tokens || $inline->tokens->empty) {
$inline->tokens->to_zip(
$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
$text_id_esc,
# Serialization flag:
# 0 = tokens without annotations,
# 1 = tokens with annotations,
# 4 = tokens with annotations, but without inline dependencies
($skip_inline_token_annotations ? 0 : ($inline_deps_exclusive ? 4 : 1))
);
};
# ~ write dependencies ~
unless ($inline->dependencies->empty) {
$inline->dependencies->to_zip(
$zipper->new_stream("$dir/$_dep_dir/${_dep_file}.xml"),
$text_id_esc,
3 # = dependency serialization
);
};
# reinit.
$dir = '';
next MAIN;
};
# ~ whitespace handling ~
# Fix whitespaces (see notes on whitespace fixing)
# TODO:
# Maybe it's best, to keep the stripping of whitespace and
# to just remove the if-clause and to insert a blank by default
# (with possibly an option on how newlines in primary text should
# be handled (stripped or replaced by a whitespace)).
# Remove consecutive whitespace at beginning and end (mostly one newline)
s/^\s+//; s/\s+$//;
# NOTE:
# this is only relevant, if a text consists of more than one line
# TODO:
# find a better solution, or create a warning, if a text has more
# than one line ($text_line > 1)
# TODO:
# do testing with 2 different corpora
# (one with only one-line texts, the other with several lines per text)
# line contains at least one non-tag character
if (m/^[^<]*$/ || m/(?:<[^>]+>[^<])|(?:[^<]<[^>]+>)/) {
# Increment counter for text lines
$text_line++;
# insert blank before 1st character
# (for 2nd line and consecutive lines)
$_ = ' ' . $_ if $text_line > 1;
}
# add line to buffer
$text_buffer .= $_;
};
}
elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
my $leadin = $1;
my $id = $3;
my $sigle = $3;
if ($what) {
$_ = $id;
eval "s|$what|$with|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
$sigle = $_;
$log->debug("Converted text id `$id' to sigle `$sigle'");
};
$sigle =~ s/\./-/g;
my @parts = split(/[\/_]/, $sigle);
if (@parts != 3) {
die $log->fatal(
"input line number $.: " .
"ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
"=> Aborting (line=$_)");
};
$dir = join("/", @parts);
$text_id_esc = "$parts[0]/$parts[1].$parts[2]";
$log->notice("$0: text_id=$text_id_esc");
if ($leadin !~ /^\s*$/) {
die $log->fatal(
"input line number $.: " .
'line with opening TEI tag is not in expected format ... ' .
"=> Aborting (line=$_)");
};
}
# Start of header section
elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
my $content = "$2\n";
if ($1 !~ /^\s*$/) {
die $log->fatal(
"input line number $.: " .
'line with opening header tag is not in expected format ... ' .
"=> Aborting (line=$_)");
};
# Parse header
my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);
# Header was parseable
if ($header) {
# Write header to zip
my $file = $header->dir . '/' . $header_file . '.xml';
$log->debug("Writing file $file") if DEBUG;
$header->to_zip($zipper->new_stream($file));
# Header is for text level
if ($header->type eq 'text') {
# Remember dir and sigles
$dir = $header->dir;
$text_id_esc = $header->id_esc;
# log output for seeing progression
$log->notice("$0: text_id=$text_id_esc");
# Reset counter for text lines
# (needed for whitespace handling)
$text_line = 0;
};
};
};
};
$zipper->close;
$ext_tok->close if $ext_tok;
close $input_fh;
__END__
=pod
=encoding utf8
=head1 NAME
tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
=head1 SYNOPSIS
cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
=head1 DESCRIPTION
C<tei2korapxml> is a script to convert TEI P5 and
L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
based documents to the
L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
This program is usually called from inside another script.
=head1 FORMATS
=head2 Input restrictions
=over 2
=item
TEI P5 formatted input with certain restrictions:
=over 4
=item
B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
=item
B<optional>: corp-header with integrated corpsigle,
doc-header with integrated docsigle
=back
=item
Tokens inside the primary text must not be separated by
newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>), and converting newlines
into blanks between two tokens could introduce additional blanks
where there should be none (e.g. punctuation characters like C<,> or
C<.> should not be separated from their preceding token)
(see also the code section C<~ whitespace handling ~> in C<script/tei2korapxml>).
=item
Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>,
need to be defined on the same line as the header tag
(see the input sketch below).
=back
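For orientation, the following is a minimal sketch of the expected nesting in an
I5-style document; apart from the placement of the text sigle inside the text
header and the C<type> attribute on the opening header line, the shown header
content is only illustrative and depends on the corpus at hand:
  <idsText>
    <idsHeader type="text">
      <fileDesc>
        <titleStmt>
          <textSigle>COR/DOC.00001</textSigle>
        </titleStmt>
      </fileDesc>
    </idsHeader>
    <text>
      ... primary text ...
    </text>
  </idsText>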
=head2 Notes on the output
=over 2
=item
zip file output (default on C<stdout>) with utf8 encoded entries
(which together form the KorAP-XML format)
=back
=head1 INSTALLATION
C<tei2korapxml> requires the C<libxml2> development files (e.g. the C<libxml2-dev> package) and L<File::ShareDir::Install> to be installed.
When these requirements are met, the preferred way to install the script is
to use L<cpanm|App::cpanminus>.
$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
In case everything went well, the C<tei2korapxml> tool will
be available on your command line immediately.
Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
=head1 OPTIONS
=over 2
=item B<--input|-i>
The input file to process. If no specific input is defined and a single
dash C<-> is passed as an argument, data is read from C<STDIN>.
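A possible invocation reading from a file and writing the zip to a file
(file names are placeholders):
  tei2korapxml -tk -i corpus.i5.xml -o corpus.korapxml.zip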
=item B<--output|-o>
The output zip file to be created. If no specific output is defined,
data is written to C<STDOUT>.
=item B<--root|-r>
The root directory for output. Defaults to C<.>.
=item B<--help|-h>
Print help information.
=item B<--version|-v>
Print version information.
=item B<--tokenizer-korap|-tk>
Use the standard KorAP/DeReKo tokenizer.
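For example:
  cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip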
=item B<--tokenizer-internal|-ti>
Tokenize the data using two embedded tokenizers,
which take an I<aggressive> and a I<conservative>
approach, respectively.
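For example:
  cat corpus.i5.xml | tei2korapxml -ti - > corpus.korapxml.zip
Both resulting token layers are stored under the base foundry
(see L</--base-foundry>).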
=item B<--tokenizer-call|-tc>
Call an external tokenizer process that tokenizes
from STDIN and outputs the offsets of all tokens.
Texts are separated by C<\x04\n>. The external process
should output a newline after each text.
If the L</--use-tokenizer-sentence-splits> option is activated,
sentence boundaries are output as offsets on additional lines as well.
To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
  $ cat corpus.i5.xml | tei2korapxml -s \
      -tc 'datok tokenize \
        -t ./tokenizer.matok \
        -p --newline-after-eot --no-sentences \
        --no-tokens --sentence-positions -' - \
      > corpus.korapxml.zip
=item B<--no-tokenizer>
Boolean flag indicating that no tokenizer should be used.
This is meant to ensure that by default a final token layer always
exists.
If a separate tokenizer is chosen, this flag is ignored.
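For example, when the token layer should come exclusively from inline annotations:
  cat corpus.i5.xml | tei2korapxml --no-tokenizer - > corpus.korapxml.zip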
=item B<--skip-inline-tokens>
Boolean flag indicating that inline tokens should not
be processed. Defaults to false (meaning inline tokens will be processed).
=item B<--skip-inline-token-annotations>
Boolean flag indicating that inline token annotations should not
be processed. Defaults to true (meaning inline token annotations
won't be processed). Can be negated with
C<--no-skip-inline-token-annotations>.
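For example, to keep inline token annotations in the output:
  cat corpus.i5.xml | tei2korapxml -tk \
    --no-skip-inline-token-annotations - > corpus.korapxml.zip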
=item B<--skip-inline-tags> <tags>
Expects a comma-separated list of tags to be ignored when the structure
is parsed. The content of these tags, however, will still be processed.
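Example (the tag names are merely illustrative):
  cat corpus.i5.xml | tei2korapxml -tk \
    --skip-inline-tags 'seg,milestone' - > corpus.korapxml.zip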
=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
Expects a regular expression replacement rule (search pattern and
replacement separated by B<@>) to convert text id attributes to text sigles
with three parts (separated by B</>).
Example:
tei2korapxml \
--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
-tk - < t/data/icc_german_sample.p5.xml
Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
=item B<--inline-tokens> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline token information in.
Unless C<--skip-inline-token-annotations> is set,
this will contain annotations as well.
Defaults to C<tokens> and C<morpho>.
The inline token data will also be stored in the
inline structures file (see I<--inline-structures>),
unless the inline token foundry is prefixed with an
exclamation mark B<!>, indicating that inline
tokens are stored exclusively in the inline tokens
file.
Example:
tei2korapxml --no-tokenizer --inline-tokens \
'!gingko#morpho' < data.i5.xml > korapxml.zip
=item B<--inline-dependencies> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline dependency information in.
The file defaults to C<dependency>. If this option
is not set, it is ignored and dependency
attributes are stored in the inline tokens file
(unless skipped).
The dependency data will also be stored in the
inline tokens file (see I<--inline-tokens>),
unless the inline dependencies foundry is prefixed with an
exclamation mark B<!>, indicating that inline
dependency data is stored exclusively in the inline
dependencies file.
Example:
tei2korapxml --no-tokenizer --inline-dependencies \
'gingko#dependency' < data.i5.xml > korapxml.zip
=item B<--inline-structures> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline structure information in.
Defaults to C<struct> and C<structure>.
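Example (with an illustrative foundry name):
  tei2korapxml --no-tokenizer --inline-structures \
    'myfoundry#structure' < data.i5.xml > korapxml.zip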
=item B<--base-foundry> <foundry>
Define the base foundry to store newly generated
token information in.
Defaults to C<base>.
=item B<--data-file> <file>
Define the file (without extension)
to store primary data information in.
Defaults to C<data>.
=item B<--header-file> <file>
Define the file name (without extension)
to store header information on
the corpus, document, and text level in.
Defaults to C<header>.
=item B<--use-tokenizer-sentence-splits|-s>
Replace existing sentence boundary information with
boundaries provided by the tokenizer, or add them if none exist.
Currently the KorAP tokenizer and certain external tokenizers support
these boundaries.
=item B<--tokens-file> <file>
Define the file (without extension)
to store generated token information in
(either from the KorAP tokenizer or an externally called tokenizer).
Defaults to C<tokens>.
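For example, to store externally generated tokens under a custom foundry and
file name (both names are placeholders):
  cat corpus.i5.xml | tei2korapxml -tk --base-foundry mybase \
    --tokens-file mytokens - > corpus.korapxml.zip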
=item B<--log|-l>
Loglevel for I<Log::Any>. Defaults to C<notice>.
=back
=head1 ENVIRONMENT VARIABLES
=over 2
=item B<KORAPXMLTEI_DEBUG>
Activate minimal debugging.
Defaults to C<false>.
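For example (combined with C<-l debug>, so that the additional messages are
actually emitted):
  KORAPXMLTEI_DEBUG=1 tei2korapxml -tk -l debug - \
    < corpus.i5.xml > corpus.korapxml.zip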
=back
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2021-2024, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: Peter Harders
Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
=cut
# NOTES
## Notes on segfault prevention
Using binmode on the input handle prevents 'XML::LibXML::Reader' from segfaulting inside the main loop
(see the notes on 'PerlIO layers' in 'man XML::LibXML').
Removing 'use open qw(:std :utf8)' would also fix this problem, but using binmode on the input handle is more granular.
See perluniintro: "You can switch encodings on an already opened stream by using binmode()."
See perlfunc: "If LAYER is omitted or specified as ':raw' the filehandle is made suitable for passing binary data."
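A minimal sketch of the approach, with a placeholder file name:
  open(my $fh, '<', 'corpus.i5.xml') or die $!;
  binmode($fh);  # drop all PerlIO layers, i.e. treat the stream as raw bytes
  # lines read from $fh are decoded explicitly later (cf. Encode::decode in the main loop)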