#!/usr/bin/env perl
use strict;
use warnings;
use Log::Any '$log';
use Log::Any::Adapter;
use Pod::Usage;
use Getopt::Long qw(GetOptions :config no_auto_abbrev);
use File::Basename qw(dirname);
use Encode qw(decode);
use FindBin;
BEGIN {
unshift @INC, "$FindBin::Bin/../lib";
};
use KorAP::XML::TEI qw!remove_xml_comments replace_entities!;
use KorAP::XML::TEI::Tokenizer::External;
use KorAP::XML::TEI::Tokenizer::Conservative;
use KorAP::XML::TEI::Tokenizer::Aggressive;
use KorAP::XML::TEI::Zipper;
use KorAP::XML::TEI::Header;
use KorAP::XML::TEI::Inline;
our $VERSION = '2.3.4';
our $VERSION_MSG = "\ntei2korapxml - v$VERSION\n";
use constant {
# Set to 1 for minimal additional debug output (no need to be parametrized)
DEBUG => $ENV{KORAPXMLTEI_DEBUG} // 0
};
if ($ENV{KORAPXMLTEI_INLINE}) {
warn 'KORAPXMLTEI_INLINE is deprecated in favor of --skip-inline-token-annotations';
};
# If set, inline tokens won't be stored in the structure file
my $inline_tokens_exclusive = 0;
# Parse options from the command line
GetOptions(
'root|r=s' => \(my $root_dir = '.'),
'input|i=s' => \(my $input_fname = ''),
'tokenizer-call|tc=s' => \(my $tokenizer_call),
'tokenizer-korap|tk' => \(my $tokenizer_korap),
'tokenizer-internal|ti' => \(my $tokenizer_intern),
'no-tokenizer' => \(my $no_tokenizer),
'use-tokenizer-sentence-splits|s' => \(my $use_tokenizer_sentence_splits),
'inline-tokens=s' => \(my $inline_tokens = 'tokens#morpho'),
'inline-structures=s' => \(my $inline_structures = 'struct#structure'),
'skip-inline-tokens' => \(my $skip_inline_tokens = 0),
'skip-inline-token-annotations' => \(
my $skip_inline_token_annotations = ($ENV{KORAPXMLTEI_INLINE} ? 0 : 1)),
'skip-inline-tags=s' => \(my $skip_inline_tags_str = ''),
'base-foundry=s' => \(my $base_dir = 'base'),
'data-file=s' => \(my $data_file = 'data'),
'header-file=s' => \(my $header_file = 'header'),
'tokens-file=s' => \(my $tokens_file = 'tokens'),
'xmlid-to-textsigle|x=s'=> \(my $xmlid_to_textsigle = ''),
'log|l=s' => \(my $log_level = 'notice'),
'required-version|rv=s' => \(my $required_version),
'' => \(my $stdio),
'help|h' => sub {
pod2usage(
-verbose => 99,
-sections => 'NAME|DESCRIPTION|SYNOPSIS|ARGUMENTS|OPTIONS',
-msg => $VERSION_MSG,
-output => '-'
)
},
'version|v' => sub {
pod2usage(
-verbose => 0,
-msg => $VERSION_MSG,
-output => '-'
);
}
);
# Establish logger
binmode(STDERR, ':encoding(UTF-8)');
Log::Any::Adapter->set('Stderr', log_level => $log_level);
$log->notice('Debugging is activated') if DEBUG;
if ($required_version) {
$required_version =~ /^\s*(\d+\.\d+\.\d+)\s*$/;
if (!$1 || $1 ne $VERSION) {
$log->error("Required version $required_version mismatches version $VERSION");
exit(1);
};
};
my ($what, $with);
if ($xmlid_to_textsigle ne '') {
($what, $with) = split('@', $xmlid_to_textsigle);
$what = qr!$what!;
};
# Tag (without attributes) that contains the primary text
my $_TEXT_BODY = 'text';
# Optional set of inline tags to skip when parsing structures
my %skip_inline_tags = ();
if ($skip_inline_tags_str) {
foreach (split /\s*,\s*/, $skip_inline_tags_str) {
$skip_inline_tags{$_} = 1;
};
};
# External tokenization
my $ext_tok;
if ($tokenizer_call) {
$ext_tok = KorAP::XML::TEI::Tokenizer::External->new($tokenizer_call);
$ext_tok->sentence_splits(1) if $use_tokenizer_sentence_splits;
}
# KorAP tokenization
elsif ($tokenizer_korap) {
# Abort if the KorAP tokenizer module cannot be loaded
eval {
  require KorAP::XML::TEI::Tokenizer::KorAP;
  1;
} or do {
  $log->error("Unable to load KorAP::XML::TEI::Tokenizer::KorAP: $@");
  exit(1);
};
my $korap_tok_ver = $KorAP::XML::TEI::Tokenizer::KorAP::VERSION;
if ($korap_tok_ver ne $VERSION) {
$log->error("KorAP-Tokenizer version ($korap_tok_ver) differs from the expected version ($VERSION)");
exit(1);
};
$ext_tok = KorAP::XML::TEI::Tokenizer::KorAP->new($use_tokenizer_sentence_splits);
}
# No internal tokenizer chosen
elsif (!$tokenizer_intern && !$no_tokenizer) {
$log->error("No tokenizer chosen. If only internal tokens should be used, pass the --no-tokenizer flag");
exit(1);
};
if ($use_tokenizer_sentence_splits) {
$skip_inline_tags{s} = 1;
};
# Internal tokenization
my $aggr_tok = KorAP::XML::TEI::Tokenizer::Aggressive->new;
my $cons_tok = KorAP::XML::TEI::Tokenizer::Conservative->new;
# Name of the directory and the file containing all inline structure information
# (except for inline token information)
my ($_structure_dir, $_structure_file) = split '#', $inline_structures . '#structure';
# Name of the directory and the file containing all inline token information
# (i.e. tokens from the inline tokens tag, if these are processed)
my ($_tokens_dir, $_tokens_file) = split '#', $inline_tokens . '#morpho';
if (index($_tokens_dir, '!') == 0) {
$_tokens_dir = substr($_tokens_dir, 1);
$inline_tokens_exclusive = 1;
};
# Initialize zipper
my $zipper = KorAP::XML::TEI::Zipper->new($root_dir);
# text directory (below $root_dir)
my $dir = '';
# Escaped version of text id
my $text_id_esc;
# Default encoding of the text
my $input_enc = 'UTF-8';
# text line (needed for whitespace handling)
my $text_line = 0;
# Input file handle (default: stdin)
my $input_fh;
# Single dash was set
if ($stdio) {
$input_fh = *STDIN;
}
# Input flag was passed
elsif ($input_fname ne '') {
unless (open($input_fh, '<', $input_fname)) {
die $log->fatal("File '$input_fname' could not be opened.");
};
}
# No input to process
else {
pod2usage(
-verbose => 99,
-sections => 'NAME|SYNOPSIS',
-msg => $VERSION_MSG,
-output => '-'
);
exit;
};
# Prevents segfaulting (see notes on segfault prevention)
binmode $input_fh;
# Create inline parser object
my $inline = KorAP::XML::TEI::Inline->new(
$skip_inline_tokens,
\%skip_inline_tags,
$inline_tokens_exclusive
);
# Reading input document
MAIN: while (<$input_fh>) {
# Remove XML (multi-line) comments (<!--...-->)
$_ = remove_xml_comments($input_fh, $_);
# Set input encoding
if (index($_, '<?xml') == 0 && $_ =~ /\sencoding=(['"])([^\1]+?)\1/) {
$input_enc = $2;
next;
};
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
# Start of text body
if (index($_, $_TEXT_BODY) >= 0 && m#^(.*)<${_TEXT_BODY}(?: [^>]*)?>(.*)$#) {
my $suffix = $2;
if ($1 !~ /^\s*$/ || $suffix !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with opening text-body tag '${_TEXT_BODY}' " .
"contains additional information ... => Aborting (line=$_)");
};
# Text body data extracted from input document ($input_fh),
# further processed by XML::LibXML::Reader
my $text_buffer = '';
# Iterate over all lines in the text body
while (<$input_fh>) {
$_ = remove_xml_comments($input_fh, $_);
$_ = decode($input_enc, $_);
$_ = replace_entities($_);
# End of text body
if ((my $pos = index($_, '</' . $_TEXT_BODY)) >= 0) {
# Write data.xml, structure.xml and possibly morpho.xml and/or tokenization files
if ((substr($_, 0, $pos) . substr($_, length("</$_TEXT_BODY>") + $pos)) !~ /^\s*$/) {
die $log->fatal("input line number $.: " .
"line with closing text-body tag '${_TEXT_BODY}'".
" contains additional information ... => Aborting (line=$_)");
};
if ($dir eq '') {
$log->warn(
"Maybe empty textSigle => skipping this text ...\n" .
'data=' . substr($inline->data->data, 0, 200)
);
next MAIN;
};
# Parse inline structure
$inline->parse($text_id_esc, \$text_buffer);
if (DEBUG) {
$log->debug("Writing (utf8-formatted) xml file $dir/${data_file}.xml");
};
my $data = $inline->data;
# Write data.xml
$data->to_zip(
$zipper->new_stream("$dir/${data_file}.xml"),
$text_id_esc
);
# Tokenize with external tokenizer
if ($ext_tok) {
# Tokenize and output
$ext_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/${tokens_file}.xml"),
$text_id_esc
);
if ($use_tokenizer_sentence_splits) {
$ext_tok->sentencize_from_previous_input($inline->structures);
};
};
# Tokenize with internal tokenizer
if ($tokenizer_intern) {
# Tokenize and output
$cons_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/" . $cons_tok->name . '.xml'),
$text_id_esc
)->reset;
$aggr_tok->tokenize($data->data)->to_zip(
$zipper->new_stream("$dir/$base_dir/" . $aggr_tok->name . '.xml'),
$text_id_esc
)->reset;
};
# ~ write structures ~
if (!$inline->structures->empty) {
$inline->structures->to_zip(
$zipper->new_stream("$dir/$_structure_dir/${_structure_file}.xml"),
$text_id_esc,
2 # = structure serialization
);
};
# ~ write tokens ~
unless ($skip_inline_tokens || $inline->tokens->empty) {
$inline->tokens->to_zip(
$zipper->new_stream("$dir/$_tokens_dir/${_tokens_file}.xml"),
$text_id_esc,
# Either 0 = tokens without inline or 1 = tokens with inline
!$skip_inline_token_annotations
);
};
# reinit.
$dir = '';
next MAIN;
};
# ~ whitespace handling ~
# Fix whitespaces (see notes on whitespace fixing)
# TODO:
# Maybe it's best, to keep the stripping of whitespace and
# to just remove the if-clause and to insert a blank by default
# (with possibly an option on how newlines in primary text should
# be handled (stripped or replaced by a whitespace)).
# Remove consecutive whitespace at beginning and end (mostly one newline)
s/^\s+//; s/\s+$//;
# NOTE:
# this is only relevant, if a text consists of more than one line
# TODO:
# find a better solution, or create a warning, if a text has more
# than one line ($text_line > 1)
# TODO:
# do testing with 2 different corpora
# (one with only one-line texts, the other with several lines per text)
# Line contains at least one tag with at least one character of content
if (m/<[^>]+>[^<]/) {
# Increment counter for text lines
$text_line++;
# insert blank before 1st character
# (for 2nd line and consecutive lines)
$_ = ' ' . $_ if $text_line > 1;
}
# add line to buffer
$text_buffer .= $_;
};
}
elsif (m#^(.*)\<TEI\s+[^>]*?xml:id=(["'])(.+?)\2#) {
my $leadin = $1;
my $id = $3;
my $sigle = $3;
if ($what) {
$_ = $id;
eval "s|$what|$with|"; # s@ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2@;
$sigle = $_;
$log->debug("Converted text id `$id' to sigle `$sigle'");
};
$sigle =~ s/\./-/g;
my @parts = split(/[\/_]/, $sigle);
if (@parts != 3) {
die $log->fatal(
"input line number $.: " .
"ids must have exactly three parts split by '/', but `$id` only has " . scalar(@parts) . " ".
"=> Aborting (line=$_)");
};
$dir = join("/", @parts);
$text_id_esc = "$parts[0]/$parts[1].$parts[2]";
$log->notice("$0: text_id=$text_id_esc");
if ($leadin !~ /^\s*$/) {
die $log->fatal(
"input line number $.: " .
'line with opening TEI tag is not in expected format ... ' .
"=> Aborting (line=$_)");
};
}
# Start of header section
elsif (m#^(.*)(\<(?:ids|tei)Header.*)$#) {
my $content = "$2\n";
if ($1 !~ /^\s*$/) {
die $log->fatal(
"input line number $.: " .
'line with opening header tag is not in expected format ... ' .
"=> Aborting (line=$_)");
};
# Parse header
my $header = KorAP::XML::TEI::Header->new($content, $input_enc, $text_id_esc)->parse($input_fh);
# Header was parseable
if ($header) {
# Write header to zip
my $file = $header->dir . '/' . $header_file . '.xml';
$log->debug("Writing file $file") if DEBUG;
$header->to_zip($zipper->new_stream($file));
# Header is for text level
if ($header->type eq 'text') {
# Remember dir and sigles
$dir = $header->dir;
$text_id_esc = $header->id_esc;
# Log output to show progress
$log->notice("$0: text_id=$text_id_esc");
# Reset counter for text lines
# (needed for whitespace handling)
$text_line = 0;
};
};
};
};
$zipper->close;
$ext_tok->close if $ext_tok;
close $input_fh;
__END__
=pod
=encoding utf8
=head1 NAME
tei2korapxml - Conversion of TEI P5 based formats to KorAP-XML
=head1 SYNOPSIS
cat corpus.i5.xml | tei2korapxml - > corpus.korapxml.zip
=head1 DESCRIPTION
C<tei2korapxml> is a script to convert TEI P5 and
L<I5|https://www.ids-mannheim.de/digspra/kl/projekte/korpora/textmodell>
based documents to the
L<KorAP-XML format|https://github.com/KorAP/KorAP-XML-Krill#about-korap-xml>.
This program is usually called from inside another script.
=head1 FORMATS
=head2 Input restrictions
=over 2
=item
TEI P5 formatted input with certain restrictions:
=over 4
=item
B<mandatory>: text-header with integrated textsigle
(or convertible identifier), text-body
=item
B<optional>: corp-header with integrated corpsigle,
doc-header with integrated docsigle
=back
=item
Tokens inside the primary text must not be separated
by newlines, because newlines are removed
(see L<KorAP::XML::TEI::Data>) and converting newlines
into blanks between two tokens could introduce additional blanks
where there should be none (e.g. punctuation characters like C<,> or
C<.> should not be separated from their preceding token).
See also the code section C<~ whitespace handling ~> in
C<script/tei2korapxml> and the minimal input sketch after this list.
=item
Header types, like C<E<lt>idsHeader [...] type="document" [...] E<gt>>
need to be defined in the same line as the header tag.
=back
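A minimal, illustrative input sketch (assumed for demonstration and not
taken from the distributed test data; the nesting of the C<textSigle>
element inside the header follows common I5 conventions and may differ
in real corpora):

  <?xml version="1.0" encoding="UTF-8"?>
  <idsText>
    <idsHeader type="text">
      <fileDesc>
        <titleStmt>
          <textSigle>COR/DOC.00001</textSigle>
        </titleStmt>
      </fileDesc>
    </idsHeader>
    <text>
      A simple example sentence on a single line.
    </text>
  </idsText>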
=head2 Notes on the output
=over 2
=item
zip file output (default on C<stdout>) with utf8 encoded entries
(which together form the KorAP-XML format)
=back
=head1 INSTALLATION
C<tei2korapxml> requires the C<libxml2> development files (e.g. C<libxml2-dev>) and L<File::ShareDir::Install> to be installed.
When these requirements are met, the preferred way to install the script is
to use L<cpanm|App::cpanminus>.
$ cpanm https://github.com/KorAP/KorAP-XML-TEI.git
If everything went well, the C<tei2korapxml> tool will
be available on your command line immediately.
Minimum requirement for L<KorAP::XML::TEI> is Perl 5.16.
=head1 OPTIONS
=over 2
=item B<--input|-i>
The input file to process. If no specific input is defined and a single
dash C<-> is passed as an argument, data is read from C<STDIN>.
=item B<--root|-r>
The root directory for output. Defaults to C<.>.
=item B<--help|-h>
Print help information.
=item B<--version|-v>
Print version information.
=item B<--tokenizer-korap|-tk>
Use the standard KorAP/DeReKo tokenizer.
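This needs the C<KorAP::XML::TEI::Tokenizer::KorAP> module (with a
version matching this script) to be loadable. For example:

  $ cat corpus.i5.xml | tei2korapxml -tk - > corpus.korapxml.zip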
=item B<--tokenizer-internal|-ti>
Tokenize the data using two embedded tokenizers
that take an I<aggressive> and a I<conservative>
approach respectively.
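For example:

  $ cat corpus.i5.xml | tei2korapxml -ti - > corpus.korapxml.zip

This writes one tokenization file per approach below the base
foundry (see L</--base-foundry>).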
=item B<--tokenizer-call|-tc>
Call an external tokenizer process that tokenizes
data from STDIN and outputs the offsets of all tokens.
Texts are separated using C<\x04\n>. The external process
should add a new line per text.
If the L</--use-tokenizer-sentence-splits> option is activated,
sentence boundaries are output as offsets on additional lines as well.
To use L<Datok|https://github.com/KorAP/Datok> including sentence
splitting, call C<tei2korapxml> as follows:
  $ cat corpus.i5.xml | tei2korapxml -s \
      -tc 'datok tokenize \
        -t ./tokenizer.matok \
        -p --newline-after-eot --no-sentences \
        --no-tokens --sentence-positions -' - \
      > corpus.korapxml.zip
=item B<--no-tokenizer>
Boolean flag indicating that no tokenizer should be used.
Requiring this explicit opt-out ensures that by default a final
token layer always exists.
If a separate tokenizer is chosen, this flag is ignored.
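For example, to rely solely on inline tokens:

  $ cat corpus.i5.xml | tei2korapxml --no-tokenizer - > corpus.korapxml.zip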
=item B<--skip-inline-tokens>
Boolean flag indicating that inline tokens should not
be processed. Defaults to false (meaning inline tokens will be processed).
=item B<--skip-inline-token-annotations>
Boolean flag indicating that inline token annotations should not
be processed. Defaults to true (meaning inline token annotations
won't be processed).
=item B<--skip-inline-tags> <tags>
Expects a comma-separated list of tags to be ignored when the structure
is parsed. The content of these tags, however, will still be processed.
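Example (the tag names are only illustrative):

  $ cat corpus.i5.xml | tei2korapxml -tk --skip-inline-tags 'seg,milestone' - > corpus.korapxml.zip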
=item B<--xmlid-to-textsigle> <from-regex>@<to-c/to-d/to-t>
Expects a regular replacement expression (separated by B<@> between the
search and the replacement) to convert text id attributes to text sigles
with three parts (separated by B</>).
Example:
tei2korapxml \
--xmlid-to-textsigle 'ICC.German\.([^.]+\.[^.]+)\.(.+)@ICCGER/$1/$2' \
-tk - < t/data/icc_german_sample.p5.xml
Converts text id C<ICC.German.DeReKo.WPD17.G11.00238> to
sigle C<ICCGER/DeReKo.WPD17/G11.00238>.
=item B<--inline-tokens> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline token information in.
Unless C<--skip-inline-token-annotations> is set,
this will contain annotations as well.
Defaults to C<tokens> and C<morpho>.
The inline token data will also be stored in the
inline structures file (see I<--inline-structures>),
unless the inline token foundry is prefixed with an
exclamation mark B<!>, indicating that inline
tokens are stored exclusively in the inline tokens
file.
Example:
tei2korapxml --inline-tokens '!gingko#morpho' < data.i5.xml > korapxml.zip
=item B<--inline-structures> <foundry>#[<file>]
Define the foundry and file (without extension)
to store inline structure information in.
Defaults to C<struct> and C<structure>.
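Example (the foundry name is only illustrative):

  tei2korapxml --inline-structures 'myfoundry#struct' < data.i5.xml > korapxml.zip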
=item B<--base-foundry> <foundry>
Define the base foundry to store newly generated
token information in.
Defaults to C<base>.
=item B<--data-file> <file>
Define the file (without extension)
to store primary data information in.
Defaults to C<data>.
=item B<--header-file> <file>
Define the file name (without extension)
to store header information on
the corpus, document, and text level in.
Defaults to C<header>.
=item B<--use-tokenizer-sentence-splits|-s>
Use the sentence boundary information provided by the tokenizer,
either replacing existing boundary annotations or adding new ones.
Currently the KorAP tokenizer and certain external tokenizers support
these boundaries.
=item B<--tokens-file> <file>
Define the file (without extension)
to store generated token information in
(either from the KorAP tokenizer or an externally called tokenizer).
Defaults to C<tokens>.
=item B<--log|-l>
Log level for I<Log::Any>. Defaults to C<notice>.
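=item B<--required-version|-rv>

Expects a version string (e.g. C<2.3.4>). If it does not exactly match
the version of this script, processing is aborted.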
=back
=head1 ENVIRONMENT VARIABLES
=over 2
=item B<KORAPXMLTEI_DEBUG>
Activate minimal debugging.
Defaults to C<false>.
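For example:

  $ KORAPXMLTEI_DEBUG=1 tei2korapxml -tk - < corpus.i5.xml > corpus.korapxml.zip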
=back
=head1 COPYRIGHT AND LICENSE
Copyright (C) 2021-2023, L<IDS Mannheim|https://www.ids-mannheim.de/>
Author: Peter Harders
Contributors: Nils Diewald, Marc Kupietz, Carsten Schnober
L<KorAP::XML::TEI> is developed as part of the L<KorAP|https://korap.ids-mannheim.de/>
Corpus Analysis Platform at the
L<Leibniz Institute for the German Language (IDS)|https://www.ids-mannheim.de/>,
member of the
L<Leibniz-Gemeinschaft|http://www.leibniz-gemeinschaft.de/>.
This program is free software published under the
L<BSD-2 License|https://opensource.org/licenses/BSD-2-Clause>.
=cut
# NOTES
## Notes on segfault prevention
Calling binmode on the input handle prevents 'XML::LibXML::Reader' from segfaulting inside the main loop
(see the notes on 'PerlIO layers' in 'man XML::LibXML').
Removing 'use open qw(:std :utf8)' would also fix this problem, but using binmode on the input handle is more granular.
See perluniintro: "You can switch encodings on an already opened stream by using binmode()".
See perlfunc: "If LAYER is omitted or specified as ':raw' the filehandle is made suitable for passing binary data."