blob: c242b1ebd779e834fa521fa9d226fe0709107445 [file] [log] [blame]
package KorAP::Tokenizer;
use Mojo::Base -base;
use Mojo::ByteStream 'b';
use Mojo::Loader;
use XML::Fast;
use Try::Tiny;
use Carp qw/croak/;
use KorAP::Tokenizer::Range;
use KorAP::Tokenizer::Match;
use KorAP::Tokenizer::Spans;
use KorAP::Tokenizer::Tokens;
use KorAP::Field::MultiTermTokenStream;
use List::MoreUtils 'uniq';
use JSON::XS;
use Log::Log4perl;
# TODO 1:
# Bei den Autoren im Index darauf achten, dass auch "etc." indiziert wird
# TODO 2:
# Add punktuations to the index
# [Er sagte: "Hallo - na?"] becomes
# [s:Er|tt/l:er|_1#0-2]
# [s:sagte|tt/l:sagen|_2#3-8|.>::#8-9$1|.>tt/l:PUNCT#8-9$1|.>:"#10-11$2|.>tt/l:PUNCT#10-11$2]
# [s:Hallo|tt/l:hallo|_3#11-16|.<::#8-9$2|.<tt/l:PUNCT#8-9$2|.<:"#10-11$1|.<tt/l:PUNCT#10-11$1|.>:-#17-18$1|.>tt/l:PUNCT#17-18$1]
# [s:na|tt/l:na|_4#19-21|.<:-#17-18$1|.<tt/l:PUNCT#17-18$1|.>:?#21-22$1|.>tt/l:PUNCT#21-22$1|.>:"#22-23$2|.>tt/l:PUNCT#22-23$2]
# TODO 3:
# Meta-Data:
# mysql> select * from textMeta limit 5;
# +---------------+------------+------------+-------+--------+------+---------+-------------------------------------------+--------+-------+-------+-----------------------+-------------+-------+-----------+---------+----+------+------+----+----+-----+------+------+-----------+---------+-------+---------+-------------+------------------------+
# | sigle | date | fname | fpos | length | tlen | country | topic1 | topic2 | tpc1v | tpc2v | supertopic1 | supertopic2 | words | stopwords | numbers | s | pars | tags | oo | no | ngc | sscr | cs | has | ressort | genre | type | articletype | md5 |
# +---------------+------------+------------+-------+--------+------+---------+-------------------------------------------+--------+-------+-------+-----------------------+-------------+-------+-----------+---------+----+------+------+----+----+-----+------+------+-----------+---------+-------+---------+-------------+------------------------+
# | A00/JAN.00001 | 2000-01-01 | a00.i5.xml | 12667 | 3131 | 641 | CH | Freizeit_Unterhaltung:Reisen | NULL | 0.99 | 0 | Freizeit_Unterhaltung | NULL | 34 | 24 | 0 | 6 | 1 | 5 | 0 | 0 | 0 | 0 | 0 | headlines | NULL | NULL | Zeitung | Bericht | dk6ORWB5uTH33eiakNcJAA |
# | A00/JAN.00002 | 2000-01-01 | a00.i5.xml | 15798 | 11853 | 9267 | CH | Staat_Gesellschaft:Biographien_Interviews | NULL | 1 | 0 | Staat_Gesellschaft | NULL | 652 | 445 | 12 | 77 | 18 | 14 | 0 | 0 | 0 | 0 | 0 | headlines | NULL | NULL | Zeitung | Bericht | m7phXF1Ds+aPq3GfLRddCw |
# | A00/JAN.00003 | 2000-01-01 | a00.i5.xml | 27651 | 4768 | 2234 | CH | Politik:Kommunalpolitik | NULL | 1 | 0 | Politik | NULL | 135 | 74 | 2 | 16 | 3 | 9 | 0 | 0 | 0 | 0 | 0 | headlines | NULL | NULL | Zeitung | Bericht | XwsIDMEIT7ht52DnMTkSHw |
# | A00/JAN.00004 | 2000-01-01 | a00.i5.xml | 32419 | 11096 | 8619 | CH | Politik:Kommunalpolitik | NULL | 1 | 0 | Politik | NULL | 521 | 368 | 7 | 57 | 8 | 22 | 0 | 0 | 0 | 0 | 0 | headlines | NULL | NULL | Zeitung | Bericht | TSx4mDVLU6XibNHUUT+ubA |
# | A00/JAN.00005 | 2000-01-01 | a00.i5.xml | 43515 | 5421 | 2875 | CH | Politik:Kommunalpolitik | NULL | 1 | 0 | Politik | NULL | 177 | 94 | 7 | 19 | 3 | 11 | 0 | 0 | 0 | 0 | 0 | headlines | NULL | NULL | Zeitung | Bericht | 6OGZ7MqyjSOb9AtvFLWodA |
# +---------------+------------+------------+-------+--------+------+---------+-------------------------------------------+--------+-------+-------+-----------------------+-------------+-------+-----------+---------+----+------+------+----+----+-----+------+------+-----------+---------+-------+---------+-------------+------------------------+
has [qw/path foundry doc stream should have name/];
has layer => 'Tokens';
has log => sub {
if(Log::Log4perl->initialized()) {
state $log = Log::Log4perl->get_logger(__PACKAGE__);
};
state $log = KorAP::Log->new;
return $log;
};
# Parse tokens of the document
sub parse {
my $self = shift;
# Create new token stream
my $mtts = KorAP::Field::MultiTermTokenStream->new;
my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
my $file = b($path)->slurp;
# my $tokens = Mojo::DOM->new($file);
# $tokens->xml(1);
my $doc = $self->doc;
my ($should, $have) = (0, 0);
# Create range and match objects
my $range = KorAP::Tokenizer::Range->new;
my $match = KorAP::Tokenizer::Match->new;
my $old = 0;
$self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);
# TODO: Reuse the following code from Spans.pm and tokens.pm
my ($tokens, $error);
try {
local $SIG{__WARN__} = sub {
$error = 1;
};
$tokens = xml2hash($file, text => '#text', array => ['span'], attr => '-')->{layer}->{spanList};
}
catch {
$self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : ''));
$error = 1;
};
return if $error;
if (ref $tokens && $tokens->{span}) {
$tokens = $tokens->{span};
}
else {
return $self;
};
$tokens = [$tokens] if ref $tokens ne 'ARRAY';
# Iterate over all tokens
# $tokens->find('span')->each(
# sub {
# my $span = $_;
foreach my $span (@$tokens) {
my $from = $span->{'-from'};
my $to = $span->{'-to'};
my $token = $doc->primary->data($from, $to);
# warn 'Has ' . $from . '->' . $to . "($old)";
unless (defined $token) {
$self->log->error("Unable to find substring [$from-$to] in $path");
next;
};
$should++;
# Ignore non-word tokens
next if $token !~ /[\w\d]/;
my $mtt = $mtts->add;
# Add gap for later finding matching positions before or after
$range->gap($old, $from, $have) unless $old >= $from;
# Add surface term
# That's always the first term!
$mtt->add('s:' . $token);
# Add case insensitive term
$mtt->add('i:' . lc $token);
# Add offset information
$mtt->o_start($from);
$mtt->o_end($to);
# Store offset information for position matching
$range->set($from, $to, $have);
$match->set($from, $to, $have);
$old = $to + 1;
# Add position term
$mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);
$have++;
};
# Add token count
$mtts->add_meta('tokens', '<i>' . $have);
$range->gap($old, $doc->primary->data_length + 1, $have-1) if $doc->primary->data_length >= ($old - 1);
# Add info
$self->stream($mtts);
$self->{range} = $range;
$self->{match} = $match;
$self->should($should);
$self->have($have);
$self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %');
return $self;
};
sub add_subtokens {
my $self = shift;
my $mtts = $self->stream or return;
foreach my $mtt (@{$mtts->multi_term_tokens}) {
my $o_start = $mtt->o_start;
my $o_end = $mtt->o_end;
my $l = $o_end - $o_start;
my $os = my $s = $mtt->lc_surface;
# Algorithm based on aggressive tokenization in
# tokenize.pl from Carsten Schnober
$s =~ s/[[:alpha:]]/a/g;
$s =~ s/[[:digit:]]/0/g;
$s =~ s/\p{Punct}/#/g;
$s =~ y/~/A/;
$s .= 'E';
while ($s =~ /(a+)[^a]/g) {
my $from = $-[1];
my $to = $+[1];
$mtt->add(
term => 'i^1:' . substr($os, $from, $from + $to),
o_start => $from + $o_start,
o_end => $to + $o_start
) unless $to - $from == $l;
};
while ($s =~ /(0+)[^0]/g) {
my $from = $-[1];
my $to = $+[1];
$mtt->add(
term => 'i^2:' . substr($os, $from, $from + $to),
o_start => $from + $o_start,
o_end => $to + $o_start
) unless $to - $from == $l;
};
while ($s =~ /(#)/g) {
my $from = $-[1];
my $to = $+[1];
$mtt->add(
term => 'i^3:' . substr($os, $from, $from + $to),
o_start => $from + $o_start,
o_end => $to + $o_start
) unless $to - $from == $l;
};
};
return $self;
};
# Get span positions through character offsets
sub range {
return shift->{range} // KorAP::Tokenizer::Range->new;
};
# Get token positions through character offsets
sub match {
return shift->{match} // KorAP::Tokenizer::Match->new;
};
# Add information of spans to the tokens
sub add_spandata {
my $self = shift;
my %param = @_;
croak 'No token data available' unless $self->stream;
$self->log->trace(
($param{skip} ? 'Skip' : 'Add').' span data '.$param{foundry}.':'.$param{layer}
);
return if $param{skip};
my $cb = delete $param{cb};
$param{primary} = $self->doc->primary;
my $spans = KorAP::Tokenizer::Spans->new(
path => $self->path,
range => $self->range,
match => $self->match,
%param
);
my $spanarray = $spans->parse or return;
if ($spans->should == $spans->have) {
$self->log->trace('With perfect alignment!');
}
else {
$self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
};
if ($cb) {
foreach (@$spanarray) {
$cb->($self->stream, $_, $spans);
};
return 1;
};
return $spans;
};
# Add information to the tokens
sub add_tokendata {
my $self = shift;
my %param = @_;
croak 'No token data available' unless $self->stream;
$self->log->trace(
($param{skip} ? 'Skip' : 'Add').' token data '.$param{foundry}.':'.$param{layer}
);
return if $param{skip};
my $cb = delete $param{cb};
$param{primary} = $self->doc->primary;
my $tokens = KorAP::Tokenizer::Tokens->new(
path => $self->path,
range => $self->range,
match => $self->match,
%param
);
my $tokenarray = $tokens->parse or return;
if ($tokens->should == $tokens->have) {
$self->log->trace('With perfect alignment!');
}
else {
my $perc = _perc(
$tokens->should, $tokens->have, $self->should, $self->should - $self->have
);
$self->log->debug('With an alignment quota of ' . $perc);
};
if ($cb) {
foreach (@$tokenarray) {
$cb->($self->stream, $_, $tokens);
};
return 1;
};
return $tokens;
};
sub add {
my $self = shift;
my $loader = Mojo::Loader->new;
my $foundry = shift;
my $layer = shift;
unless ($foundry && $layer) {
warn 'Unable to add specific module - not enough information given!';
return;
};
my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;
if ($mod->can('new') || eval("require $mod; 1;")) {
if (my $retval = $mod->new($self)->parse(@_)) {
# This layer is supported
$self->support($foundry => $layer, @_);
# Get layerinfo
$self->layer_info($mod->layer_info);
return $retval;
};
}
else {
$self->log->error('Unable to load '.$mod . '(' . $@ . ')');
};
return;
};
sub _perc {
if (@_ == 2) {
# '[' . $_[0] . '/' . $_[1] . ']' .
return sprintf("%.2f", ($_[1] * 100) / $_[0]);
}
my $a_should = shift;
my $a_have = shift;
my $b_should = shift;
my $b_have = shift;
my $a_quota = ($a_have * 100) / $a_should;
my $b_quota = ($b_have * 100) / $b_should;
return sprintf("%.2f", $a_quota) . '%' .
((($a_quota + $b_quota) <= 100) ?
' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : '');
};
sub support {
my $self = shift;
# No setting - just getting
unless ($_[0]) {
my @supports;
# Get all foundries
foreach my $foundry (keys %{$self->{support}}) {
push(@supports, $foundry);
# Get all layers
foreach my $layer (@{$self->{support}->{$foundry}}) {
my @layers = @$layer;
push(@supports, $foundry . '/' . $layers[0]);
# More information
if ($layers[1]) {
push(@supports, $foundry . '/' . join('/', @layers));
};
};
};
return lc ( join ' ', sort {$a cmp $b } @supports );
}
elsif (!$_[1]) {
return $self->{support}->{$_[0]} // []
};
my $f = lc shift;
my $l = lc shift;
my @info = @_;
$self->{support} //= {};
$self->{support}->{$f} //= [];
push(@{$self->{support}->{$f}}, [$l, @info]);
};
sub layer_info {
my $self = shift;
$self->{layer_info} //= [];
if ($_[0]) {
push(@{$self->{layer_info}}, @{$_[0]});
}
else {
return join ' ', sort {$a cmp $b } uniq @{$self->{layer_info}};
};
};
sub to_string {
my $self = shift;
my $primary = defined $_[0] ? $_[0] : 1;
my $string = "<meta>\n";
$string .= $self->doc->to_string;
$string .= "</meta>\n";
if ($primary) {
$string .= "<text>\n";
$string .= $self->doc->primary->data . "\n";
$string .= "</text>\n";
};
$string .= '<field name="' . $self->name . "\">\n";
$string .= "<info>\n";
$string .= 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";
# There is support info
if (my $support = $self->support) {
$string .= 'support = ' . $support . "\n";
};
if (my $layer_info = $self->layer_info) {
$string .= 'layer_info = ' . $layer_info . "\n";
};
$string .= "</info>\n";
$string .= $self->stream->to_string;
$string .= "</field>";
return $string;
};
sub to_data {
my $self = shift;
my $primary = defined $_[0] ? $_[0] : 1;
my $legacy = defined $_[1] ? $_[1] : 0;
my %data = %{$self->doc->to_hash};
my @fields;
if ($legacy) {
push(@fields, { primaryData => $self->doc->primary->data }) if $primary;
push(@fields, {
name => $self->name,
data => $self->stream->to_array,
tokenization => lc($self->foundry) . '#' . lc($self->layer),
foundries => $self->support,
layerInfo => $self->layer_info
});
$data{fields} = \@fields;
}
else {
$data{primaryData} = $self->doc->primary->data if $primary;
$data{tokenName} = $self->name;
$data{data} = $self->stream->to_array;
$data{tokenSource} = lc($self->foundry) . '#' . lc($self->layer);
$data{foundries} = $self->support;
$data{layerInfos} = $self->layer_info;
$data{version} = '0.02';
};
\%data;
};
sub to_json_legacy {
encode_json($_[0]->to_data($_[1], 1));
};
sub to_json {
encode_json($_[0]->to_data($_[1], 0));
};
sub to_pretty_json {
JSON::XS->new->pretty->encode($_[0]->to_data($_[1]));
};
1;
__END__
=pod
=head1 NAME
KorAP::Tokenizer
=head1 SYNOPSIS
my $tokens = KorAP::Tokenizer->new(
path => '../examples/00003',
doc => KorAP::Document->new( ... ),
foundry => 'opennlp',
layer => 'tokens'
);
$tokens->parse;
=head1 DESCRIPTION
Convert token information from the KorAP XML
format into Lucene Index compatible token streams.
=head1 ATTRIBUTES
=head2 path
print $tokens->path;
The path of the document.
=head2 foundry
print $tokens->foundry;
The name of the foundry.
=head2 should
Number of tokens that exist at all.
=head2 have
Number of tokens effectively stored in the token stream (e.g., no punctuations).
=head2 layer
print $tokens->layer;
The name of the tokens layer.
=head2 doc
print $tokens->doc->corpus_id;
The L<KorAP::Document> object.
=head2 stream
$tokens->stream->add_meta('adjCount', '<i>45');
The L<KorAP::Field::MultiTermTokenStream> object
=head2 range
$tokens->range->lookup(45);
The L<KorAP::Tokenizer::Range> object for converting span offsets to positions.
=head2 match
$tokens->match->lookup(45);
The L<KorAP::Tokenizer::Match> object for converting token offsets to positions.
=head1 METHODS
=head2 parse
$tokens->parse;
Start the tokenization process.
=head2 to_json_legacy
print $tokens->to_json_legacy;
print $tokens->to_json_legacy(1);
Return the token data in legacy JSON format.
An optional parsed boolean parameter indicates,
if primary data should be included.
=head2 to_json
print $tokens->to_json;
print $tokens->to_json(1);
Return the token data in JSON format
An optional parsed boolean parameter indicates,
if primary data should be included.
=head2 add_subtokens
$tokens->split_tokens;
$tokens->split_tokens(
sub {
...
}
);
Add sub token information to the index.
This is based on the C<aggressive> tokenization, written by Carsten Schnober.
=head2 add_spandata
$tokens->add_spandata(
foundry => 'base',
layer => 'sentences',
cb => sub {
my ($stream, $span) = @_;
my $mtt = $stream->pos($span->p_start);
$mtt->add(
term => '<>:s',
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end
);
}
);
Add span information to the parsed token stream.
Expects a C<foundry> name, a C<layer> name and a
callback parameter, that will be called after each parsed
span. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
as well as the current L<KorAP::Tokenizer::Span>.
An optional parameter C<encoding> may indicate that the span offsets
are either refering to C<bytes> or C<utf-8> offsets.
An optional parameter C<skip> allows for skipping the process.
=head2 add_tokendata
$tokens->add_tokendata(
foundry => 'connexor',
layer => 'syntax',
cb => sub {
my ($stream, $token) = @_;
my $mtt = $stream->pos($token->pos);
my $content = $token->content;
# syntax
if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
$mtt->add(
term => 'cnx_syn:' . $found
);
};
});
Add token information to the parsed token stream.
Expects a C<foundry> name, a C<layer> name and a
callback parameter, that will be called after each parsed
token. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
as well as the current L<KorAP::Tokenizer::Span>.
An optional parameter C<encoding> may indicate that the token offsets
are either refering to C<bytes> or C<utf-8> offsets.
An optional parameter C<skip> allows for skipping the process.
=cut