Optimize annotations to not use hash multiterms
Change-Id: I7d1e99e5a5e55447b53eb8ba1eafb20cbbbe829a
diff --git a/lib/KorAP/XML/Annotation/Base.pm b/lib/KorAP/XML/Annotation/Base.pm
index e4fcd99..d38c521 100644
--- a/lib/KorAP/XML/Annotation/Base.pm
+++ b/lib/KorAP/XML/Annotation/Base.pm
@@ -1,5 +1,4 @@
package KorAP::XML::Annotation::Base;
-
use strict;
use warnings;
@@ -21,8 +20,7 @@
# Constructor
sub new {
- my $class = shift;
- my $tokens = shift;
+ my ($class, $tokens) = @_;
bless \$tokens, $class;
};
diff --git a/lib/KorAP/XML/Annotation/Base/Paragraphs.pm b/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
index 694dfc8..40a37d2 100644
--- a/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
+++ b/lib/KorAP/XML/Annotation/Base/Paragraphs.pm
@@ -11,15 +11,12 @@
cb => sub {
my ($stream, $span) = @_;
my $mtt = $stream->pos($span->get_p_start);
-
- $mtt->add(
- term => '<>:base/s:p',
- o_start => $span->get_o_start,
- o_end => $span->get_o_end,
- p_end => $span->get_p_end,
- payload => '<b>1',
- pti => 64
- );
+ my $mt = $mtt->add('<>:base/s:p');
+ $mt->set_o_start($span->get_o_start);
+ $mt->set_o_end($span->get_o_end);
+ $mt->set_p_end($span->get_p_end);
+ $mt->set_payload('<b>1');
+ $mt->set_pti(64);
$i++;
}
) or return;
diff --git a/lib/KorAP/XML/Annotation/Base/Sentences.pm b/lib/KorAP/XML/Annotation/Base/Sentences.pm
index 852146f..3c9b3ce 100644
--- a/lib/KorAP/XML/Annotation/Base/Sentences.pm
+++ b/lib/KorAP/XML/Annotation/Base/Sentences.pm
@@ -15,14 +15,14 @@
my $mtt = $stream->pos($span->get_p_start);
$first = [$span->get_p_start, $span->get_o_start] unless defined $first;
- $mtt->add(
- term => '<>:base/s:s',
- o_start => $span->get_o_start,
- o_end => $span->get_o_end,
- p_end => $span->get_p_end,
- payload => '<b>2',
- pti => 64
- );
+
+ my $mt = $mtt->add('<>:base/s:s');
+ $mt->set_o_start($span->get_o_start);
+ $mt->set_o_end($span->get_o_end);
+ $mt->set_p_end($span->get_p_end);
+ $mt->set_payload('<b>2');
+ $mt->set_pti(64);
+
$last_p = $span->get_p_end;
$last_o = $span->get_o_end;
$i++;
diff --git a/lib/KorAP/XML/Annotation/CoreNLP/Constituency.pm b/lib/KorAP/XML/Annotation/CoreNLP/Constituency.pm
index 849a3c7..0138594 100644
--- a/lib/KorAP/XML/Annotation/CoreNLP/Constituency.pm
+++ b/lib/KorAP/XML/Annotation/CoreNLP/Constituency.pm
@@ -59,17 +59,12 @@
my $type = $f->{'#text'} or return;
# $type is now NPA, NP, NUM ...
- my %term = (
- term => '<>:corenlp/c:' . $type,
- o_start => $span->get_o_start,
- o_end => $span->get_o_end,
- p_end => $span->get_p_end,
- pti => 64
- );
-
- $term{payload} = '<b>' . ($level // 0);
-
- $mtt->add(%term);
+ my $term = $mtt->add('<>:corenlp/c:' . $type);
+ $term->set_o_start($span->get_o_start);
+ $term->set_o_end($span->get_o_end);
+ $term->set_p_end($span->get_p_end);
+ $term->set_pti(64);
+ $term->set_payload('<b>' . ($level // 0));
my $this = $add_const;
diff --git a/lib/KorAP/XML/Annotation/DGD/Morpho.pm b/lib/KorAP/XML/Annotation/DGD/Morpho.pm
index 0aed712..aacf00e 100644
--- a/lib/KorAP/XML/Annotation/DGD/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/DGD/Morpho.pm
@@ -2,6 +2,14 @@
use KorAP::XML::Annotation::Base;
use Data::Dumper;
+our %conv = (
+ pos => 'p',
+ trans => 'trans',
+ phon => 'phon',
+ type => 'type',
+ lemma => 'l'
+);
+
sub parse {
my $self = shift;
@@ -20,102 +28,61 @@
foreach my $feat (@$content) {
- # syntax
- if (($feat->{-name} eq 'pos') && ($feat->{'#text'})) {
- $mtt->add(
- term => 'dgd/p:' . $feat->{'#text'}
- );
- }
+ my ($name, $text) = ($feat->{-name}, $feat->{'#text'} // '');
+ next if $conv{$name} && !$text; # only the plain %conv features need text; pause/incident/vocal are indexed even without it, as before this change
- # transcription
- elsif (($feat->{-name} eq 'trans') && ($feat->{'#text'})) {
- $mtt->add(
- term => 'dgd/trans:' . $feat->{'#text'}
- );
- }
-
- # phonetics
- elsif (($feat->{-name} eq 'phon') && ($feat->{'#text'})) {
- $mtt->add(
- term => 'dgd/phon:' . $feat->{'#text'}
- );
- }
-
- # type
- elsif (($feat->{-name} eq 'type') && ($feat->{'#text'})) {
- $mtt->add(
- term => 'dgd/type:' . $feat->{'#text'}
- );
- }
-
- elsif (($feat->{-name} eq 'lemma') && ($feat->{'#text'})) {
- $mtt->add(
- term => 'dgd/l:' . $feat->{'#text'}
- );
+ if (my $t = $conv{$name}) {
+ $mtt->add('dgd/' . $t . ':' . $text);
}
# Pause
- elsif ($feat->{-name} eq 'pause') {
- $mtt->add(
- term => 'dgd/para:pause',
- pti => 128,
- payload => '<s>' . $tui
- );
+ elsif ($name eq 'pause') {
+ my $p = $mtt->add('dgd/para:pause');
+ $p->set_pti(128);
+ $p->set_payload('<s>' . $tui);
# Duration
- if ($feat->{'#text'} =~ /dur="PT([^"]+?)"/) {
- $mtt->add(
- term => '@:dgd/para:dur:' . $1,
- pti => 16,
- payload => '<s>' . $tui
- );
+ if ($text =~ /dur="PT([^"]+?)"/) {
+ $p = $mtt->add('@:dgd/para:dur:' . $1);
+ $p->set_pti(16);
+ $p->set_payload('<s>' . $tui);
};
# Rendering
- if ($feat->{'#text'} =~ /rend="([^"]+?)"/) {
- $mtt->add(
- term => '@:dgd/para:rend:' . $1,
- pti => 16,
- payload => '<s>' . $tui
- );
+ if ($text =~ /rend="([^"]+?)"/) {
+ $p = $mtt->add('@:dgd/para:rend:' . $1);
+ $p->set_pti(16);
+ $p->set_payload('<s>' . $tui);
};
# Type
- if ($feat->{'#text'} =~ /type="([^"]+?)"/) {
- $mtt->add(
- term => '@:dgd/para:type:' . $1,
- pti => 16,
- payload => '<s>' . $tui
- );
+ if ($text =~ /type="([^"]+?)"/) {
+ $p = $mtt->add('@:dgd/para:type:' . $1);
+ $p->set_pti(16);
+ $p->set_payload('<s>' . $tui);
};
last;
}
# Incident
- elsif (($feat->{-name} eq 'incident') || ($feat->{-name} eq 'vocal')) {
- $mtt->add(
- term => 'dgd/para:' . $feat->{-name},
- pti => 128,
- payload => '<s>' . $tui
- );
+ elsif (($name eq 'incident') || ($name eq 'vocal')) {
+ my $i = $mtt->add('dgd/para:' . $name);
+ $i->set_pti(128);
+ $i->set_payload('<s>' . $tui);
# Rendering
- if ($feat->{'#text'} =~ /rend="([^"]+?)"/) {
- $mtt->add(
- term => '@:dgd/para:rend:' . $1,
- pti => 16,
- payload => '<s>' . $tui
- );
+ if ($text =~ /rend="([^"]+?)"/) {
+ $i = $mtt->add('@:dgd/para:rend:' . $1);
+ $i->set_pti(16);
+ $i->set_payload('<s>' . $tui);
};
# desc
- if ($feat->{'#text'} =~ m!<desc[^>]*>([^<]+?)<\/desc>!) {
- $mtt->add(
- term => '@:dgd/para:desc:' . $1,
- pti => 16,
- payload => '<s>' . $tui
- );
+ if ($text =~ m!<desc[^>]*>([^<]+?)<\/desc>!) {
+ $i = $mtt->add('@:dgd/para:desc:' . $1);
+ $i->set_pti(16);
+ $i->set_payload('<s>' . $tui);
};
last;
diff --git a/lib/KorAP/XML/Annotation/DGD/Structure.pm b/lib/KorAP/XML/Annotation/DGD/Structure.pm
index 0643caf..444bf7e 100644
--- a/lib/KorAP/XML/Annotation/DGD/Structure.pm
+++ b/lib/KorAP/XML/Annotation/DGD/Structure.pm
@@ -68,14 +68,12 @@
my $mtt = $stream->pos($p_start);
# Add the base sentence
- my $mt = $mtt->add(
- term => '<>:base/s:s',
- o_start => $o_start,
- o_end => $_->[1],
- p_start => $p_start,
- p_end => $_->[0],
- pti => 64
- );
+ my $mt = $mtt->add('<>:base/s:s');
+ $mt->set_o_start($o_start);
+ $mt->set_o_end($_->[1]);
+ $mt->set_p_start($p_start);
+ $mt->set_p_end($_->[0]);
+ $mt->set_pti(64);
$mt->set_payload('<b>1');
$sentences++;
diff --git a/lib/KorAP/XML/Annotation/DRuKoLa/Morpho.pm b/lib/KorAP/XML/Annotation/DRuKoLa/Morpho.pm
index efdf5ed..5fa9204 100644
--- a/lib/KorAP/XML/Annotation/DRuKoLa/Morpho.pm
+++ b/lib/KorAP/XML/Annotation/DRuKoLa/Morpho.pm
@@ -20,7 +20,7 @@
# pos tag
if (($f->{-name} eq 'pos') &&
($found = $f->{'#text'})) {
- $mtt->add(term => 'drukola/p:' . $found);
+ $mtt->add('drukola/p:' . $found);
}
# ana tag
@@ -28,7 +28,7 @@
($found = $f->{'#text'})) {
my ($pos, $msd) = split(/ /, $found);
if ($msd) {
- $mtt->add(term => 'drukola/p:' . $pos);
+ $mtt->add('drukola/p:' . $pos);
}
else {
$msd = $pos;
@@ -38,7 +38,7 @@
foreach (split '\|', $msd) {
my ($x, $y) = split "=", $_;
# case, tense, number, mood, person, degree, gender
- $mtt->add(term => 'drukola/m:' . $x . ($y ? ':' . $y : ''));
+ $mtt->add('drukola/m:' . $x . ($y ? ':' . $y : ''));
};
}
@@ -47,7 +47,7 @@
&& ($found = $f->{'#text'})
&& $found ne '--') {
# b($found)->decode('latin-1')->encode->to_string
- $mtt->add(term => 'drukola/l:' . $found);
+ $mtt->add('drukola/l:' . $found);
};
};
}) or return;
diff --git a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
index c60a3b4..3ac1930 100644
--- a/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
+++ b/lib/KorAP/XML/Annotation/DeReKo/Structure.pm
@@ -49,14 +49,12 @@
my $p_end = $span->get_p_end;
# Add structure
- my $mt = $mtt->add(
- term => '<>:dereko/s:' . $name,
- o_start => $span->get_o_start,
- o_end => $span->get_o_end,
- p_start => $p_start,
- p_end => $p_end,
- pti => $span->get_milestone ? 65 : 64,
- );
+ my $mt = $mtt->add('<>:dereko/s:' . $name);
+ $mt->set_o_start($span->get_o_start);
+ $mt->set_o_end($span->get_o_end);
+ $mt->set_p_start($p_start);
+ $mt->set_p_end($p_end);
+ $mt->set_pti($span->get_milestone ? 65 : 64);
my $level = $span->get_hash->{'-l'};
if ($level || $tui) {
@@ -109,14 +107,10 @@
foreach (@$attrs) {
# Add attributes
- $mtt->add(
- term =>
- '@:dereko/s:' . $_->{'-name'} . ($_->{'#text'} ? ':' . $_->{'#text'} : ''),
- p_start => $p_start,
- pti => 17,
- payload => '<s>' . $tui .
- ($span->get_milestone ? '' : '<i>' . $p_end)
- );
+ my $mt = $mtt->add('@:dereko/s:' . $_->{'-name'} . ($_->{'#text'} ? ':' . $_->{'#text'} : ''));
+ $mt->set_p_start($p_start);
+ $mt->set_pti(17);
+ $mt->set_payload('<s>' . $tui .($span->get_milestone ? '' : '<i>' . $p_end));
};
};
}
diff --git a/lib/KorAP/XML/Index/MultiTerm.pm b/lib/KorAP/XML/Index/MultiTerm.pm
index 90253e5..863295f 100644
--- a/lib/KorAP/XML/Index/MultiTerm.pm
+++ b/lib/KorAP/XML/Index/MultiTerm.pm
@@ -52,6 +52,14 @@
$self;
};
+sub new_from_array {
+ bless [@_[1 .. $#_]], $_[0]; # store the fields only: "[@_], shift" built the array before shift ran and kept the class name at index 0
+};
+
+sub new_blank {
+ bless [], shift;
+}
+
sub set_payload {
return $_[0]->[PAYLOAD] = $_[1];
};
@@ -149,7 +157,7 @@
if (defined $_[0]->[P_END]) {
$pre .= '<i>' . $_[0]->[P_END];
};
- if ($_[0]->[0]) {
+ if ($_[0]->[PAYLOAD]) {
if (index($_[0]->[PAYLOAD], '<') == 0) {
$pre .= $_[0]->[PAYLOAD];
}
diff --git a/lib/KorAP/XML/Index/MultiTermToken.pm b/lib/KorAP/XML/Index/MultiTermToken.pm
index cae6cba..ef9d568 100644
--- a/lib/KorAP/XML/Index/MultiTermToken.pm
+++ b/lib/KorAP/XML/Index/MultiTermToken.pm
@@ -14,11 +14,11 @@
MT => 0,
O_START => 1,
O_END => 2,
- ID_COUNTER => 3
+ ID_COUNTER => 3,
};
sub new {
- bless [], shift;
+ bless [[]], shift;
};
@@ -28,7 +28,8 @@
my $mt;
unless (blessed $_[0]) {
if (@_ == 1) {
- $mt = KorAP::XML::Index::MultiTerm->new(term => $_[0]);
+ $mt = KorAP::XML::Index::MultiTerm->new_blank;
+ $mt->set_term($_[0]);
}
else {
$mt = KorAP::XML::Index::MultiTerm->new(@_);
@@ -37,11 +38,21 @@
else {
$mt = $_[0];
};
- $self->[MT] //= [];
push(@{$self->[MT]}, $mt);
$mt;
};
+sub add_position_term {
+ my $self = shift;
+ my $mt = KorAP::XML::Index::MultiTerm->new_blank;
+ $mt->set_term('_'. $_[0]);
+ $mt->set_o_start($_[1]);
+ $mt->set_o_end($_[2]);
+ push(@{$self->[MT]}, $mt);
+ $mt;
+};
+
+
sub set_o_start {
return $_[0]->[O_START] = $_[1];
};
@@ -130,15 +141,15 @@
sub _sort {
# Both are no spans
- if (index($a->[5], '<>:') != 0 && index($b->[5], '<>:') != 0) {
+ if (index($a->get_term, '<>:') != 0 && index($b->get_term, '<>:') != 0) {
# Both are attributes
# Order attributes by reference id
- if (index($a->[5], '@:') == 0 && index($b->[5], '@:') == 0) {
+ if (index($a->get_term, '@:') == 0 && index($b->get_term, '@:') == 0) {
# Check TUI
- my ($a_id) = ($a->[0] =~ m/^<s>(\d+)/);
- my ($b_id) = ($b->[0] =~ m/^<s>(\d+)/);
+ my ($a_id) = ($a->get_payload =~ m/^<s>(\d+)/);
+ my ($b_id) = ($b->get_payload =~ m/^<s>(\d+)/);
if ($a_id > $b_id) {
return 1;
}
@@ -152,18 +163,18 @@
# Both are relations
elsif (
- (index($a->[5],'<:') == 0 || index($a->[5],'>:') == 0) &&
- (index($b->[5], '<:') == 0 || index($b->[5],'>:') == 0)) {
+ (index($a->get_term,'<:') == 0 || index($a->get_term,'>:') == 0) &&
+ (index($b->get_term, '<:') == 0 || index($b->get_term,'>:') == 0)) {
my $a_end = ($a->get_pti < 34 ? $a->get_p_start : (
- ($a->get_pti == 35 ? ($a->[0] =~ /^(?:<i>\d+){4}<i>(\d+)</ && $1) :
- ($a->[0] =~ /^(?:<i>\d+){2}<i>(\d+)</ && $1)
+ ($a->get_pti == 35 ? ($a->get_payload =~ /^(?:<i>\d+){4}<i>(\d+)</ && $1) :
+ ($a->get_payload =~ /^(?:<i>\d+){2}<i>(\d+)</ && $1)
)
));
my $b_end = ($b->get_pti < 34 ? $b->get_p_start : (
- ($b->get_pti == 35 ? ($b->[0] =~ /^(?:<i>\d+){4}<i>(\d+)</ && $1) :
- ($b->[0] =~ /^(?:<i>\d+){2}<i>(\d+)</ && $1)
+ ($b->get_pti == 35 ? ($b->get_payload =~ /^(?:<i>\d+){4}<i>(\d+)</ && $1) :
+ ($b->get_payload =~ /^(?:<i>\d+){2}<i>(\d+)</ && $1)
)
));
@@ -178,8 +189,8 @@
# Both are either > or <
# Check for right positions
- (my $a_start, $a_end) = _rel_right_pos($a->get_pti, $a->[0]);
- (my $b_start, $b_end) = _rel_right_pos($b->get_pti, $b->[0]);
+ (my $a_start, $a_end) = _rel_right_pos($a->get_pti, $a->get_payload);
+ (my $b_start, $b_end) = _rel_right_pos($b->get_pti, $b->get_payload);
if ($a_start < $b_start) {
return -1;
}
@@ -199,31 +210,31 @@
};
# This has to be sorted alphabetically!
- return $a->[5] cmp $b->[5];
+ return $a->get_term cmp $b->get_term;
}
# Not identical
- elsif (index($a->[5], '<>:') != 0) {
- return $a->[5] cmp $b->[5];
+ elsif (index($a->get_term, '<>:') != 0) {
+ return $a->get_term cmp $b->get_term;
}
# Not identical
- elsif (index($b->[5], '<>:') != 0) {
- return $a->[5] cmp $b->[5];
+ elsif (index($b->get_term, '<>:') != 0) {
+ return $a->get_term cmp $b->get_term;
}
# Sort both spans
else {
- if ($a->[2] < $b->[2]) {
+ if ($a->get_p_end < $b->get_p_end) {
return -1;
}
- elsif ($a->[2] > $b->[2]) {
+ elsif ($a->get_p_end > $b->get_p_end) {
return 1;
}
# Check depth
else {
- my ($a_depth) = ($a->[0] ? $a->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
- my ($b_depth) = ($b->[0] ? $b->[0] =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
+ my ($a_depth) = ($a->get_payload ? $a->get_payload =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
+ my ($b_depth) = ($b->get_payload ? $b->get_payload =~ m/<b>(\d+)(?:<s>\d+)?$/ : 0);
$a_depth //= 0;
$b_depth //= 0;
@@ -234,7 +245,7 @@
return 1;
}
else {
- return $a->[5] cmp $b->[5];
+ return $a->get_term cmp $b->get_term;
};
};
};
diff --git a/lib/KorAP/XML/Index/MultiTermTokenStream.pm b/lib/KorAP/XML/Index/MultiTermTokenStream.pm
index 1b7bfa2..456f5f6 100644
--- a/lib/KorAP/XML/Index/MultiTermTokenStream.pm
+++ b/lib/KorAP/XML/Index/MultiTermTokenStream.pm
@@ -23,7 +23,7 @@
my ($self, $unit, $term) = @_;
if ($unit->type eq 'token') {
- my $mtt = $self->pos($unit->pos);
+ my $mtt = $self->pos($unit->get_pos);
my $node = $mtt->grep_mt($term);
# TODO: Check if term has PTI 128 - or what is wanted!
@@ -31,31 +31,42 @@
# TODO: if the node has no TUI - add!
return $node if $node;
- my $tui = $self->tui($unit->pos);
- return $mtt->add(
- term => $term,
- pti => 128,
- payload => '<s>' . $tui,
- tui => $tui
- );
+ my $tui = $self->tui($unit->get_pos);
+ # return $mtt->add(
+ # term => $term,
+ # pti => 128,
+ # payload => '<s>' . $tui,
+ # tui => $tui
+ # );
+ return $mtt->add_as_array(
+ '<s>' . $tui, # PAYLOAD=0
+ undef,
+ undef,
+ undef,
+ undef,
+ $term, # TERM=5
+ undef,
+ 128, # PTI=7
+ $tui # TUI=8
+ )
}
# Is span
else {
- my $mtt = $self->pos($unit->p_start);
+ my $mtt = $self->pos($unit->get_p_start);
my $node = $mtt->grep_mt('<>:' . $term);
# TODO: if the node has no TUI - add!
return $node if $node;
- my $tui = $self->tui($unit->p_start);
+ my $tui = $self->tui($unit->get_p_start);
return $mtt->add(
term => '<>:' . $term,
- o_start => $unit->o_start,
- o_end => $unit->o_end,
- p_start => $unit->p_start,
- p_end => $unit->p_end,
+ o_start => $unit->get_o_start,
+ o_end => $unit->get_o_end,
+ p_start => $unit->get_p_start,
+ p_end => $unit->get_p_end,
pti => 64,
payload => '<b>0<s>' . $tui,
tui => $tui
@@ -73,10 +84,8 @@
};
sub pos {
- my $self = shift;
- my $pos = shift;
- return unless defined $pos;
- return $self->[MTT]->[$pos];
+ return unless defined $_[1];
+ return $_[0]->[MTT]->[$_[1]];
};
sub to_string {
@@ -89,10 +98,8 @@
};
sub tui {
- my $self = shift;
- my $pos = shift;
- return unless defined $pos;
- return ++$self->[TUI]->[$pos];
+ return unless defined $_[1];
+ return ++$_[0]->[TUI]->[$_[1]];
};
sub to_array {
diff --git a/lib/KorAP/XML/Krill.pm b/lib/KorAP/XML/Krill.pm
index 7434fa1..ee371aa 100644
--- a/lib/KorAP/XML/Krill.pm
+++ b/lib/KorAP/XML/Krill.pm
@@ -6,14 +6,12 @@
use Scalar::Util qw/weaken/;
use XML::Fast;
use Try::Tiny;
-use Carp qw/croak carp/;
use KorAP::XML::Document::Primary;
use KorAP::XML::Tokenizer;
use Log::Log4perl;
use KorAP::XML::Log;
use Cache::FastMmap;
use Mojo::DOM;
-use Data::Dumper;
use File::Spec::Functions qw/catdir catfile catpath splitdir splitpath rel2abs/;
our $VERSION = '0.41';
@@ -204,9 +202,9 @@
unless ($tokens->parse) {
$self->log->warn(
'Unable to tokenize ' . $self->path .
- ' with ' . $token_foundry . '#'
- . $token_layer
- );
+ ' with ' . $token_foundry . '#'
+ . $token_layer
+ );
}
else {
weaken $self;
@@ -265,7 +263,7 @@
$hash{_k($_)} = $meta->keywords($_);
}
else {
- $v =~ s/\n/ /g;
+ $v =~ tr/\n/ /;
$v =~ s/\s\s+/ /g;
$hash{_k($_)} = $v;
};
@@ -280,10 +278,7 @@
sub _k {
- my $x = substr($_[0], 2);
- $x =~ s/_(\w)/\U$1\E/g;
- $x =~ s/id$/ID/gi;
- return $x;
+ substr($_[0], 2) =~ s/_(\w)/\U$1\E/gr =~ s/id$/ID/gir;
};
@@ -303,41 +298,6 @@
__END__
-sub to_string {
- my $self = shift;
-
- my $string;
-
- foreach (@ATTR) {
- if (my $att = $self->$_) {
- $att =~ s/\n/ /g;
- $att =~ s/\s\s+/ /g;
- $string .= $_ . ' = ' . $att . "\n";
- };
- };
-
- $string .= 'text_class = ' . $self->text_class_string . "\n";
- $string .= 'keywords = ' . $self->keywords_string . "\n";
-
- return $string;
-};
-
-# Todo: Make this a KoralQuery serializer
-sub to_koral_query {
- my $self = shift;
- my $hash = {};
- $hash->{'@context'} = 'http://korap.ids-mannheim.de/ns/koral/0.4/context.jsonld';
- $hash->{'@type'} = 'koral:corpus';
-# $hash->{'text'} = $self->primary->data;
-# my $hash = $self->to_hash;
-};
-
-
-1;
-
-
-__END__
-
=pod
=encoding utf8
diff --git a/lib/KorAP/XML/Tokenizer.pm b/lib/KorAP/XML/Tokenizer.pm
index c2d0c55..51978cd 100644
--- a/lib/KorAP/XML/Tokenizer.pm
+++ b/lib/KorAP/XML/Tokenizer.pm
@@ -119,12 +119,14 @@
my $mtt;
my $distance = 0;
# my (@non_word_tokens);
+
+ my $p = $doc->primary;
foreach my $span (@$tokens) {
my $from = $span->{'-from'};
my $to = $span->{'-to'};
# Get the subring from primary data
- my $token = $doc->primary->data($from, $to);
+ my $token = $p->data($from, $to);
# Token is undefined
unless (defined $token) {
@@ -188,10 +190,10 @@
$old = $to + 1;
# Add position term
- $mtt->add(
- term => '_' . $have,
- o_start => $mtt->get_o_start,
- o_end => $mtt->get_o_end
+ $mtt->add_position_term(
+ $have,
+ $mtt->get_o_start,
+ $mtt->get_o_end
);
$have++;
@@ -206,14 +208,12 @@
$mtts->add_meta('tokens', '<i>' . $have);
# Add text boundary
- $mtts->pos(0)->add(
- term => '<>:base/s:t',
- o_start => 0,
- p_end => $have,
- o_end => $doc->primary->data_length,
- payload => '<b>0',
- pti => 64
- );
+ my $tb = $mtts->pos(0)->add('<>:base/s:t');
+ $tb->set_o_start(0);
+ $tb->set_p_end($have);
+ $tb->set_o_end($doc->primary->data_length);
+ $tb->set_payload('<b>0');
+ $tb->set_pti(64);
# Create a gap for the end
if ($doc->primary->data_length >= ($old - 1)) {
@@ -257,27 +257,27 @@
my $from = $-[1];
my $to = $+[1];
$mtt->add(
- term => 'i^1:' . substr($os, $from, $from + $to),
- o_start => $from + $o_start,
- o_end => $to + $o_start
+ term => 'i^1:' . substr($os, $from, $to - $from), # 3rd substr argument is a length, not an end offset
+ o_start => $from + $o_start,
+ o_end => $to + $o_start
) unless $to - $from == $l;
};
while ($s =~ /(0+)[^0]/g) {
my $from = $-[1];
my $to = $+[1];
$mtt->add(
- term => 'i^2:' . substr($os, $from, $from + $to),
- o_start => $from + $o_start,
- o_end => $to + $o_start
+ term => 'i^2:' . substr($os, $from, $to - $from), # 3rd substr argument is a length, not an end offset
+ o_start => $from + $o_start,
+ o_end => $to + $o_start
) unless $to - $from == $l;
};
while ($s =~ /(#)/g) {
my $from = $-[1];
my $to = $+[1];
$mtt->add(
- term => 'i^3:' . substr($os, $from, $from + $to),
- o_start => $from + $o_start,
- o_end => $to + $o_start
+ term => 'i^3:' . substr($os, $from, $to - $from), # 3rd substr argument is a length, not an end offset
+ o_start => $from + $o_start,
+ o_end => $to + $o_start
) unless $to - $from == $l;
};
};
@@ -421,9 +421,9 @@
my $mod = 'KorAP::XML::Annotation::' . $foundry . '::' . $layer;
if ($mod->can('new') || eval("require $mod; 1;")) {
- my $obj = $mod->new($self);
+ my $obj = $mod->new($self);
- if (my $retval = $obj->parse(@_)) {
+ if (my $retval = $obj->parse(@_)) {
# This layer is supported
$self->support($foundry => $layer, @_);
@@ -458,7 +458,7 @@
my $b_quota = ($b_have * 100) / $b_should;
return sprintf("%.2f", $a_quota) . '%' .
((($a_quota + $b_quota) <= 100) ?
- ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : '');
+ ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : '');
};