Fixed primary data problems, sped up parsing by using more C code, and now provide layer info
diff --git a/Makefile.PL b/Makefile.PL
index 804bcaf..7ee0a7a 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -21,7 +21,8 @@
'strict' => 0,
'warnings' => 0,
'utf8' => 0,
- 'bytes' => 0
+ 'bytes' => 0,
+ 'List::MoreUtils' => 0.33
},
# LICENSE => 'perl',
MIN_PERL_VERSION => '5.016',
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 5b4ef73..2cb023a 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -1,9 +1,9 @@
package KorAP::Document;
use Mojo::Base -base;
use v5.16;
-
use Mojo::ByteStream 'b';
-use Mojo::DOM;
+use XML::Fast;
+use Try::Tiny;
use Carp qw/croak/;
use KorAP::Document::Primary;
@@ -19,17 +19,26 @@
my $self = shift;
my $file = b($self->path . 'data.xml')->slurp;
+ my ($rt, $error);
state $unable = 'Unable to parse document ' . $self->path;
+ try {
+ local $SIG{__WARN__} = sub {
+ $error = 1;
+ };
+ $rt = xml2hash($file, text => '#text', attr => '-')->{raw_text};
+ }
+ catch {
+ $self->log->warn($unable);
+ $error = 1;
+ };
+
+ return if $error;
$self->log->debug('Parse document ' . $self->path);
- my $dom = Mojo::DOM->new($file);
-
- my $rt = $dom->at('raw_text');
-
# Get document id and corpus id
- if ($rt && $rt->attr('docid')) {
- $self->id($rt->attr('docid'));
+ if ($rt && $rt->{'-docid'}) {
+ $self->id($rt->{'-docid'});
if ($self->id =~ /^([^_]+)_/) {
$self->corpus_id($1);
}
@@ -42,11 +51,9 @@
};
# Get primary data
- my $pd = $rt->at('text');
+ my $pd = $rt->{text};
if ($pd) {
-
- $pd = b($pd->text)->decode;
- $self->{pd} = KorAP::Document::Primary->new($pd->to_string);
+ $self->{pd} = KorAP::Document::Primary->new($pd);
}
else {
croak $unable;
diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 7675737..2064e6e 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm
@@ -1,6 +1,7 @@
package KorAP::Field::MultiTermToken;
use KorAP::Field::MultiTerm;
use Mojo::Base -base;
+use List::MoreUtils 'uniq';
has [qw/o_start o_end/];
@@ -33,7 +34,7 @@
sub to_array {
my $self = shift;
- [map($_->to_string, @{$self->{mt}})];
+ [uniq(map($_->to_string, @{$self->{mt}}))];
};
1;
diff --git a/lib/KorAP/Index/Base.pm b/lib/KorAP/Index/Base.pm
index cc53420..7e447d6 100644
--- a/lib/KorAP/Index/Base.pm
+++ b/lib/KorAP/Index/Base.pm
@@ -24,4 +24,8 @@
bless \$tokens, $class;
};
+sub layer_info { # Default layer descriptors: none
+ return [] # Empty array ref; presumably overridden by the concrete index modules - confirm
+};
+
1;
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index 6b45f63..595c527 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -1,8 +1,6 @@
package KorAP::Index::Base::Paragraphs;
use KorAP::Index::Base;
-
-
sub parse {
my $self = shift;
my $i = 0;
@@ -27,4 +25,6 @@
return 1;
};
+
+
1;
diff --git a/lib/KorAP/Index/Connexor/Morpho.pm b/lib/KorAP/Index/Connexor/Morpho.pm
index be20d33..1aaa991 100644
--- a/lib/KorAP/Index/Connexor/Morpho.pm
+++ b/lib/KorAP/Index/Connexor/Morpho.pm
@@ -58,5 +58,10 @@
return 1;
};
+sub layer_info { # Layer descriptors of this module, returned as an array ref
+ ['cnx/l=lemma', 'cnx/p=pos', 'cnx/m=msd']; # Presumably "short term prefix=layer name" pairs - confirm
+};
+
+
1;
diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
index e540169..f6dbe99 100644
--- a/lib/KorAP/Index/Connexor/Phrase.pm
+++ b/lib/KorAP/Index/Connexor/Phrase.pm
@@ -19,7 +19,7 @@
if ($type) {
my $mtt = $stream->pos($span->p_start);
$mtt->add(
- term => '<>:cnx/const:' . $type,
+ term => '<>:cnx/c:' . $type,
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end
@@ -31,5 +31,9 @@
return 1;
};
+sub layer_info { # Layer descriptors of this module, returned as an array ref
+ ['cnx/c=const']; # 'cnx/c' is the prefix of the span terms added in parse ('<>:cnx/c:' . $type)
+};
+
1;
diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
index 430dff5..f840d1e 100644
--- a/lib/KorAP/Index/Mate/Dependency.pm
+++ b/lib/KorAP/Index/Mate/Dependency.pm
@@ -52,5 +52,9 @@
return 1;
};
+sub layer_info { # Layer descriptors of this module, returned as an array ref
+ ['mate/d=dep'] # Presumably a "short term prefix=layer name" pair - confirm
+};
+
1;
diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
index 55ecbbc..663224a 100644
--- a/lib/KorAP/Index/Mate/Morpho.pm
+++ b/lib/KorAP/Index/Mate/Morpho.pm
@@ -49,5 +49,8 @@
return 1;
};
+sub layer_info { # Layer descriptors of this module, returned as an array ref
+ ['mate/l=lemma', 'mate/p=pos', 'mate/m=msd'] # Presumably "short term prefix=layer name" pairs - confirm
+}
1;
diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
index f03b17f..2de5042 100644
--- a/lib/KorAP/Index/OpenNLP/Morpho.pm
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm
@@ -27,4 +27,8 @@
return 1;
};
+sub layer_info { # Layer descriptors of this module, returned as an array ref
+ ['opennlp/p=pos']; # Presumably a "short term prefix=layer name" pair - confirm
+};
+
1;
diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
index e4de00e..d83267a 100644
--- a/lib/KorAP/Index/TreeTagger/Morpho.pm
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm
@@ -44,5 +44,8 @@
return 1;
};
+sub layer_info { # Layer descriptors of this module, returned as an array ref
+ ['tt/p=pos', 'tt/l=lemma'] # Presumably "short term prefix=layer name" pairs - confirm
+};
1;
diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
index 7b70424..341a9da 100644
--- a/lib/KorAP/Index/XIP/Constituency.pm
+++ b/lib/KorAP/Index/XIP/Constituency.pm
@@ -48,7 +48,7 @@
# $type is now NPA, NP, NUM ...
my %term = (
- term => '<>:xip/const:' . $type,
+ term => '<>:xip/c:' . $type,
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end
@@ -79,5 +79,8 @@
return 1;
};
+sub layer_info { # Layer descriptors of this module, returned as an array ref
+ ['xip/c=const'] # 'xip/c' is the prefix of the span terms built in parse ('<>:xip/c:' . $type)
+}
1;
diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
index 98e121e..20ab151 100644
--- a/lib/KorAP/Index/XIP/Dependency.pm
+++ b/lib/KorAP/Index/XIP/Dependency.pm
@@ -54,5 +54,9 @@
return 1;
};
+sub layer_info { # Layer descriptors of this module, returned as an array ref
+ ['xip/d=dep'] # Presumably a "short term prefix=layer name" pair - confirm
+}
+
1;
diff --git a/lib/KorAP/Index/XIP/Morpho.pm b/lib/KorAP/Index/XIP/Morpho.pm
index 83b484e..45e7c9a 100644
--- a/lib/KorAP/Index/XIP/Morpho.pm
+++ b/lib/KorAP/Index/XIP/Morpho.pm
@@ -57,5 +57,9 @@
return 1;
};
+sub layer_info { # Layer descriptors of this module, returned as an array ref
+ ['xip/l=lemma', 'xip/p=pos'] # Presumably "short term prefix=layer name" pairs - confirm
+};
+
1;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 3f2c7f2..e5c7b51 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -2,12 +2,15 @@
use Mojo::Base -base;
use Mojo::ByteStream 'b';
use Mojo::Loader;
+use XML::Fast;
+use Try::Tiny;
use Carp qw/croak/;
use KorAP::Tokenizer::Range;
use KorAP::Tokenizer::Match;
use KorAP::Tokenizer::Spans;
use KorAP::Tokenizer::Tokens;
use KorAP::Field::MultiTermTokenStream;
+use List::MoreUtils 'uniq';
use JSON::XS;
use Log::Log4perl;
@@ -26,8 +29,8 @@
my $mtts = KorAP::Field::MultiTermTokenStream->new;
my $path = $self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml';
my $file = b($path)->slurp;
- my $tokens = Mojo::DOM->new($file);
- $tokens->xml(1);
+# my $tokens = Mojo::DOM->new($file);
+# $tokens->xml(1);
my $doc = $self->doc;
@@ -41,25 +44,58 @@
$self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);
+ # TODO: Reuse the following code from Spans.pm and tokens.pm
+ my ($tokens, $error);
+ try {
+ local $SIG{__WARN__} = sub {
+ $error = 1;
+ };
+ $tokens = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList};
+ }
+ catch {
+ $self->log->warn('Token error in ' . $path . ($_ ? ': ' . $_ : ''));
+ $error = 1;
+ };
+
+ return if $error;
+
+ if (ref $tokens && $tokens->{span}) {
+ $tokens = $tokens->{span};
+ }
+ else {
+ return [];
+ };
+
+ $tokens = [$tokens] if ref $tokens ne 'ARRAY';
+
# Iterate over all tokens
- $tokens->find('span')->each(
- sub {
- my $span = $_;
- my $from = $span->attr('from');
- my $to = $span->attr('to');
+ # $tokens->find('span')->each(
+ # sub {
+ # my $span = $_;
+ foreach my $span (@$tokens) {
+ my $from = $span->{'-from'};
+ my $to = $span->{'-to'};
my $token = $doc->primary->data($from, $to);
# warn 'Has ' . $from . '->' . $to . "($old)";
unless (defined $token) {
$self->log->error("Unable to find substring [$from-$to] in $path");
- return;
+ next;
};
$should++;
# Ignore non-word tokens
- return if $token !~ /[\w\d]/;
+ next if $token !~ /[\w\d]/;
+
+# my $limit = 40;
+# if ($should > $limit) {
+# warn $token;
+# };
+# if ($should > $limit+20) {
+# die;
+# };
my $mtt = $mtts->add;
@@ -86,7 +122,7 @@
$mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);
$have++;
- });
+ };
# Add token count
$mtts->add_meta('tokens', '<i>' . $have);
@@ -213,7 +249,12 @@
if ($mod->can('new') || eval("require $mod; 1;")) {
if (my $retval = $mod->new($self)->parse(@_)) {
+
+ # This layer is supported
$self->support($foundry => $layer, @_);
+
+ # Get layerinfo
+ $self->layer_info($mod->layer_info);
return $retval;
};
}
@@ -270,6 +311,17 @@
push(@{$self->{support}->{$f}}, [$l, @info]);
};
+sub layer_info { # Combined accumulator and getter for layer descriptors
+ my $self = shift;
+ $self->{layer_info} //= []; # Create the accumulator on first use
+ if ($_[0]) { # With a true argument: append its elements (it is dereferenced as an array ref)
+ push(@{$self->{layer_info}}, @{$_[0]});
+ }
+ else { # Without one: return the de-duplicated entries joined by single spaces (a string, not a ref)
+ return join ' ', uniq @{$self->{layer_info}};
+ };
+};
+
sub to_string {
my $self = shift;
@@ -290,6 +342,10 @@
$string .= 'support = ' . $foundry . '#' . join(',', @{$_}) . "\n";
};
};
+ foreach my $layer_info (split ' ', $self->layer_info) { # The getter returns a space-joined string, not a hash ref
+ $string .= 'layer_info = ' . $layer_info . "\n"; # Use the named loop variable; $_ is not set by this foreach
+ };
+
$string .= "</info>\n";
$string .= $self->stream->to_string;
$string .= "</field>";
@@ -308,7 +364,8 @@
name => $self->name,
data => $self->stream->to_array,
tokenization => lc($self->foundry) . '#' . lc($self->layer),
- foundries => $self->support
+ foundries => $self->support,
+ layerInfo => $self->layer_info
});
$data{fields} = \@fields;
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 045cf76..d5c6dfa 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -1,7 +1,7 @@
package KorAP::Tokenizer::Spans;
use Mojo::Base 'KorAP::Tokenizer::Units';
use KorAP::Tokenizer::Span;
-use Mojo::DOM;
+# use Mojo::DOM;
use Mojo::ByteStream 'b';
use XML::Fast;
use Try::Tiny;
@@ -20,11 +20,6 @@
my $file = b($path)->slurp;
- # my $spans = Mojo::DOM->new($file);
- # $spans->xml(1);
-
- # my $spans = XML::LibXML->load_xml(string => $file);
-
my ($spans, $error);
try {
local $SIG{__WARN__} = sub {