Indexation script finished
diff --git a/Makefile.PL b/Makefile.PL
index b076284..db22f55 100644
--- a/Makefile.PL
+++ b/Makefile.PL
@@ -13,11 +13,14 @@
'Mojolicious' => 4.51,
'Packed::Array' => 0.01,
'Log::Log4perl' => 1.42,
- 'Carp' => 0,
- 'strict' => 0,
- 'warnings' => 0,
- 'utf8' => 0,
- 'bytes' => 0
+ 'JSON::XS' => 3.01,
+ 'Set::Scalar' => 1.26,
+ 'Benchmark' => 0,
+ 'Carp' => 0,
+ 'strict' => 0,
+ 'warnings' => 0,
+ 'utf8' => 0,
+ 'bytes' => 0
},
# LICENSE => 'perl',
MIN_PERL_VERSION => '5.016',
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 8f80e6c..4a07f8e 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -4,11 +4,15 @@
use Mojo::ByteStream 'b';
use Mojo::DOM;
-use Carp qw/croak carp/;
+use Carp qw/croak/;
use KorAP::Document::Primary;
-has [qw/id corpus_id path/];
-has [qw/pub_date title sub_title pub_place/];
+our @ATTR = qw/id corpus_id pub_date
+ title sub_title pub_place/;
+has 'path';
+has [@ATTR];
+
+has log => sub { Log::Log4perl->get_logger(__PACKAGE__) };
# parse document
sub parse {
@@ -17,7 +21,7 @@
state $unable = 'Unable to parse document';
- carp 'Parse document ' . $self->path;
+ $self->log->trace('Parse document ' . $self->path);
my $dom = Mojo::DOM->new($file);
@@ -84,7 +88,7 @@
sub _parse_meta {
my $self = shift;
- my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1')->encode;
+ my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1');
my $dom = Mojo::DOM->new($file);
my $monogr = $dom->at('monogr');
@@ -126,6 +130,55 @@
$self->text_class(@topic);
};
+sub to_string {
+ my $self = shift;
+
+ my $string;
+
+ foreach (@ATTR) {
+ if (my $att = $self->$_) {
+ $att =~ s/\n/ /g;
+ $att =~ s/\s\s+/ /g;
+ $string .= $_ . ' = ' . $att . "\n";
+ };
+ };
+
+ if ($self->author) {
+ foreach (@{$self->author}) {
+ $_ =~ s/\n/ /g;
+ $_ =~ s/\s\s+/ /g;
+ $string .= 'author = ' . $_ . "\n";
+ };
+ };
+
+ if ($self->text_class) {
+ foreach (@{$self->text_class}) {
+ $string .= 'text_class = ' . $_ . "\n";
+ };
+ };
+
+ return $string;
+};
+
+
+sub to_hash {
+ my $self = shift;
+
+ my %hash;
+
+ foreach (@ATTR, 'author', 'text_class') {
+ if (my $att = $self->$_) {
+ $att =~ s/\n/ /g;
+ $att =~ s/\s\s+/ /g;
+ $hash{$_} = $att;
+ };
+ };
+
+ return \%hash;
+};
+
+
+
1;
diff --git a/lib/KorAP/Document/Primary.pm b/lib/KorAP/Document/Primary.pm
index 52ca844..be0a234 100644
--- a/lib/KorAP/Document/Primary.pm
+++ b/lib/KorAP/Document/Primary.pm
@@ -5,7 +5,10 @@
use Mojo::ByteStream 'b';
use feature 'state';
use Packed::Array;
+use utf8;
+# our $QUOT = b("„“”")->decode;
+our $QUOT_RE = qr/[„“”]/;
# Constructor
sub new {
@@ -19,13 +22,37 @@
my $self = shift;
my ($from, $to) = @_;
- return b(substr($self->[0], $from))->encode if $from && !$to;
+# return b(substr($self->[0], $from))->encode if $from && !$to;
+ return substr($self->[0], $from) if $from && !$to;
- return b($self->[0])->encode unless $to;
+# return b($self->[0])->encode unless $to;
+ return $self->[0] unless $to;
my $substr = substr($self->[0], $from, $to - $from);
if ($substr) {
- return b($substr)->encode;
+# return b($substr)->encode;
+ return $substr;
+ };
+ # encode 'UTF-8',
+ croak 'Unable to find substring';
+};
+
+
+sub data_bytes {
+ my $self = shift;
+ my ($from, $to) = @_;
+
+ use bytes;
+
+ return b(substr($self->[0], $from))->decode if $from && !$to;
+
+# return b($self->[0])->encode unless $to;
+ return b($self->[0])->decode unless $to;
+
+ my $substr = substr($self->[0], $from, $to - $from);
+ if ($substr) {
+# return b($substr)->encode;
+ return b($substr)->decode;
};
# encode 'UTF-8',
croak 'Unable to find substring';
@@ -45,17 +72,31 @@
sub bytes2chars {
my $self = shift;
unless ($self->[2]) {
- $self->_calc_chars;
+ $self->[2] = $self->_calc_chars($self->[0]);
};
return $self->[2]->[shift];
};
+# Get correct offset
+sub xip2chars {
+ my $self = shift;
+ unless ($self->[3]) {
+ my $buffer = $self->[0];
+
+ # Hacky work around: replace fancy quotation marks for XIP
+ $buffer =~ s{$QUOT_RE}{"}g;
+
+ $self->[3] = $self->_calc_chars($buffer);
+ };
+ return $self->[3]->[shift];
+};
# Calculate character offsets
sub _calc_chars {
use bytes;
-
my $self = shift;
+ my $text = shift;
+
tie my @array, 'Packed::Array';
state $leading = pack( 'B8', '10000000' );
@@ -65,14 +106,14 @@
my $c;
# Init array
- my $l = length($self->[0]);
+ my $l = length($text);
$array[$l-1] = 0;
# Iterate over every character
- while ($i < $l) {
+ while ($i <= $l) {
# Get actual character
- $c = substr($self->[0], $i, 1);
+ $c = substr($text, $i, 1);
# store character position
$array[$i++] = $j;
@@ -81,7 +122,7 @@
if (ord($c & $leading) && ord($c & $start)) {
# Get the next byte - expecting a following character
- $c = substr($self->[0], $i, 1);
+ $c = substr($text, $i, 1);
# Character is part of a multibyte
while (ord($c & $leading)) {
@@ -90,14 +131,13 @@
$array[$i] = (ord($c & $start)) ? ++$j : $j;
# Get next character
- $c = substr($self->[0], ++$i, 1);
+ $c = substr($text, ++$i, 1);
};
};
$j++;
};
-
- $self->[2] = \@array;
+ return \@array;
};
diff --git a/lib/KorAP/MultiTerm.pm b/lib/KorAP/Field/MultiTerm.pm
similarity index 95%
rename from lib/KorAP/MultiTerm.pm
rename to lib/KorAP/Field/MultiTerm.pm
index 9dd12a9..7381a0d 100644
--- a/lib/KorAP/MultiTerm.pm
+++ b/lib/KorAP/Field/MultiTerm.pm
@@ -1,4 +1,4 @@
-package KorAP::MultiTerm;
+package KorAP::Field::MultiTerm;
use Mojo::Base -base;
has [qw/p_start p_end o_start o_end term payload/];
diff --git a/lib/KorAP/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
similarity index 65%
rename from lib/KorAP/MultiTermToken.pm
rename to lib/KorAP/Field/MultiTermToken.pm
index 9df9811..7675737 100644
--- a/lib/KorAP/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm
@@ -1,5 +1,5 @@
-package KorAP::MultiTermToken;
-use KorAP::MultiTerm;
+package KorAP::Field::MultiTermToken;
+use KorAP::Field::MultiTerm;
use Mojo::Base -base;
has [qw/o_start o_end/];
@@ -9,10 +9,10 @@
my $mt;
unless (ref $_[0] eq 'MultiTerm') {
if (@_ == 1) {
- $mt = KorAP::MultiTerm->new(term => shift());
+ $mt = KorAP::Field::MultiTerm->new(term => shift());
}
else {
- $mt = KorAP::MultiTerm->new(@_);
+ $mt = KorAP::Field::MultiTerm->new(@_);
};
}
else {
@@ -31,4 +31,9 @@
return $string;
};
+sub to_array {
+ my $self = shift;
+ [map($_->to_string, @{$self->{mt}})];
+};
+
1;
diff --git a/lib/KorAP/MultiTermTokenStream.pm b/lib/KorAP/Field/MultiTermTokenStream.pm
similarity index 68%
rename from lib/KorAP/MultiTermTokenStream.pm
rename to lib/KorAP/Field/MultiTermTokenStream.pm
index 6d7dc29..d2e66a8 100644
--- a/lib/KorAP/MultiTermTokenStream.pm
+++ b/lib/KorAP/Field/MultiTermTokenStream.pm
@@ -1,12 +1,12 @@
-package KorAP::MultiTermTokenStream;
+package KorAP::Field::MultiTermTokenStream;
use Mojo::Base -base;
-use KorAP::MultiTermToken;
+use KorAP::Field::MultiTermToken;
has [qw/oStart oEnd/];
sub add {
my $self = shift;
- my $mtt = shift // KorAP::MultiTermToken->new;
+ my $mtt = shift // KorAP::Field::MultiTermToken->new;
$self->{mtt} //= [];
push(@{$self->{mtt}}, $mtt);
return $mtt;
@@ -30,4 +30,9 @@
return join("\n" , map { $_->to_string } @{$self->{mtt}}) . "\n";
};
+sub to_array {
+ my $self = shift;
+ [ map { $_->to_array } @{$self->{mtt}} ];
+};
+
1;
diff --git a/lib/KorAP/Index/Base.pm b/lib/KorAP/Index/Base.pm
new file mode 100644
index 0000000..cc53420
--- /dev/null
+++ b/lib/KorAP/Index/Base.pm
@@ -0,0 +1,27 @@
+package KorAP::Index::Base;
+
+use strict;
+use warnings;
+
+sub import {
+ my $class = shift;
+ my $caller = caller;
+
+ no strict 'refs';
+
+ push @{"${caller}::ISA"}, $class;
+
+ strict->import;
+ warnings->import;
+ utf8->import;
+ feature->import(':5.10');
+};
+
+
+sub new {
+ my $class = shift;
+ my $tokens = shift;
+ bless \$tokens, $class;
+};
+
+1;
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
new file mode 100644
index 0000000..35199d4
--- /dev/null
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -0,0 +1,30 @@
+package KorAP::Index::Base::Paragraphs;
+use KorAP::Index::Base;
+
+
+
+sub parse {
+ my $self = shift;
+ my $i = 0;
+ $$self->add_spandata(
+ foundry => 'base',
+ layer => 'paragraph',
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:p',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ ) or return;
+
+ $$self->stream->add_meta('p', '<i>' . $i);
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/Index/Connexor/Morpho.pm b/lib/KorAP/Index/Connexor/Morpho.pm
new file mode 100644
index 0000000..74533bf
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Morpho.pm
@@ -0,0 +1,62 @@
+package KorAP::Index::Connexor::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+
+ $$self->add_tokendata(
+ foundry => 'connexor',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash->{fs}->{f};
+
+ my $found;
+
+ my $features = $content->{fs}->{f};
+
+ for my $f (@$features) {
+
+ # Lemma
+ if (($f->{-name} eq 'lemma') && ($found = $f->{'#text'})) {
+ if (index($found, "\N{U+00a0}") >= 0) {
+ foreach (split(/\x{00A0}/, $found)) {
+ $mtt->add(
+ term => 'cnx_l:' . $_
+ );
+ }
+ }
+ else {
+ $mtt->add(
+ term => 'cnx_l:' . $found
+ );
+ };
+ }
+
+ # POS
+ elsif (($f->{-name} eq 'pos') && ($found = $f->{'#text'})) {
+ $mtt->add(
+ term => 'cnx_p:' . $found
+ );
+
+ }
+ # MSD
+ # Todo: Look in the description!
+ elsif (($f->{-name} eq 'msd') && ($found = $f->{'#text'})) {
+ foreach (split(':', $found)) {
+ $mtt->add(
+ term => 'cnx_m:' . $_
+ );
+ };
+ };
+ };
+ }
+ ) or return;
+
+ return 1;
+};
+
+
+1;
diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
new file mode 100644
index 0000000..60fcc8e
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Phrase.pm
@@ -0,0 +1,35 @@
+package KorAP::Index::Connexor::Phrase;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+
+ $$self->add_spandata(
+ foundry => 'connexor',
+ layer => 'phrase',
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ my $content = $span->hash->{fs}->{f};
+
+ return if $content->{-name} ne 'pos';
+
+ my $type = $content->{'#text'};
+
+ if ($type) {
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:cnx_const:' . $type,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ };
+ }
+ ) or return;
+
+ return 1;
+};
+
+
+1;
diff --git a/lib/KorAP/Index/Connexor/Syntax.pm b/lib/KorAP/Index/Connexor/Syntax.pm
new file mode 100644
index 0000000..35f5079
--- /dev/null
+++ b/lib/KorAP/Index/Connexor/Syntax.pm
@@ -0,0 +1,31 @@
+package KorAP::Index::Connexor::Syntax;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+
+ $$self->add_tokendata(
+ foundry => 'connexor',
+ layer => 'syntax',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $found;
+ my $spans = $token->hash->{fs}->{f}->{fs}->{f};
+
+
+ # syntax
+ foreach (@$spans) {
+ if (($_->{-name} eq 'pos') && ($found = $_->{'#text'})) {
+ $mtt->add(
+ term => 'cnx_syn:' . $found
+ );
+ };
+ };
+ }) or return;
+
+ return 1;
+};
+
+
+1;
diff --git a/lib/KorAP/Index/CoreNLP/NamedEntities.pm b/lib/KorAP/Index/CoreNLP/NamedEntities.pm
new file mode 100644
index 0000000..8596783
--- /dev/null
+++ b/lib/KorAP/Index/CoreNLP/NamedEntities.pm
@@ -0,0 +1,32 @@
+package KorAP::Index::CoreNLP::NamedEntities;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+ my $model = shift;
+
+ $$self->add_tokendata(
+ foundry => 'corenlp',
+ layer => $model,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash->{fs}->{f} or return;
+ my $found;
+
+ if (($content->{-name} eq 'ne') &&
+ ($found = $content->{fs}) &&
+ ($found = $found->{f}) &&
+ ($found->{-name} eq 'ent') &&
+ ($found = $found->{'#text'})) {
+ $mtt->add(
+ term => 'corenlp_' . $model . ':' . $found
+ );
+ };
+ }) or return;
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
new file mode 100644
index 0000000..d300363
--- /dev/null
+++ b/lib/KorAP/Index/Mate/Dependency.pm
@@ -0,0 +1,56 @@
+package KorAP::Index::Mate::Dependency;
+use KorAP::Index::Base;
+use Data::Dumper;
+
+sub parse {
+ my $self = shift;
+
+ # TODO: Create XIP tree here - for indirect dependency
+ # >>:xip_d:SUBJ<i>566<i>789
+
+ $$self->add_tokendata(
+ foundry => 'mate',
+ layer => 'dependency',
+ cb => sub {
+ my ($stream, $token, $tokens) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash;
+
+ my $rel = $content->{rel};
+ $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+ foreach (@$rel) {
+ my $label = $_->{-label};
+
+ if ($_->{-type} && $_->{-type} eq 'unary') {
+ next if $_->{-label} eq '--';
+ $mtt->add(
+ term => 'mate_d:' . $label
+ );
+ }
+ else {
+
+ my $from = $_->{span}->{-from};
+ my $to = $_->{span}->{-to};
+
+ my $rel_token = $tokens->token($from, $to) or next;
+
+ $mtt->add(
+ term => '>:mate_d:' . $label,
+ payload => '<i>' . $rel_token->pos
+ );
+
+ $stream->pos($rel_token->pos)->add(
+ term => '<:mate_d:' . $label,
+ payload => '<i>' . $token->pos
+ );
+ };
+ };
+ }) or return;
+
+ return 1;
+};
+
+
+1;
diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
new file mode 100644
index 0000000..b1a6803
--- /dev/null
+++ b/lib/KorAP/Index/Mate/Morpho.pm
@@ -0,0 +1,54 @@
+package KorAP::Index::Mate::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+
+ $$self->add_tokendata(
+ foundry => 'mate',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash->{fs}->{f};
+
+ my $found;
+
+ my $capital = 0;
+
+ foreach my $f (@{$content->{fs}->{f}}) {
+
+ # pos
+ if (($f->{-name} eq 'pos') &&
+ ($found = $f->{'#text'})) {
+ $mtt->add(term => 'mate_p:' . $found
+ );
+ }
+
+ # lemma
+ elsif (($f->{-name} eq 'lemma')
+ && ($found = $f->{'#text'})
+ && $found ne '--') {
+ # b($found)->decode('latin-1')->encode->to_string
+ $mtt->add(term => 'mate_l:' . $found);
+ }
+
+ # MSD
+ elsif (($f->{-name} eq 'msd') &&
+ ($found = $f->{'#text'}) &&
+ ($found ne '_')) {
+ foreach (split '\|', $found) {
+ my ($x, $y) = split "=", $_;
+ # case, tense, number, mood, person, degree, gender
+ $mtt->add(term => 'mate_m:' . $x . ':' . $y);
+ };
+ };
+ };
+ }) or return;
+
+ return 1;
+};
+
+
+1;
diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
new file mode 100644
index 0000000..20716b0
--- /dev/null
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm
@@ -0,0 +1,30 @@
+package KorAP::Index::OpenNLP::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+
+ $$self->add_tokendata(
+ foundry => 'opennlp',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash->{fs}->{f} or return;
+
+ $content = $content->{fs}->{f};
+ my $found;
+
+ # syntax
+ if (($content->{-name} eq 'pos') && ($content->{'#text'})) {
+ $mtt->add(
+ term => 'opennlp_p:' . $content->{'#text'}
+ );
+ };
+ }) or return;
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
new file mode 100644
index 0000000..7868fb2
--- /dev/null
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm
@@ -0,0 +1,29 @@
+package KorAP::Index::OpenNLP::Sentences;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+ my $i = 0;
+
+ $$self->add_spandata(
+ foundry => 'opennlp',
+ layer => 'sentences',
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ ) or return;
+
+ $$self->stream->add_meta('s', '<i>' . $i);
+
+ return 1;
+};
+
+1;
diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
new file mode 100644
index 0000000..67994ac
--- /dev/null
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm
@@ -0,0 +1,48 @@
+package KorAP::Index::TreeTagger::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+
+ $$self->add_tokendata(
+ foundry => 'tree_tagger',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash->{fs}->{f};
+
+ my $found;
+
+ $content = ref $content ne 'ARRAY' ? [$content] : $content;
+
+ foreach my $fs (@$content) {
+ $content = $fs->{fs}->{f};
+ foreach (@$content) {
+
+ # lemma
+ if (($_->{-name} eq 'lemma') &&
+ ($found = $_->{'#text'}) &&
+ ($found ne 'UNKNOWN') &&
+ ($found ne '?')) {
+ $mtt->add(
+ term => 'tt_l:' . $found
+ );
+ };
+
+ # pos
+ if (($_->{-name} eq 'ctag') && ($found = $_->{'#text'})) {
+ $mtt->add(
+ term => 'tt_p:' . $found
+ );
+ };
+ };
+ };
+ }) or return;
+
+ return 1;
+};
+
+
+1;
diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
new file mode 100644
index 0000000..61c2a5e
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Constituency.pm
@@ -0,0 +1,83 @@
+package KorAP::Index::XIP::Constituency;
+use KorAP::Index::Base;
+use Set::Scalar;
+use v5.16;
+
+sub parse {
+ my $self = shift;
+
+ # Collect all spans and check for roots
+ my %xip_const;
+ my $xip_const_root = Set::Scalar->new;
+ my $xip_const_noroot = Set::Scalar->new;
+
+ # First run:
+ $$self->add_spandata(
+ foundry => 'xip',
+ layer => 'constituency',
+ encoding => 'xip',
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ $xip_const{$span->id} = $span;
+ $xip_const_root->insert($span->id);
+
+ my $rel = $span->hash->{rel} or return;
+ $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+ foreach (@$rel) {
+ if ($_->{-label} eq 'dominates' && $_->{-target}) {
+ $xip_const_noroot->insert($_->{-target});
+ };
+ };
+ }
+ ) or return;
+
+ my $stream = $$self->stream;
+
+ my $add_const = sub {
+ my $span = shift;
+ my $level = shift;
+ my $mtt = $stream->pos($span->p_start);
+
+ my $content = $span->hash;
+ my $f = $content->{fs}->{f};
+ return unless $f->{-name} eq 'const';
+
+ my $type = $f->{'#text'} or return;
+
+ # $type is now NPA, NP, NUM ...
+ my %term = (
+ term => '<>:xip_const:' . $type,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+
+ $term{payload} = '<s>' . $level if $level;
+
+ $mtt->add(%term);
+
+ my $this = __SUB__;
+
+ my $rel = $content->{rel} or return;
+ $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+ foreach (@$rel) {
+ next if $_->{-label} ne 'dominates' || !$_->{-target};
+ my $subspan = delete $xip_const{$_->{-target}} or return;
+ $this->($subspan, $level + 1);
+ };
+ };
+
+ my $diff = $xip_const_root->difference($xip_const_noroot);
+ foreach ($diff->members) {
+ my $obj = delete $xip_const{$_} or next;
+ $add_const->($obj, 0);
+ };
+
+ return 1;
+};
+
+
+1;
diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
new file mode 100644
index 0000000..ca889d8
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Dependency.pm
@@ -0,0 +1,58 @@
+package KorAP::Index::XIP::Dependency;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+
+ # TODO: Create XIP tree here - for indirect dependency
+ # >>:xip_d:SUBJ<i>566<i>789
+
+ $$self->add_tokendata(
+ foundry => 'xip',
+ layer => 'dependency',
+ encoding => 'xip',
+ cb => sub {
+ my ($stream, $token, $tokens) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash;
+
+ my $rel = $content->{rel};
+ $rel = [$rel] unless ref $rel eq 'ARRAY';
+
+ foreach (@$rel) {
+ my $label = $_->{-label};
+
+ if ($_->{-type} && $_->{-type} eq 'unary') {
+ $mtt->add(
+ term => 'xip_d:' . $label
+ );
+ }
+ else {
+
+ my $from = $_->{span}->{-from};
+ my $to = $_->{span}->{-to};
+
+ my $rel_token = $tokens->token($from, $to) or next;
+
+ # die $token->pos . ' -' . $label . '-> ' . $rel_token->pos;
+ $mtt->add(
+ term => '>:xip_d:' . $label,
+ payload => '<i>' . $rel_token->pos
+ );
+
+ $stream->pos($rel_token->pos)->add(
+ term => '<:xip_d:' . $label,
+ payload => '<i>' . $token->pos
+ );
+ };
+
+# print $label,"\n";
+ };
+ }) or return;
+
+ return 1;
+};
+
+
+1;
diff --git a/lib/KorAP/Index/XIP/Morpho.pm b/lib/KorAP/Index/XIP/Morpho.pm
new file mode 100644
index 0000000..d568afe
--- /dev/null
+++ b/lib/KorAP/Index/XIP/Morpho.pm
@@ -0,0 +1,61 @@
+package KorAP::Index::XIP::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+ my $self = shift;
+
+ $$self->add_tokendata(
+ foundry => 'xip',
+ layer => 'morpho',
+ encoding => 'xip',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash->{fs}->{f}->{fs}->{f};
+
+ my $found;
+
+ my $capital = 0;
+ foreach (@$content) {
+ # pos
+ if (($_->{-name} eq 'pos') && ($found = $_->{'#text'})) {
+ $mtt->add(
+ term => 'xip_p:' . $found
+ );
+
+ $capital = 1 if $found eq 'NOUN';
+ }
+ };
+
+ foreach (@$content) {
+ # lemma
+ if (($_->{-name} eq 'lemma') && ($found = $_->{'#text'})) {
+
+ # Verb delimiter (aus=druecken)
+ $found =~ tr/=//d;
+
+ # Composites
+ my (@token) = split('#', $found);
+
+ my $full = '';
+ foreach (@token) {
+ $full .= $_;
+ $_ =~ s{/\w+$}{};
+ $mtt->add(term => 'xip_l:' . $_);
+ };
+ if (@token > 1) {
+ $full =~ s{/}{}g;
+ $full = lc $full;
+ $full = $capital ? ucfirst($full) : $full;
+ $mtt->add(term => 'xip_l:' . $full);
+ };
+ };
+ };
+ }) or return;
+
+ return 1;
+};
+
+
+1;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 7038cc4..4920215 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -1,16 +1,18 @@
package KorAP::Tokenizer;
-
use Mojo::Base -base;
use Mojo::ByteStream 'b';
-use Carp qw/carp croak/;
+use Mojo::Loader;
+use Carp qw/croak/;
use KorAP::Tokenizer::Range;
use KorAP::Tokenizer::Match;
use KorAP::Tokenizer::Spans;
use KorAP::Tokenizer::Tokens;
-use KorAP::MultiTermTokenStream;
+use KorAP::Field::MultiTermTokenStream;
+use JSON::XS;
use Log::Log4perl;
-has [qw/path foundry layer doc stream should have/];
+has [qw/path foundry doc stream should have name/];
+has layer => 'Tokens';
has 'log' => sub {
Log::Log4perl->get_logger(__PACKAGE__)
@@ -21,8 +23,8 @@
my $self = shift;
# Create new token stream
- my $mtts = KorAP::MultiTermTokenStream->new;
- my $file = b($self->path . $self->foundry . '/' . ($self->layer // 'tokens') . '.xml')->slurp;
+ my $mtts = KorAP::Field::MultiTermTokenStream->new;
+ my $file = b($self->path . lc($self->foundry) . '/' . lc($self->layer) . '.xml')->slurp;
my $tokens = Mojo::DOM->new($file);
$tokens->xml(1);
@@ -121,13 +123,12 @@
my $cb = delete $param{cb};
- if ($param{encoding} && $param{encoding} eq 'bytes') {
- $param{primary} = $self->doc->primary;
- };
+ $param{primary} = $self->doc->primary;
my $spans = KorAP::Tokenizer::Spans->new(
path => $self->path,
range => $self->range,
+ match => $self->match,
%param
);
@@ -140,12 +141,11 @@
$self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
};
-
if ($cb) {
foreach (@$spanarray) {
- $cb->($self->stream, $_);
+ $cb->($self->stream, $_, $spans);
};
- return;
+ return 1;
};
return $spans;
};
@@ -165,12 +165,11 @@
my $cb = delete $param{cb};
- if ($param{encoding} && $param{encoding} eq 'bytes') {
- $param{primary} = $self->doc->primary;
- };
+ $param{primary} = $self->doc->primary;
my $tokens = KorAP::Tokenizer::Tokens->new(
path => $self->path,
+ range => $self->range,
match => $self->match,
%param
);
@@ -189,14 +188,35 @@
if ($cb) {
foreach (@$tokenarray) {
- $cb->($self->stream, $_);
+ $cb->($self->stream, $_, $tokens);
};
- return;
+ return 1;
};
return $tokens;
};
+sub add {
+ my $self = shift;
+ my $loader = Mojo::Loader->new;
+ my $foundry = shift;
+ my $layer = shift;
+ my $mod = 'KorAP::Index::' . $foundry . '::' . $layer;
+
+ if ($mod->can('new') || eval("require $mod; 1;")) {
+ if (my $retval = $mod->new($self)->parse(@_)) {
+ $self->support($foundry => $layer, @_);
+ return $retval;
+ };
+ }
+ else {
+ $self->log->error('Unable to load '.$mod . '(' . $@ . ')');
+ };
+
+ return;
+};
+
+
sub _perc {
if (@_ == 2) {
# '[' . $_[0] . '/' . $_[1] . ']' .
@@ -215,6 +235,73 @@
};
+sub support {
+ my $self = shift;
+ unless ($_[0]) {
+ return $self->{support} // {};
+ }
+ elsif (!$_[1]) {
+ return $self->{support}->{$_[0]} // []
+ };
+ my $f = lc shift;
+ my $l = lc shift;
+ my @info = @_;
+ $self->{support} //= {};
+ $self->{support}->{$f} //= [];
+ push(@{$self->{support}->{$f}}, [$l, @info]);
+};
+
+
+sub to_string {
+ my $self = shift;
+ my $primary = defined $_[0] ? $_[0] : 1;
+ my $string = "<meta>\n";
+ $string .= $self->doc->to_string;
+ $string .= "</meta>\n";
+ if ($primary) {
+ $string .= "<text>\n";
+ $string .= $self->doc->primary->data . "\n";
+ $string .= "</text>\n";
+ };
+ $string .= '<field name="' . $self->name . "\">\n";
+ $string .= "<info>\n";
+ $string .= 'tokenization = ' . $self->foundry . '#' . $self->layer . "\n";
+ foreach my $foundry (keys %{$self->support}) {
+ foreach (@{$self->support($foundry)}) {
+ $string .= 'support = ' . $foundry . '#' . join(',', @{$_}) . "\n";
+ };
+ };
+ $string .= "</info>\n";
+ $string .= $self->stream->to_string;
+ $string .= "</field>";
+ return $string;
+};
+
+sub to_data {
+ my $self = shift;
+ my $primary = defined $_[0] ? $_[0] : 1;
+ my %data;
+ $data{meta} = $self->doc->to_hash;
+ $data{primary} = $self->doc->primary->data if $primary;
+ $data{fields} = [ {
+ name => $self->name,
+ data => $self->stream->to_array,
+ tokenization => [lc($self->foundry), lc($self->layer)],
+ support => $self->support
+ }];
+ \%data;
+};
+
+sub to_json {
+ encode_json($_[0]->to_data($_[1]));
+};
+
+
+sub to_pretty_json {
+ JSON::XS->new->pretty->encode($_[0]->to_data($_[1]));
+};
+
+
1;
@@ -276,7 +363,7 @@
$tokens->stream->add_meta('adjCount', '<i>45');
-The L<KorAP::MultiTermTokenStream> object
+The L<KorAP::Field::MultiTermTokenStream> object
=head2 range
@@ -321,7 +408,7 @@
Add span information to the parsed token stream.
Expects a C<foundry> name, a C<layer> name and a
callback parameter, that will be called after each parsed
-span. The L<KorAP::MultiTermTokenStream> object will be passed,
+span. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
as well as the current L<KorAP::Tokenizer::Span>.
An optional parameter C<encoding> may indicate that the span offsets
@@ -351,7 +438,7 @@
Add token information to the parsed token stream.
Expects a C<foundry> name, a C<layer> name and a
callback parameter, that will be called after each parsed
-token. The L<KorAP::MultiTermTokenStream> object will be passed,
+token. The L<KorAP::Field::MultiTermTokenStream> object will be passed,
as well as the current L<KorAP::Tokenizer::Span>.
An optional parameter C<encoding> may indicate that the token offsets
diff --git a/lib/KorAP/Tokenizer/Match.pm b/lib/KorAP/Tokenizer/Match.pm
index e5a96f8..2a06aea 100644
--- a/lib/KorAP/Tokenizer/Match.pm
+++ b/lib/KorAP/Tokenizer/Match.pm
@@ -8,10 +8,21 @@
sub set {
$_[0]->{$_[1] . ':' . $_[2]} = $_[3];
+ $_[0]->{'[' . $_[1]} = $_[3];
+ $_[0]->{$_[2] . ']'} = $_[3];
};
sub lookup {
$_[0]->{$_[1] . ':' . $_[2]} // undef;
};
+sub startswith {
+ $_[0]->{'[' . $_[1]} // undef;
+};
+
+sub endswith {
+ $_[0]->{$_[1] . ']'} // undef;
+};
+
+
1;
diff --git a/lib/KorAP/Tokenizer/Range.pm b/lib/KorAP/Tokenizer/Range.pm
index c18136b..110fbc6 100644
--- a/lib/KorAP/Tokenizer/Range.pm
+++ b/lib/KorAP/Tokenizer/Range.pm
@@ -35,9 +35,9 @@
if ($found =~ /!(\d+):(\d+)$/) {
return $1 >= 0 ? $1 : 0;
- }
- else {
- return $found;
+# }
+# else {
+# return $found;
};
};
@@ -46,9 +46,9 @@
my $found = $$self->lookup( shift() );
if ($found =~ /!(\d+):(\d+)$/) {
return $2;
- }
- else {
- return $found;
+# }
+# else {
+# return $found;
};
};
diff --git a/lib/KorAP/Tokenizer/Span.pm b/lib/KorAP/Tokenizer/Span.pm
index 0010d7c..485e700 100644
--- a/lib/KorAP/Tokenizer/Span.pm
+++ b/lib/KorAP/Tokenizer/Span.pm
@@ -42,29 +42,33 @@
$_[0]->[4];
};
-
sub content {
if (defined $_[1]) {
$_[0]->[5] = $_[1];
}
else {
- if ($_[0]->processed) {
- return $_[0]->[5];
- }
- else {
- my $c = Mojo::DOM->new($_[0]->[5]);
- $c->xml(1);
- $_[0]->processed(1);
- return $_[0]->[5] = $c;
- };
+ return $_[0]->[5];
};
};
-sub processed {
- if (defined $_[1]) {
- $_[0]->[6] = $_[1] ? 1 : 0;
+sub dom {
+ if ($_[0]->[6]) {
+ return $_[0]->[6];
+ }
+ else {
+ my $c = Mojo::DOM->new($_[0]->[5]);
+ $c->xml(1);
+ return $_[0]->[6] = $c;
};
- $_[0]->[6];
+};
+
+sub hash {
+ if (defined $_[1]) {
+ $_[0]->[7] = $_[1];
+ }
+ else {
+ return $_[0]->[7];
+ };
};
1;
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
index 7e1a382..7eb31c0 100644
--- a/lib/KorAP/Tokenizer/Spans.pm
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -1,58 +1,43 @@
package KorAP::Tokenizer::Spans;
-use Mojo::Base -base;
+use Mojo::Base 'KorAP::Tokenizer::Units';
use KorAP::Tokenizer::Span;
use Mojo::DOM;
use Mojo::ByteStream 'b';
+use XML::Fast;
-has [qw/path foundry layer range primary should have/];
-has 'encoding' => 'utf-8';
+has 'range';
sub parse {
my $self = shift;
my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
- my $spans = Mojo::DOM->new($file);
- $spans->xml(1);
+ # my $spans = Mojo::DOM->new($file);
+ # $spans->xml(1);
+
+ # my $spans = XML::LibXML->load_xml(string => $file);
+
+ my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
my ($should, $have) = (0,0);
- my ($from, $to);
+ my ($from, $to, $h);
my @spans;
- $spans->find('span')->each(
- sub {
- my $s = shift;
+ my $p = $self->primary;
- $should++;
+ foreach my $s (@$spans) {
- if ($self->encoding eq 'bytes') {
- $from = $self->primary->bytes2chars($s->attr('from'));
- $to = $self->primary->bytes2chars($s->attr('to'));
- }
- else {
- $from = $s->attr('from');
- $to = $s->attr('to');
- };
+ $should++;
- return unless $to > $from;
+ my $span = $self->span(
+ $s->{-from},
+ $s->{-to},
+ $s
+ ) or next;
- my $span = KorAP::Tokenizer::Span->new;
+ $have++;
- $span->id($s->attr('id'));
- $span->o_start($from);
- $span->o_end($to);
- $span->p_start($self->range->after($span->o_start));
- $span->p_end($self->range->before($span->o_end));
-
- return unless $span->p_end >= $span->p_start;
-
- if (@{$s->children}) {
- $span->content($s->content_xml);
- };
-
- $have++;
-
- push(@spans, $span);
- });
+ push(@spans, $span);
+ };
$self->should($should);
$self->have($have);
diff --git a/lib/KorAP/Tokenizer/Token.pm b/lib/KorAP/Tokenizer/Token.pm
index f6c1971..fe12a25 100644
--- a/lib/KorAP/Tokenizer/Token.pm
+++ b/lib/KorAP/Tokenizer/Token.pm
@@ -14,14 +14,13 @@
$_[0]->[0];
};
+
sub content {
- if ($_[1]) {
+ if (defined $_[1]) {
$_[0]->[1] = $_[1];
}
else {
- my $c = Mojo::DOM->new($_[0]->[1]);
- $c->xml(1);
- return $c;
+ return $_[0]->[1];
};
};
@@ -34,4 +33,25 @@
};
};
+sub dom {
+ if ($_[0]->[3]) {
+ return $_[0]->[3];
+ }
+ else {
+ my $c = Mojo::DOM->new($_[0]->[1]);
+ $c->xml(1);
+ return $_[0]->[3] = $c;
+ };
+};
+
+sub hash {
+ if (defined $_[1]) {
+ $_[0]->[4] = $_[1];
+ }
+ else {
+ return $_[0]->[4];
+ };
+};
+
+
1;
diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
index 7657460..58ff6fa 100644
--- a/lib/KorAP/Tokenizer/Tokens.pm
+++ b/lib/KorAP/Tokenizer/Tokens.pm
@@ -1,56 +1,38 @@
package KorAP::Tokenizer::Tokens;
-use Mojo::Base -base;
+use Mojo::Base 'KorAP::Tokenizer::Units';
use Mojo::DOM;
use Mojo::ByteStream 'b';
use KorAP::Tokenizer::Token;
+use Carp qw/croak carp/;
+use XML::Fast;
-has [qw/path foundry layer match primary should have/];
-has 'encoding' => 'utf-8';
sub parse {
my $self = shift;
my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
- my $spans = Mojo::DOM->new($file);
- $spans->xml(1);
+# my $spans = Mojo::DOM->new($file);
+# $spans->xml(1);
+ my $spans = xml2hash($file, text => '#text', attr => '-')->{layer}->{spanList}->{span};
my ($should, $have) = (0,0);
- my ($from, $to);
-
- my $match = $self->match;
my @tokens;
- $spans->find('span')->each(
- sub {
- my $s = shift;
- $should++;
+ foreach my $s (@$spans) {
- if ($self->encoding eq 'bytes') {
- $from = $self->primary->bytes2chars($s->attr('from'));
- $to = $self->primary->bytes2chars($s->attr('to'));
- }
- else {
- $from = $s->attr('from');
- $to = $s->attr('to');
- };
+ $should++;
- my $pos = $match->lookup($from, $to);
+ my $token = $self->token(
+ $s->{-from},
+ $s->{-to},
+ $s
+ ) or next;
- return unless defined $pos;
+ $have++;
- my $token = KorAP::Tokenizer::Token->new;
- $token->id($s->attr('id'));
- $token->pos($pos);
-
- if (@{$s->children}) {
- $token->content($s->content_xml);
- };
-
- $have++;
-
- push(@tokens, $token);
- });
+ push(@tokens, $token);
+ };
$self->should($should);
$self->have($have);
diff --git a/lib/KorAP/Tokenizer/Units.pm b/lib/KorAP/Tokenizer/Units.pm
new file mode 100644
index 0000000..c04ebaf
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Units.pm
@@ -0,0 +1,89 @@
+package KorAP::Tokenizer::Units;
+use KorAP::Tokenizer::Span;
+use KorAP::Tokenizer::Token;
+use Mojo::Base -base;
+
+has [qw/path foundry layer match range primary should have/];
+has 'encoding' => 'utf-8';
+
+sub span {
+ my $self = shift;
+ my ($from, $to, $s) = @_;
+
+ ($from, $to) = $self->_offset($from, $to);
+
+ return unless $to > $from;
+
+ my $span = KorAP::Tokenizer::Span->new;
+
+ $span->id($s->{-id}) if $s && $s->{-id};
+
+ $span->o_start($from);
+ $span->o_end($to);
+
+ my $start = $self->match->startswith($span->o_start);
+
+ unless (defined $start) {
+ $start = $self->range->after($span->o_start) or return;
+ };
+
+ $span->p_start($start);
+
+ my $end = $self->match->endswith($span->o_end);
+ unless ($end) {
+ $end = $self->range->before($span->o_end) or return;
+ };
+ $span->p_end($end);
+
+ return unless $span->p_end >= $span->p_start;
+
+ $span->hash($s) if $s;
+
+ return $span;
+};
+
+sub token {
+ my $self = shift;
+ my ($from, $to, $s) = @_;
+
+ ($from, $to) = $self->_offset($from, $to);
+
+ my $pos = $self->match->lookup($from, $to);
+
+ return unless defined $pos;
+
+# if ($from == $to) {
+# print "Unable to find match for $from - $to (resp ".$s->{-from} . '-' . $s->{-to}.") " . $s->{-id};
+# print "\n";
+# };
+
+ my $token = KorAP::Tokenizer::Token->new;
+ $token->id($s->{-id}) if $s && $s->{-id};
+ $token->pos($pos);
+
+ $token->hash($s) if $s;
+
+ return $token;
+};
+
+
+sub _offset {
+ my $self = shift;
+ my ($from, $to) = @_;
+
+ if ($self->encoding) {
+ my $p = $self->primary;
+ if ($self->encoding eq 'bytes') {
+ $from = $p->bytes2chars($from);
+ $to = $p->bytes2chars($to);
+ }
+ elsif ($self->encoding eq 'xip') {
+ $from = $p->xip2chars($from);
+ $to = $p->xip2chars($to);
+ };
+ };
+
+ return ($from, $to);
+};
+
+1;
diff --git a/script/log4perl.conf b/script/log4perl.conf
deleted file mode 100644
index 90459dd..0000000
--- a/script/log4perl.conf
+++ /dev/null
@@ -1,5 +0,0 @@
-log4perl.rootLogger=TRACE, STDERR
-
-log4perl.appender.STDERR. = Log::Log4perl::Appender::ScreenColoredLevels
-log4perl.appender.STDERR.layout=PatternLayout
-log4perl.appender.STDERR.layout.ConversionPattern=[%r] %F %L %c - %m%n
\ No newline at end of file
diff --git a/script/prepare_index.pl b/script/prepare_index.pl
index f0dc398..2b98cfa 100644
--- a/script/prepare_index.pl
+++ b/script/prepare_index.pl
@@ -1,405 +1,195 @@
#!/usr/bin/env perl
use strict;
use warnings;
-use v5.16;
use lib 'lib', '../lib';
-use Set::Scalar;
-use Mojo::DOM;
-use Mojo::Util qw/encode decode/;
-use Mojo::ByteStream 'b';
-
+use Getopt::Long;
+use Benchmark qw/:hireswallclock/;
+use IO::Compress::Gzip qw/$GzipError/;
use Log::Log4perl;
-Log::Log4perl->init("script/log4perl.conf");
-
use KorAP::Document;
use KorAP::Tokenizer;
+our $VERSION = 0.01;
+
+# Merges foundry data to create indexer friendly documents
+# ndiewald, 2013/11/05
+
+sub printhelp {
+ print <<'EOHELP';
+
+Merge foundry data based on a tokenization and create indexer friendly documents.
+
+Call:
+prepare_index.pl -z --input <directory> --outputfile <filename>
+
+--input|-i <directory> Directory of the document to index
+--output|-o <filename> Document name for output (optional),
+ Writes to <STDOUT> by default
+--token|-t <foundry>[#<layer>] Define the default tokenization by specifying
+ the name of the foundry and optionally the name
+ of the layer. Defaults to OpenNLP#tokens.
+--skip|-s <foundry>[#<layer>] Skip specific foundries by specifying the name
+ or specific layers by defining the name
+ with a # in front of the foundry,
+ e.g. Mate#Morpho. Alternatively you can skip #ALL.
+ Can be set multiple times.
+--allow|-a <foundry>#<layer> Allow specific foundries and layers by defining them
+ combining the foundry name with a # and the layer name.
+--primary|-p Output primary data or not. Defaults to true.
+ Can be flagged using --no-primary as well.
+--human|-m Represent the data human friendly,
+ while the output defaults to JSON
+--pretty|-y Pretty print json output
+--gzip|-z Compress the output
+ (expects a defined output file)
+--log|-l The Log4perl log level, defaults to ERROR.
+--help|-h Print this document (optional)
+
+diewald@ids-mannheim.de, 2013/11/04
+
+EOHELP
+ exit(defined $_[0] ? $_[0] : 0);
+};
+
+# Options from the command line
+my ($input, $output, $text, $gzip, $log_level, @skip, $token_base, $primary, @allow, $pretty);
+GetOptions(
+ 'input|x=s' => \$input,
+ 'output|o=s' => \$output,
+ 'human|m' => \$text,
+ 'token|t=s' => \$token_base,
+ 'gzip|z' => \$gzip,
+ 'skip|s=s' => \@skip,
+ 'log|l=s' => \$log_level,
+ 'allow|a=s' => \@allow,
+ 'primary|p!' => \$primary,
+ 'pretty|y' => \$pretty,
+ 'help|h' => sub { printhelp }
+);
+
+printhelp(1) if !$input || ($gzip && !$output);
+
+$log_level //= 'ERROR';
+
+my %skip;
+$skip{lc($_)} = 1 foreach @skip;
+
+Log::Log4perl->init({
+ 'log4perl.rootLogger' => uc($log_level) . ', STDERR',
+ 'log4perl.appender.STDERR' => 'Log::Log4perl::Appender::ScreenColoredLevels',
+ 'log4perl.appender.STDERR.layout' => 'PatternLayout',
+ 'log4perl.appender.STDERR.layout.ConversionPattern' => '[%r] %F %L %c - %m%n'
+});
+
+my $log = Log::Log4perl->get_logger('main');
+
+BEGIN {
+ $main::TIME = Benchmark->new;
+ $main::LAST_STOP = Benchmark->new;
+};
+
+sub stop_time {
+ my $new = Benchmark->new;
+ $log->trace(
+ 'The code took: '.
+ timestr(timediff($new, $main::LAST_STOP)) .
+ ' (overall: ' . timestr(timediff($new, $main::TIME)) . ')'
+ );
+ $main::LAST_STOP = $new;
+};
# Call perl script/prepare_index.pl WPD/AAA/00001
-sub parse_doc {
- my $doc = KorAP::Document->new(
- path => shift . '/'
- );
-
- $doc->parse;
-
- my $tokens = KorAP::Tokenizer->new(
- path => $doc->path,
- doc => $doc,
- foundry => 'connexor',
- layer => 'tokens'
- );
-
- $tokens->parse;
-
- my $i = 0;
- $tokens->add_spandata(
- foundry => 'connexor',
- layer => 'sentences',
- #skip => 1,
- cb => sub {
- my ($stream, $span) = @_;
- my $mtt = $stream->pos($span->p_start);
- $mtt->add(
- term => '<>:s',
- o_start => $span->o_start,
- o_end => $span->o_end,
- p_end => $span->p_end
- );
- $i++;
- }
- );
-
- $tokens->stream->add_meta('s', '<i>' . $i);
-
- $i = 0;
- $tokens->add_spandata(
- foundry => 'base',
- layer => 'paragraph',
- #skip => 1,
- cb => sub {
- my ($stream, $span) = @_;
- my $mtt = $stream->pos($span->p_start);
- $mtt->add(
- term => '<>:p',
- o_start => $span->o_start,
- o_end => $span->o_end,
- p_end => $span->p_end
- );
- $i++;
- }
- );
- $tokens->stream->add_meta('p', '<i>' . $i);
-
- $tokens->add_tokendata(
- foundry => 'opennlp',
- layer => 'morpho',
- #skip => 1,
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
- my $content = $token->content;
-
- my $found;
-
- # syntax
- if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
- $mtt->add(
- term => 'opennlp_p:' . $found
- );
- };
- });
+# Create and parse new document
+$input =~ s{([^/])$}{$1/};
+my $doc = KorAP::Document->new( path => $input );
+$doc->parse;
- my $model = 'ne_dewac_175m_600';
- $tokens->add_tokendata(
- foundry => 'corenlp',
- #skip => 1,
- layer => $model,
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
- my $content = $token->content;
+my ($token_base_foundry, $token_base_layer) = (qw/OpenNLP Tokens/);
+if ($token_base) {
+ ($token_base_foundry, $token_base_layer) = split /#/, $token_base;
+};
- my $found;
+# Get tokenization
+my $tokens = KorAP::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => $token_base_foundry,
+ layer => $token_base_layer,
+ name => 'tokens'
+);
+$tokens->parse;
- if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
- $mtt->add(
- term => 'corenlp_' . $model . ':' . $found
- );
- };
- });
+my @layers;
- $model = 'ne_hgc_175m_600';
- $tokens->add_tokendata(
- foundry => 'corenlp',
- #skip => 1,
- layer => $model,
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
- my $content = $token->content;
+# Base information
+push(@layers, ['OpenNLP', 'Sentences']);
+push(@layers, ['Base', 'Paragraphs']);
- my $found;
+# OpenNLP
+push(@layers, ['OpenNLP', 'Morpho']);
- if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
- $mtt->add(
- term => 'corenlp_' . $model . ':' . $found
- );
- };
- });
+# CoreNLP
+push(@layers, ['CoreNLP', 'NamedEntities', 'ne_dewac_175m_600']);
+push(@layers, ['CoreNLP', 'NamedEntities', 'ne_hgc_175m_600']);
- $tokens->add_tokendata(
- foundry => 'connexor',
- layer => 'morpho',
- #skip => 1,
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
- my $content = $token->content;
+# Connexor
+push(@layers, ['Connexor', 'Morpho']);
+push(@layers, ['Connexor', 'Syntax']);
+push(@layers, ['Connexor', 'Phrase']);
- my $found;
+# TreeTagger
+push(@layers, ['TreeTagger', 'Morpho']);
- # Lemma
- if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
- if (index($found, "\N{U+00a0}") >= 0) {
- $found = b($found)->decode;
- foreach (split(/\x{00A0}/, $found)) {
- $mtt->add(
- term => 'cnx_l:' . b($_)->encode
- );
- }
- }
- else {
- $mtt->add(
- term => 'cnx_l:' . $found # b($found)->encode
- );
- };
- };
+# Mate
+push(@layers, ['Mate', 'Morpho']);
+push(@layers, ['Mate', 'Dependency']);
- # POS
- if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
- $mtt->add(
- term => 'cnx_p:' . $found
- );
- };
-
- # MSD
- # Todo: Look in the description!
- if (($found = $content->at('f[name="msd"]')) && ($found = $found->text)) {
- foreach (split(':', $found)) {
- $mtt->add(
- term => 'cnx_m:' . $_
- );
- };
- };
- }
- );
-
- $tokens->add_tokendata(
- foundry => 'connexor',
- layer => 'syntax',
- #skip => 1,
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
- my $content = $token->content;
-
- my $found;
-
- # syntax
- if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
- $mtt->add(
- term => 'cnx_syn:' . $found
- );
- };
- });
-
- $tokens->add_spandata(
- foundry => 'connexor',
- layer => 'phrase',
- #skip => 1,
- cb => sub {
- my ($stream, $span) = @_;
-
- my $type = $span->content->at('f[name=pos]');
- if ($type && ($type = $type->text)) {
- my $mtt = $stream->pos($span->p_start);
- $mtt->add(
- term => '<>:cnx_const:' . $type,
- o_start => $span->o_start,
- o_end => $span->o_end,
- p_end => $span->p_end
- );
- };
- }
- );
-
- $tokens->add_tokendata(
- foundry => 'tree_tagger',
- #skip => 1,
- layer => 'morpho',
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
- my $content = $token->content;
-
- my $found;
-
- # lemma
- if (($found = $content->at('f[name="lemma"]')) &&
- ($found = $found->text) && $found ne 'UNKNOWN') {
- $mtt->add(
- term => 'tt_l:' . $found
- );
- };
-
- # pos
- if (($found = $content->at('f[name="ctag"]')) && ($found = $found->text)) {
- $mtt->add(
- term => 'tt_p:' . $found
- );
- };
- });
-
- $tokens->add_tokendata(
- foundry => 'mate',
- layer => 'morpho',
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
- my $content = $token->content;
-
- my $found;
-
- my $capital = 0;
-
- # pos
- if (($found = $content->at('f[name="pos"]')) &&
- ($found = $found->text)) {
- $mtt->add(term => 'mate_p:' . $found
- );
- };
-
- # lemma
- if (($found = $content->at('f[name="lemma"]'))
- && ($found = $found->text)
- && $found ne '--') {
- $mtt->add(term => 'mate_l:' . b($found)->decode('latin-1')->encode->to_string);
- };
-
- # MSD
- if (($found = $content->at('f[name="msd"]')) &&
- ($found = $found->text) &&
- ($found ne '_')) {
- foreach (split '\|', $found) {
- my ($x, $y) = split "=", $_;
- # case, tense, number, mood, person, degree, gender
- $mtt->add(term => 'mate_m:' . $x . ':' . $y);
- };
- };
- });
+# XIP
+push(@layers, ['XIP', 'Morpho']);
+push(@layers, ['XIP', 'Constituency']);
+push(@layers, ['XIP', 'Dependency']);
- $tokens->add_tokendata(
- foundry => 'xip',
- #skip => 1,
- layer => 'morpho',
- encoding => 'bytes',
- cb => sub {
- my ($stream, $token) = @_;
- my $mtt = $stream->pos($token->pos);
- my $content = $token->content;
-
- my $found;
-
- my $capital = 0;
- # pos
- if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
- $mtt->add(
- term => 'xip_p:' . $found
- );
-
- $capital = 1 if $found eq 'NOUN';
- };
-
- # lemma
- if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
- my (@token) = split('#', $found);
-
- my $full = '';
- foreach (@token) {
- $full .= $_;
- $_ =~ s{/\w+$}{};
- $mtt->add(term => 'xip_l:' . $_);
- };
- if (@token > 1) {
- $full =~ s{/}{}g;
- $full = lc $full;
- $full = $capital ? ucfirst($full) : $full;
- $mtt->add(term => 'xip_l:' . $full);
- };
- };
- });
-
-
- # Collect all spans and check for roots
- my %xip_const;
- my $xip_const_root = Set::Scalar->new;
- my $xip_const_noroot = Set::Scalar->new;
-
- # First run:
- $tokens->add_spandata(
- foundry => 'xip',
- layer => 'constituency',
- encoding => 'bytes',
- #skip => 1,
- cb => sub {
- my ($stream, $span) = @_;
-
- $xip_const{$span->id} = $span;
- $xip_const_root->insert($span->id);
-
- $span->content->find('rel[label=dominates][target]')->each(
- sub {
- my $rel = shift;
- $xip_const_noroot->insert($rel->attr('target'));
- }
- );
- }
- );
-
- my $stream = $tokens->stream;
-
- my $add_const = sub {
- my $span = shift;
- my $level = shift;
- my $mtt = $stream->pos($span->p_start);
-
- my $content = $span->content;
- my $type = $content->at('f[name=const]');
- if ($type && ($type = $type->text)) {
- # $type is now NPA, NP, NUM
- my %term = (
- term => '<>:xip_const:' . $type,
- o_start => $span->o_start,
- o_end => $span->o_end,
- p_end => $span->p_end
- );
-
- $term{payload} = '<s>' . $level if $level;
-
- $mtt->add(%term);
-
- my $this = __SUB__;
-
- $content->find('rel[label=dominates][target]')->each(
- sub {
- my $subspan = delete $xip_const{$_[0]->attr('target')} or return;
- $this->($subspan, $level + 1);
- }
- );
+if ($skip{'#all'}) {
+ foreach (@allow) {
+ $tokens->add(split('#', $_));
+ stop_time;
+ };
+}
+else {
+ # Add to index file - respect skipping
+ foreach my $info (@layers) {
+ unless ($skip{lc($info->[0]) . '#' . lc($info->[1])}) {
+ $tokens->add(@$info);
+ stop_time;
};
};
+};
- my $diff = $xip_const_root->difference($xip_const_noroot);
- foreach ($diff->members) {
- my $obj = delete $xip_const{$_} or next;
- $add_const->($obj, 0);
+my $file;
+
+my $print_text = $text ? $tokens->to_string($primary) : ($pretty ? $tokens->to_pretty_json($primary) : $tokens->to_json($primary));
+
+if ($output) {
+ if ($gzip) {
+ $file = IO::Compress::Gzip->new($output, Minimal => 1);
+ }
+ else {
+ $file = IO::File->new($output, "w");
};
- # Todo: Add mate-morpho
- # Todo: Add mate-dependency
- # Todo: Add xip-dependency
+# binmode $file, ':utf8';
- print $tokens->stream->to_string;
+ $file->print($print_text);
+ $file->close;
+}
+else {
+# binmode STDOUT, ':utf8';
+ print $print_text . "\n";
};
-if ($ARGV[0]) {
- parse_doc($ARGV[0]);
-};
-
-
+stop_time;
__END__
diff --git a/t/primary.t b/t/primary.t
new file mode 100644
index 0000000..831124e
--- /dev/null
+++ b/t/primary.t
@@ -0,0 +1,88 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use Test::More;
+use Mojo::ByteStream 'b';
+use utf8;
+use lib 'lib', '../lib';
+
+use_ok('KorAP::Document::Primary');
+
+my $t = "Der März ging vorüber und demnächst würde es Herbstblätter regnen.";
+
+ok(my $p = KorAP::Document::Primary->new($t), 'Constructor');
+
+is($p->data_length, 66, 'Text has correct length');
+
+is($p->data, $t, 'Text is identical');
+is($p->data(0,3), 'Der', 'Text is identical');
+is($p->data(4,8), 'März', 'Text is identical');
+is($p->data(26,35), 'demnächst', 'Text is identical');
+
+is($p->data_bytes(0,3), 'Der', 'Text is identical');
+is($p->data_bytes(4,9), 'März', 'Text is identical');
+is($p->data_bytes(28,38), 'demnächst', 'Text is identical');
+
+is($p->bytes2chars(4), 4, 'Byte offset matches');
+is($p->bytes2chars(9), 8, 'Byte offset matches');
+is($p->bytes2chars(28), 26, 'Byte offset matches');
+is($p->bytes2chars(38), 35, 'Byte offset matches');
+
+is(
+ $p->data(
+ $p->bytes2chars(17),
+ $p->bytes2chars(45)
+ ),
+ $p->data_bytes(17,45),
+ 'Text is identical'
+);
+
+$t = 'Er dächte, daß dies „für alle Elemente gilt“.';
+
+ok($p = KorAP::Document::Primary->new($t), 'Constructor');
+
+is($p->data_length, 45, 'Text has correct length');
+
+is($p->data, $t, 'Text is identical');
+is($p->data(0,2), 'Er', 'Text is identical');
+is($p->data(3,9), 'dächte', 'Text is identical');
+is($p->data(21,24), 'für', 'Text is identical');
+is($p->data(20,21), '„', 'Text is identical');
+is($p->data(43,44), '“', 'Text is identical');
+is($p->data(44,45), '.', 'Text is identical');
+
+is($p->data_bytes(0,2), 'Er', 'Text is identical');
+is($p->bytes2chars(0),0, 'b2c correct');
+is($p->bytes2chars(2),2, 'b2c correct');
+is($p->data_bytes(3,10), 'dächte', 'Text is identical');
+is($p->bytes2chars(3),3, 'b2c correct');
+is($p->bytes2chars(10),9, 'b2c correct');
+is($p->data_bytes(25,29), 'für', 'Text is identical');
+is($p->bytes2chars(25),21, 'b2c correct');
+is($p->bytes2chars(29),24, 'b2c correct');
+is($p->data_bytes(22,25), '„', 'Text is identical');
+is($p->bytes2chars(22),20, 'b2c correct');
+is($p->bytes2chars(25),21, 'b2c correct');
+is($p->data_bytes(48,51), '“', 'Text is identical');
+is($p->bytes2chars(48),43, 'b2c correct');
+is($p->bytes2chars(51),44, 'b2c correct');
+is($p->data_bytes(51,52), '.', 'Text is identical');
+is($p->bytes2chars(52),45, 'b2c correct');
+
+is(
+ $p->data(
+ $p->bytes2chars(17),
+ $p->bytes2chars(45)
+ ),
+ $p->data_bytes(17,45),
+ 'Text is identical'
+);
+
+
+#ok($p = KorAP::Document::Primary->new($t), 'Constructor');
+is($p->xip2chars(0), 0, 'Fine');
+is($p->xip2chars(7), 6, 'Fine');
+#diag $p->data($p->latinbytes2chars(3),$p->latinbytes2chars(9));
+
+
+done_testing;