Lucene field indexer written in Perl
diff --git a/Makefile.PL b/Makefile.PL
new file mode 100644
index 0000000..b076284
--- /dev/null
+++ b/Makefile.PL
@@ -0,0 +1,27 @@
+#!/usr/bin/env perl
+use v5.16;
+use strict;
+use warnings;
+use ExtUtils::MakeMaker;
+
+WriteMakefile(
+ NAME => 'KorAP::Bundle',
+ AUTHOR => 'Nils Diewald',
+ ABSTRACT => 'Perl Implementation for Generating Multifoundry Lucene Indices',
+ VERSION_FROM => 'lib/KorAP/Bundle.pm',
+ PREREQ_PM => {
+ 'Mojolicious' => 4.51,
+ 'Packed::Array' => 0.01,
+ 'Array::IntSpan' => 0,
+ 'Set::Scalar' => 0,
+ 'Log::Log4perl' => 1.42,
+ 'Carp' => 0,
+ 'strict' => 0,
+ 'warnings' => 0,
+ 'utf8' => 0,
+ 'bytes' => 0
+ },
+# LICENSE => 'perl',
+ MIN_PERL_VERSION => '5.016',
+ test => {
+ TESTS => 't/*.t'
+ }
+);
diff --git a/lib/KorAP/Bundle.pm b/lib/KorAP/Bundle.pm
new file mode 100644
index 0000000..43997e3
--- /dev/null
+++ b/lib/KorAP/Bundle.pm
@@ -0,0 +1,5 @@
+package KorAP::Bundle;
+
+our $VERSION = 0.01;
+
+1;
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
new file mode 100644
index 0000000..8f80e6c
--- /dev/null
+++ b/lib/KorAP/Document.pm
@@ -0,0 +1,250 @@
+package KorAP::Document;
+use Mojo::Base -base;
+use v5.16;
+
+use Mojo::ByteStream 'b';
+use Mojo::DOM;
+use Carp qw/croak carp/;
+use KorAP::Document::Primary;
+
+has [qw/id corpus_id path/];
+has [qw/pub_date title sub_title pub_place/];
+
+# parse document
+sub parse {
+ my $self = shift;
+ my $file = b($self->path . 'data.xml')->slurp;
+
+ state $unable = 'Unable to parse document';
+
+ carp 'Parse document ' . $self->path;
+
+ my $dom = Mojo::DOM->new($file);
+
+ my $rt = $dom->at('raw_text');
+
+ # Get document id and corpus id
+ if ($rt && $rt->attr('docid')) {
+ $self->id($rt->attr('docid'));
+ if ($self->id =~ /^([^_]+)_/) {
+ $self->corpus_id($1);
+ }
+ else {
+ croak $unable;
+ };
+ }
+ else {
+ croak $unable;
+ };
+
+ # Get primary data
+ my $pd = $rt->at('text');
+ if ($pd) {
+
+ $pd = b($pd->text)->decode;
+ $self->{pd} = KorAP::Document::Primary->new($pd->to_string);
+ }
+ else {
+ croak $unable;
+ };
+
+ # Get meta data
+ $self->_parse_meta;
+ return 1;
+};
+
+
+# Primary data
+sub primary {
+ $_[0]->{pd};
+};
+
+sub author {
+ my $self = shift;
+
+ # Set authors
+ if ($_[0]) {
+ return $self->{authors} = [
+ grep { $_ !~ m{^\s*u\.a\.\s*$} } split(/;\s+/, shift())
+ ];
+ }
+ return ($self->{authors} // []);
+};
+
+sub text_class {
+ my $self = shift;
+ if ($_[0]) {
+ return $self->{topics} = [ @_ ];
+ };
+ return ($self->{topics} // []);
+};
+
+
+
+sub _parse_meta {
+ my $self = shift;
+
+ my $file = b($self->path . 'header.xml')->slurp->decode('iso-8859-1')->encode;
+
+ my $dom = Mojo::DOM->new($file);
+ my $monogr = $dom->at('monogr');
+
+ # Get title
+ my $title = $monogr->at('h\.title[type=main]');
+ $self->title($title->text) if $title;
+
+ # Get Subtitle
+ my $sub_title = $monogr->at('h\.title[type=sub]');
+ $self->sub_title($sub_title->text) if $sub_title;
+
+ # Get Author
+ my $author = $monogr->at('h\.author');
+ $self->author($author->all_text) if $author;
+
+ # Get pubDate
+ my $year = $dom->at("pubDate[type=year]");
+ $year = $year ? $year->text : 0;
+ my $month = $dom->at("pubDate[type=month]");
+ $month = $month ? $month->text : 0;
+ my $day = $dom->at("pubDate[type=day]");
+ $day = $day ? $day->text : 0;
+
+ my $date = $year ? ($year < 100 ? '20' . $year : $year) : '0000';
+ $date .= length($month) == 1 ? '0' . $month : $month;
+ $date .= length($day) == 1 ? '0' . $day : $day;
+
+ $self->pub_date($date);
+
+ # Get textClasses
+ my @topic;
+ $dom->find("textClass catRef")->each(
+ sub {
+ my ($ign, @ttopic) = split('\.', $_->attr('target'));
+ push(@topic, @ttopic);
+ }
+ );
+ $self->text_class(@topic);
+};
+
+
+1;
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+KorAP::Document
+
+
+=head1 SYNOPSIS
+
+ my $doc = KorAP::Document->new(
+ path => 'mydoc-1/'
+ );
+
+ $doc->parse;
+
+ print $doc->title;
+
+
+=head1 DESCRIPTION
+
+Parse the primary data and metadata of a document.
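+
+The document is expected to reside in its own directory,
+containing at least a C<data.xml> file with the primary text
+and a C<header.xml> file with the metadata, e.g.
+
+ mydoc-1/
+   data.xml
+   header.xml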
+
+
+=head1 ATTRIBUTES
+
+=head2 id
+
+ $doc->id(75476);
+ print $doc->id;
+
+The unique identifier of the document.
+
+
+=head2 corpus_id
+
+ $doc->corpus_id(4);
+ print $doc->corpus_id;
+
+The unique identifier of the corpus.
+
+
+=head2 path
+
+ $doc->path("example-004/");
+ print $doc->path;
+
+The path of the document.
+
+
+=head2 title
+
+ $doc->title("Der Name der Rose");
+ print $doc->title;
+
+The title of the document.
+
+
+=head2 sub_title
+
+ $doc->sub_title("Natürlich eine Handschrift");
+ print $doc->sub_title;
+
+The subtitle of the document.
+
+
+=head2 pub_place
+
+ $doc->pub_place("Rom");
+ print $doc->pub_place;
+
+The publication place of the document.
+
+
+=head2 pub_date
+
+ $doc->pub_place("19800404");
+ print $doc->pub_place;
+
+The publication date of the document,
+in the format "YYYYMMDD".
+
+
+=head2 primary
+
+ print $doc->primary->data(0,20);
+
+The L<KorAP::Document::Primary> object containing the primary data.
+
+
+=head2 author
+
+ $doc->author('Binks, Jar Jar; Luke Skywalker');
+ print $doc->author->[0];
+
+Set the author value as a semicolon-separated list of names or
+get an array reference of author names.
+
+=head2 text_class
+
+ $doc->text_class(qw/news sports/);
+ print $doc->text_class->[0];
+
+Set the text class as an array or get an array
+reference of text classes.
+
+
+=head1 METHODS
+
+=head2 parse
+
+ $doc->parse;
+
+Run the parsing process of the document.
+
+
+=cut
diff --git a/lib/KorAP/Document/Primary.pm b/lib/KorAP/Document/Primary.pm
new file mode 100644
index 0000000..52ca844
--- /dev/null
+++ b/lib/KorAP/Document/Primary.pm
@@ -0,0 +1,153 @@
+package KorAP::Document::Primary;
+use strict;
+use warnings;
+use Carp qw/croak/;
+use Mojo::ByteStream 'b';
+use feature 'state';
+use Packed::Array;
+
+
+# Constructor
+sub new {
+ my $class = shift;
+ bless [shift()], $class;
+};
+
+
+# Get the data as a substring
+sub data {
+ my $self = shift;
+ my ($from, $to) = @_;
+
+ return b(substr($self->[0], $from))->encode if $from && !$to;
+
+ return b($self->[0])->encode unless $to;
+
+ my $substr = substr($self->[0], $from, $to - $from);
+ if ($substr) {
+ return b($substr)->encode;
+ };
+ # encode 'UTF-8',
+ croak 'Unable to find substring';
+};
+
+
+# The length of the primary text in characters
+sub data_length {
+ my $self = shift;
+ return $self->[1] if $self->[1];
+ $self->[1] = length($self->[0]);
+ return $self->[1];
+};
+
+
+# Get correct offset
+sub bytes2chars {
+ my $self = shift;
+ unless ($self->[2]) {
+ $self->_calc_chars;
+ };
+ return $self->[2]->[shift];
+};
+
+
+# Calculate character offsets
+sub _calc_chars {
+ use bytes;
+
+ my $self = shift;
+ tie my @array, 'Packed::Array';
+
+ state $leading = pack( 'B8', '10000000' );
+ state $start = pack( 'B8', '01000000' );
+
+ my ($i, $j) = (0,0);
+ my $c;
+
+ # Init array
+ my $l = length($self->[0]);
+ $array[$l-1] = 0;
+
+ # Iterate over every character
+ while ($i < $l) {
+
+ # Get actual character
+ $c = substr($self->[0], $i, 1);
+
+ # store character position
+ $array[$i++] = $j;
+
+ # This is the start of a multibyte sequence
+ if (ord($c & $leading) && ord($c & $start)) {
+
+ # Get the next byte - expecting a following character
+ $c = substr($self->[0], $i, 1);
+
+ # Character is part of a multibyte
+ while (ord($c & $leading)) {
+
+ # Set count
+ $array[$i] = (ord($c & $start)) ? ++$j : $j;
+
+ # Get next character
+ $c = substr($self->[0], ++$i, 1);
+ };
+ };
+
+ $j++;
+ };
+
+ $self->[2] = \@array;
+};
+
+
+1;
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+KorAP::Document::Primary
+
+=head1 SYNOPSIS
+
+ my $text = KorAP::Document::Primary->new('Das ist mein Text');
+ print $text->data(2,5);
+ print $text->data_length;
+
+
+=head1 DESCRIPTION
+
+Represent textual data with annotated character and byte offsets.
+
+
+=head1 ATTRIBUTES
+
+=head2 data_length
+
+ print $text->data_length;
+
+The textual length in number of characters.
+
+
+=head1 METHODS
+
+=head2 data
+
+ print $text->data;
+ print $text->data(4);
+ print $text->data(5,17);
+
+Return the textual data as a substring. Accepts a start and an end character
+offset. The data will be wrapped in a UTF-8 encoded L<Mojo::ByteStream>.
+
+=head2 bytes2chars
+
+ print $text->bytes2chars(40);
+
+Calculate the character offset for a given byte offset.
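+
+For example (illustrative values): in the UTF-8 encoded string
+C<"Bäume"> the character C<ä> occupies two bytes, so
+
+ my $text = KorAP::Document::Primary->new('Bäume');
+ print $text->bytes2chars(3);
+
+would print C<2>, the character offset of the C<u>.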
+
+=cut
diff --git a/lib/KorAP/MultiTerm.pm b/lib/KorAP/MultiTerm.pm
new file mode 100644
index 0000000..9dd12a9
--- /dev/null
+++ b/lib/KorAP/MultiTerm.pm
@@ -0,0 +1,37 @@
+package KorAP::MultiTerm;
+use Mojo::Base -base;
+
+has [qw/p_start p_end o_start o_end term payload/];
+has store_offsets => 1;
+
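+# Serialize the term, roughly following the pattern
+# term[#o_start-o_end][$<i>p_end][payload]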
+sub to_string {
+ my $self = shift;
+ my $string = $self->term;
+ if (defined $self->o_start) {
+ $string .= '#' .$self->o_start .'-' . $self->o_end;
+# }
+# elsif (!$self->storeOffsets) {
+# $string .= '#-';
+ };
+
+ if ($self->p_end || $self->payload) {
+ $string .= '$';
+ if ($self->p_end) {
+ $string .= '<i>' . $self->p_end;
+ };
+ if ($self->payload) {
+ if (index($self->payload, '<') == 0) {
+ $string .= $self->payload;
+ }
+ else {
+ $string .= '<?>' . $self->payload;
+ };
+ };
+ };
+
+ return $string;
+};
+
+1;
diff --git a/lib/KorAP/MultiTermToken.pm b/lib/KorAP/MultiTermToken.pm
new file mode 100644
index 0000000..9df9811
--- /dev/null
+++ b/lib/KorAP/MultiTermToken.pm
@@ -0,0 +1,34 @@
+package KorAP::MultiTermToken;
+use KorAP::MultiTerm;
+use Mojo::Base -base;
+
+has [qw/o_start o_end/];
+
+sub add {
+ my $self = shift;
+ my $mt;
+ unless (ref $_[0] eq 'KorAP::MultiTerm') {
+ if (@_ == 1) {
+ $mt = KorAP::MultiTerm->new(term => shift());
+ }
+ else {
+ $mt = KorAP::MultiTerm->new(@_);
+ };
+ }
+ else {
+ $mt = shift;
+ };
+ $self->{mt} //= [];
+ push(@{$self->{mt}}, $mt);
+ return $mt;
+};
+
+sub to_string {
+ my $self = shift;
+ my $string = '[(' . $self->o_start . '-'. $self->o_end . ')';
+ $string .= join ('|', map($_->to_string, @{$self->{mt}}));
+ $string .= ']';
+ return $string;
+};
+
+1;
diff --git a/lib/KorAP/MultiTermTokenStream.pm b/lib/KorAP/MultiTermTokenStream.pm
new file mode 100644
index 0000000..6d7dc29
--- /dev/null
+++ b/lib/KorAP/MultiTermTokenStream.pm
@@ -0,0 +1,33 @@
+package KorAP::MultiTermTokenStream;
+use Mojo::Base -base;
+use KorAP::MultiTermToken;
+
+has [qw/o_start o_end/];
+
+sub add {
+ my $self = shift;
+ my $mtt = shift // KorAP::MultiTermToken->new;
+ $self->{mtt} //= [];
+ push(@{$self->{mtt}}, $mtt);
+ return $mtt;
+};
+
+sub add_meta {
+ my $self = shift;
+ my $mt = $self->pos(0)->add('-:' . shift);
+ $mt->payload(shift);
+ $mt->store_offsets(0);
+};
+
+sub pos {
+ my $self = shift;
+ my $pos = shift;
+ return $self->{mtt}->[$pos];
+};
+
+sub to_string {
+ my $self = shift;
+ return join("\n" , map { $_->to_string } @{$self->{mtt}}) . "\n";
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
new file mode 100644
index 0000000..7038cc4
--- /dev/null
+++ b/lib/KorAP/Tokenizer.pm
@@ -0,0 +1,362 @@
+package KorAP::Tokenizer;
+
+use Mojo::Base -base;
+use Mojo::ByteStream 'b';
+use Mojo::DOM;
+use Carp qw/carp croak/;
+use KorAP::Tokenizer::Range;
+use KorAP::Tokenizer::Match;
+use KorAP::Tokenizer::Spans;
+use KorAP::Tokenizer::Tokens;
+use KorAP::MultiTermTokenStream;
+use Log::Log4perl;
+
+has [qw/path foundry layer doc stream should have/];
+
+has 'log' => sub {
+ Log::Log4perl->get_logger(__PACKAGE__)
+};
+
+# Parse tokens of the document
+sub parse {
+ my $self = shift;
+
+ # Create new token stream
+ my $mtts = KorAP::MultiTermTokenStream->new;
+ my $file = b($self->path . $self->foundry . '/' . ($self->layer // 'tokens') . '.xml')->slurp;
+ my $tokens = Mojo::DOM->new($file);
+ $tokens->xml(1);
+
+ my $doc = $self->doc;
+
+ my ($should, $have) = (0, 0);
+
+ # Create range and match objects
+ my $range = KorAP::Tokenizer::Range->new;
+ my $match = KorAP::Tokenizer::Match->new;
+
+ my $old = 0;
+
+ $self->log->trace('Tokenize data ' . $self->foundry . ':' . $self->layer);
+
+ # Iterate over all tokens
+ $tokens->find('span')->each(
+ sub {
+ my $span = $_;
+ my $from = $span->attr('from');
+ my $to = $span->attr('to');
+ my $token = $doc->primary->data($from, $to);
+
+ $should++;
+
+ # Ignore non-word tokens
+ return if $token !~ /[\w\d]/;
+
+ my $mtt = $mtts->add;
+
+ # Add gap for later finding matching positions before or after
+ $range->gap($old, $from, $have) unless $old >= $from;
+
+ # Add surface term
+ $mtt->add('s:' . $token);
+
+ # Add case insensitive term
+ $mtt->add('i:' . lc $token);
+
+ # Add offset information
+ $mtt->o_start($from);
+ $mtt->o_end($to);
+
+ # Store offset information for position matching
+ $range->set($from, $to, $have);
+ $match->set($from, $to, $have);
+
+ $old = $to + 1;
+
+ # Add position term
+ $mtt->add('_' . $have . '#' . $mtt->o_start . '-' . $mtt->o_end);
+
+ $have++;
+ });
+
+ # Add token count
+ $mtts->add_meta('t', '<i>' . $have);
+
+ $range->gap($old, $doc->primary->data_length, $have-1) if $doc->primary->data_length >= $old;
+
+ # Add info
+ $self->stream($mtts);
+ $self->{range} = $range;
+ $self->{match} = $match;
+ $self->should($should);
+ $self->have($have);
+
+ $self->log->debug('With a non-word quota of ' . _perc($self->should, $self->should - $self->have) . ' %');
+};
+
+
+# Get span positions through character offsets
+sub range {
+ return shift->{range} // KorAP::Tokenizer::Range->new;
+};
+
+
+# Get token positions through character offsets
+sub match {
+ return shift->{match} // KorAP::Tokenizer::Match->new;
+};
+
+
+# Add information of spans to the tokens
+sub add_spandata {
+ my $self = shift;
+ my %param = @_;
+
+ croak 'No token data available' unless $self->stream;
+
+ $self->log->trace(
+ ($param{skip} ? 'Skip' : 'Add').' span data '.$param{foundry}.':'.$param{layer}
+ );
+
+ return if $param{skip};
+
+ my $cb = delete $param{cb};
+
+ if ($param{encoding} && $param{encoding} eq 'bytes') {
+ $param{primary} = $self->doc->primary;
+ };
+
+ my $spans = KorAP::Tokenizer::Spans->new(
+ path => $self->path,
+ range => $self->range,
+ %param
+ );
+
+ my $spanarray = $spans->parse;
+
+ if ($spans->should == $spans->have) {
+ $self->log->trace('With perfect alignment!');
+ }
+ else {
+ $self->log->debug('With an alignment quota of ' . _perc($spans->should, $spans->have) . ' %');
+ };
+
+
+ if ($cb) {
+ foreach (@$spanarray) {
+ $cb->($self->stream, $_);
+ };
+ return;
+ };
+ return $spans;
+};
+
+
+# Add information to the tokens
+sub add_tokendata {
+ my $self = shift;
+ my %param = @_;
+
+ croak 'No token data available' unless $self->stream;
+
+ $self->log->trace(
+ ($param{skip} ? 'Skip' : 'Add').' token data '.$param{foundry}.':'.$param{layer}
+ );
+ return if $param{skip};
+
+ my $cb = delete $param{cb};
+
+ if ($param{encoding} && $param{encoding} eq 'bytes') {
+ $param{primary} = $self->doc->primary;
+ };
+
+ my $tokens = KorAP::Tokenizer::Tokens->new(
+ path => $self->path,
+ match => $self->match,
+ %param
+ );
+
+ my $tokenarray = $tokens->parse;
+
+ if ($tokens->should == $tokens->have) {
+ $self->log->trace('With perfect alignment!');
+ }
+ else {
+ my $perc = _perc(
+ $tokens->should, $tokens->have, $self->should, $self->should - $self->have
+ );
+ $self->log->debug('With an alignment quota of ' . $perc);
+ };
+
+ if ($cb) {
+ foreach (@$tokenarray) {
+ $cb->($self->stream, $_);
+ };
+ return;
+ };
+ return $tokens;
+};
+
+
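+# Format percentages for logging:
+# with two arguments, return $_[1]/$_[0] as a percentage;
+# with four arguments, return the alignment quota and, in square
+# brackets, its sum with the non-word quota (if it does not exceed 100%).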
+sub _perc {
+ if (@_ == 2) {
+ # '[' . $_[0] . '/' . $_[1] . ']' .
+ return sprintf("%.2f", ($_[1] * 100) / $_[0]);
+ }
+
+ my $a_should = shift;
+ my $a_have = shift;
+ my $b_should = shift;
+ my $b_have = shift;
+ my $a_quota = ($a_have * 100) / $a_should;
+ my $b_quota = ($b_have * 100) / $b_should;
+ return sprintf("%.2f", $a_quota) . '%' .
+ ((($a_quota + $b_quota) <= 100) ?
+ ' [' . sprintf("%.2f", $a_quota + $b_quota) . '%]' : '');
+};
+
+
+1;
+
+
+__END__
+
+=pod
+
+=head1 NAME
+
+KorAP::Tokenizer
+
+=head1 SYNOPSIS
+
+ my $tokens = KorAP::Tokenizer->new(
+ path => '../examples/00003',
+ doc => KorAP::Document->new( ... ),
+ foundry => 'opennlp',
+ layer => 'tokens'
+ );
+
+ $tokens->parse;
+
+=head1 DESCRIPTION
+
+Convert token information from the KorAP XML
+format into Lucene index-compatible token streams.
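+
+A single token of the serialized stream may, for example, look
+roughly like this (illustrative values only):
+
+ [(0-3)s:Der|i:der|_0#0-3]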
+
+=head1 ATTRIBUTES
+
+=head2 path
+
+ print $tokens->path;
+
+The path of the document.
+
+
+=head2 foundry
+
+ print $tokens->foundry;
+
+The name of the foundry.
+
+
+=head2 layer
+
+ print $tokens->layer;
+
+The name of the tokens layer.
+
+
+=head2 doc
+
+ print $tokens->doc->corpus_id;
+
+The L<KorAP::Document> object.
+
+
+=head2 stream
+
+ $tokens->stream->add_meta('adjCount', '<i>45');
+
+The L<KorAP::MultiTermTokenStream> object.
+
+
+=head2 range
+
+ $tokens->range->lookup(45);
+
+The L<KorAP::Tokenizer::Range> object for converting span offsets to positions.
+
+=head2 match
+
+ $tokens->match->lookup(45);
+
+The L<KorAP::Tokenizer::Match> object for converting token offsets to positions.
+
+
+=head1 METHODS
+
+=head2 parse
+
+ $tokens->parse;
+
+Start the tokenization process.
+
+
+=head2 add_spandata
+
+ $tokens->add_spandata(
+ foundry => 'base',
+ layer => 'sentences',
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ }
+ );
+
+Add span information to the parsed token stream.
+Expects a C<foundry> name, a C<layer> name and a
+callback parameter that will be called for each parsed
+span. The L<KorAP::MultiTermTokenStream> object will be passed,
+as well as the current L<KorAP::Tokenizer::Span>.
+
+An optional parameter C<encoding> may indicate whether the span offsets
+refer to C<bytes> or C<utf-8> offsets.
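+
+For byte offsets this may look like the following sketch (cf. the XIP
+constituency example in F<script/prepare_index.pl>):
+
+ $tokens->add_spandata(
+   foundry => 'xip',
+   layer => 'constituency',
+   encoding => 'bytes',
+   cb => sub { ... }
+ );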
+
+An optional parameter C<skip> allows for skipping the process.
+
+
+=head2 add_tokendata
+
+ $tokens->add_tokendata(
+ foundry => 'connexor',
+ layer => 'syntax',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ # syntax
+ if ((my $found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'cnx_syn:' . $found
+ );
+ };
+ });
+
+Add token information to the parsed token stream.
+Expects a C<foundry> name, a C<layer> name and a
+callback parameter that will be called for each parsed
+token. The L<KorAP::MultiTermTokenStream> object will be passed,
+as well as the current L<KorAP::Tokenizer::Token>.
+
+An optional parameter C<encoding> may indicate whether the token offsets
+refer to C<bytes> or C<utf-8> offsets.
+
+An optional parameter C<skip> allows for skipping the process.
+
+=cut
diff --git a/lib/KorAP/Tokenizer/Match.pm b/lib/KorAP/Tokenizer/Match.pm
new file mode 100644
index 0000000..e5a96f8
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Match.pm
@@ -0,0 +1,17 @@
+package KorAP::Tokenizer::Match;
+use strict;
+use warnings;
+
+sub new {
+ bless {}, shift;
+};
+
+sub set {
+ $_[0]->{$_[1] . ':' . $_[2]} = $_[3];
+};
+
+sub lookup {
+ $_[0]->{$_[1] . ':' . $_[2]} // undef;
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Range.pm b/lib/KorAP/Tokenizer/Range.pm
new file mode 100644
index 0000000..c18136b
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Range.pm
@@ -0,0 +1,55 @@
+package KorAP::Tokenizer::Range;
+use strict;
+use warnings;
+use Array::IntSpan;
+
+sub new {
+ my $class = shift;
+ my $range = Array::IntSpan->new;
+ bless \$range, $class;
+};
+
+sub set {
+ my $self = shift;
+ $$self->set_range(@_);
+};
+
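+# Mark a character range not covered by any token.
+# The stored value '!p:q' encodes the token positions immediately
+# preceding (p) and following (q) the gap (see before() and after()).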
+sub gap {
+ my $self = shift;
+ $$self->set_range($_[0], $_[1], '!' . ($_[2] - 1) . ':' . $_[2]);
+};
+
+sub lookup {
+ # Position 0 is valid, so check for definedness rather than truth
+ my $x = ${$_[0]}->lookup( $_[1] );
+ return unless defined $x;
+ return if index($x, '!') == 0;
+ return $x;
+};
+
+sub before {
+ my $self = shift;
+ my $offset = shift;
+ my $found = $$self->lookup( $offset );
+ unless (defined $found) {
+ warn 'There is no value for ', $offset;
+ return;
+ };
+
+ if ($found =~ /!(\d+):(\d+)$/) {
+ return $1 >= 0 ? $1 : 0;
+ }
+ else {
+ return $found;
+ };
+};
+
+sub after {
+ my $self = shift;
+ my $found = $$self->lookup( shift() );
+ return unless defined $found;
+ if ($found =~ /!(\d+):(\d+)$/) {
+ return $2;
+ }
+ else {
+ return $found;
+ };
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Span.pm b/lib/KorAP/Tokenizer/Span.pm
new file mode 100644
index 0000000..0010d7c
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Span.pm
@@ -0,0 +1,70 @@
+package KorAP::Tokenizer::Span;
+use strict;
+use warnings;
+use Mojo::DOM;
+
+sub new {
+ bless [], shift;
+};
+
+sub o_start {
+ if (defined $_[1]) {
+ $_[0]->[0] = $_[1];
+ };
+ $_[0]->[0];
+};
+
+sub o_end {
+ if (defined $_[1]) {
+ $_[0]->[1] = $_[1];
+ };
+ $_[0]->[1];
+};
+
+sub p_start {
+ if (defined $_[1]) {
+ $_[0]->[2] = $_[1];
+ };
+ $_[0]->[2];
+};
+
+sub p_end {
+ if (defined $_[1]) {
+ $_[0]->[3] = $_[1];
+ };
+ $_[0]->[3];
+};
+
+sub id {
+ if (defined $_[1]) {
+ $_[0]->[4] = $_[1];
+ };
+ $_[0]->[4];
+};
+
+
+sub content {
+ if (defined $_[1]) {
+ $_[0]->[5] = $_[1];
+ }
+ else {
+ if ($_[0]->processed) {
+ return $_[0]->[5];
+ }
+ else {
+ my $c = Mojo::DOM->new($_[0]->[5]);
+ $c->xml(1);
+ $_[0]->processed(1);
+ return $_[0]->[5] = $c;
+ };
+ };
+};
+
+sub processed {
+ if (defined $_[1]) {
+ $_[0]->[6] = $_[1] ? 1 : 0;
+ };
+ $_[0]->[6];
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Spans.pm b/lib/KorAP/Tokenizer/Spans.pm
new file mode 100644
index 0000000..7e1a382
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Spans.pm
@@ -0,0 +1,63 @@
+package KorAP::Tokenizer::Spans;
+use Mojo::Base -base;
+use KorAP::Tokenizer::Span;
+use Mojo::DOM;
+use Mojo::ByteStream 'b';
+
+has [qw/path foundry layer range primary should have/];
+has 'encoding' => 'utf-8';
+
+sub parse {
+ my $self = shift;
+ my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
+
+ my $spans = Mojo::DOM->new($file);
+ $spans->xml(1);
+
+ my ($should, $have) = (0,0);
+ my ($from, $to);
+
+ my @spans;
+ $spans->find('span')->each(
+ sub {
+ my $s = shift;
+
+ $should++;
+
+ if ($self->encoding eq 'bytes') {
+ $from = $self->primary->bytes2chars($s->attr('from'));
+ $to = $self->primary->bytes2chars($s->attr('to'));
+ }
+ else {
+ $from = $s->attr('from');
+ $to = $s->attr('to');
+ };
+
+ return unless $to > $from;
+
+ my $span = KorAP::Tokenizer::Span->new;
+
+ $span->id($s->attr('id'));
+ $span->o_start($from);
+ $span->o_end($to);
+ $span->p_start($self->range->after($span->o_start));
+ $span->p_end($self->range->before($span->o_end));
+
+ return unless $span->p_end >= $span->p_start;
+
+ if (@{$s->children}) {
+ $span->content($s->content_xml);
+ };
+
+ $have++;
+
+ push(@spans, $span);
+ });
+
+ $self->should($should);
+ $self->have($have);
+
+ return \@spans;
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Token.pm b/lib/KorAP/Tokenizer/Token.pm
new file mode 100644
index 0000000..f6c1971
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Token.pm
@@ -0,0 +1,37 @@
+package KorAP::Tokenizer::Token;
+use strict;
+use warnings;
+use Mojo::DOM;
+
+sub new {
+ bless [], shift;
+};
+
+sub pos {
+ if (defined $_[1]) {
+ $_[0]->[0] = $_[1];
+ };
+ $_[0]->[0];
+};
+
+sub content {
+ if ($_[1]) {
+ $_[0]->[1] = $_[1];
+ }
+ else {
+ my $c = Mojo::DOM->new($_[0]->[1]);
+ $c->xml(1);
+ return $c;
+ };
+};
+
+sub id {
+ if ($_[1]) {
+ $_[0]->[2] = $_[1];
+ }
+ else {
+ $_[0]->[2];
+ };
+};
+
+1;
diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
new file mode 100644
index 0000000..7657460
--- /dev/null
+++ b/lib/KorAP/Tokenizer/Tokens.pm
@@ -0,0 +1,62 @@
+package KorAP::Tokenizer::Tokens;
+use Mojo::Base -base;
+use Mojo::DOM;
+use Mojo::ByteStream 'b';
+use KorAP::Tokenizer::Token;
+
+has [qw/path foundry layer match primary should have/];
+has 'encoding' => 'utf-8';
+
+sub parse {
+ my $self = shift;
+ my $file = b($self->path . $self->foundry . '/' . $self->layer . '.xml')->slurp;
+
+ my $spans = Mojo::DOM->new($file);
+ $spans->xml(1);
+
+ my ($should, $have) = (0,0);
+ my ($from, $to);
+
+ my $match = $self->match;
+
+ my @tokens;
+ $spans->find('span')->each(
+ sub {
+ my $s = shift;
+
+ $should++;
+
+ if ($self->encoding eq 'bytes') {
+ $from = $self->primary->bytes2chars($s->attr('from'));
+ $to = $self->primary->bytes2chars($s->attr('to'));
+ }
+ else {
+ $from = $s->attr('from');
+ $to = $s->attr('to');
+ };
+
+ my $pos = $match->lookup($from, $to);
+
+ return unless defined $pos;
+
+ my $token = KorAP::Tokenizer::Token->new;
+ $token->id($s->attr('id'));
+ $token->pos($pos);
+
+ if (@{$s->children}) {
+ $token->content($s->content_xml);
+ };
+
+ $have++;
+
+ push(@tokens, $token);
+ });
+
+ $self->should($should);
+ $self->have($have);
+
+ return \@tokens;
+};
+
+
+1;
diff --git a/script/log4perl.conf b/script/log4perl.conf
new file mode 100644
index 0000000..90459dd
--- /dev/null
+++ b/script/log4perl.conf
@@ -0,0 +1,5 @@
+log4perl.rootLogger=TRACE, STDERR
+
+log4perl.appender.STDERR = Log::Log4perl::Appender::ScreenColoredLevels
+log4perl.appender.STDERR.layout=PatternLayout
+log4perl.appender.STDERR.layout.ConversionPattern=[%r] %F %L %c - %m%n
\ No newline at end of file
diff --git a/script/prepare_index.pl b/script/prepare_index.pl
new file mode 100644
index 0000000..f0dc398
--- /dev/null
+++ b/script/prepare_index.pl
@@ -0,0 +1,405 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use v5.16;
+use lib 'lib', '../lib';
+use Set::Scalar;
+use Mojo::DOM;
+use Mojo::Util qw/encode decode/;
+use Mojo::ByteStream 'b';
+
+use Log::Log4perl;
+Log::Log4perl->init("script/log4perl.conf");
+
+use KorAP::Document;
+use KorAP::Tokenizer;
+
+
+# Call perl script/prepare_index.pl WPD/AAA/00001
+
+sub parse_doc {
+ my $doc = KorAP::Document->new(
+ path => shift . '/'
+ );
+
+ $doc->parse;
+
+ my $tokens = KorAP::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => 'connexor',
+ layer => 'tokens'
+ );
+
+ $tokens->parse;
+
+ my $i = 0;
+ $tokens->add_spandata(
+ foundry => 'connexor',
+ layer => 'sentences',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ );
+
+ $tokens->stream->add_meta('s', '<i>' . $i);
+
+ $i = 0;
+ $tokens->add_spandata(
+ foundry => 'base',
+ layer => 'paragraph',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:p',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ );
+ $tokens->stream->add_meta('p', '<i>' . $i);
+
+ $tokens->add_tokendata(
+ foundry => 'opennlp',
+ layer => 'morpho',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # syntax
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'opennlp_p:' . $found
+ );
+ };
+ });
+
+
+ my $model = 'ne_dewac_175m_600';
+ $tokens->add_tokendata(
+ foundry => 'corenlp',
+ #skip => 1,
+ layer => $model,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'corenlp_' . $model . ':' . $found
+ );
+ };
+ });
+
+ $model = 'ne_hgc_175m_600';
+ $tokens->add_tokendata(
+ foundry => 'corenlp',
+ #skip => 1,
+ layer => $model,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'corenlp_' . $model . ':' . $found
+ );
+ };
+ });
+
+ $tokens->add_tokendata(
+ foundry => 'connexor',
+ layer => 'morpho',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # Lemma
+ if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
+ if (index($found, "\N{U+00a0}") >= 0) {
+ $found = b($found)->decode;
+ foreach (split(/\x{00A0}/, $found)) {
+ $mtt->add(
+ term => 'cnx_l:' . b($_)->encode
+ );
+ }
+ }
+ else {
+ $mtt->add(
+ term => 'cnx_l:' . $found # b($found)->encode
+ );
+ };
+ };
+
+ # POS
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'cnx_p:' . $found
+ );
+ };
+
+ # MSD
+ # Todo: Look in the description!
+ if (($found = $content->at('f[name="msd"]')) && ($found = $found->text)) {
+ foreach (split(':', $found)) {
+ $mtt->add(
+ term => 'cnx_m:' . $_
+ );
+ };
+ };
+ }
+ );
+
+ $tokens->add_tokendata(
+ foundry => 'connexor',
+ layer => 'syntax',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # syntax
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'cnx_syn:' . $found
+ );
+ };
+ });
+
+ $tokens->add_spandata(
+ foundry => 'connexor',
+ layer => 'phrase',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ my $type = $span->content->at('f[name=pos]');
+ if ($type && ($type = $type->text)) {
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:cnx_const:' . $type,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ };
+ }
+ );
+
+ $tokens->add_tokendata(
+ foundry => 'tree_tagger',
+ #skip => 1,
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # lemma
+ if (($found = $content->at('f[name="lemma"]')) &&
+ ($found = $found->text) && $found ne 'UNKNOWN') {
+ $mtt->add(
+ term => 'tt_l:' . $found
+ );
+ };
+
+ # pos
+ if (($found = $content->at('f[name="ctag"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'tt_p:' . $found
+ );
+ };
+ });
+
+ $tokens->add_tokendata(
+ foundry => 'mate',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # pos
+ if (($found = $content->at('f[name="pos"]')) &&
+ ($found = $found->text)) {
+ $mtt->add(term => 'mate_p:' . $found
+ );
+ };
+
+ # lemma
+ if (($found = $content->at('f[name="lemma"]'))
+ && ($found = $found->text)
+ && $found ne '--') {
+ $mtt->add(term => 'mate_l:' . b($found)->decode('latin-1')->encode->to_string);
+ };
+
+ # MSD
+ if (($found = $content->at('f[name="msd"]')) &&
+ ($found = $found->text) &&
+ ($found ne '_')) {
+ foreach (split '\|', $found) {
+ my ($x, $y) = split "=", $_;
+ # case, tense, number, mood, person, degree, gender
+ $mtt->add(term => 'mate_m:' . $x . ':' . $y);
+ };
+ };
+ });
+
+
+ $tokens->add_tokendata(
+ foundry => 'xip',
+ #skip => 1,
+ layer => 'morpho',
+ encoding => 'bytes',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ my $capital = 0;
+ # pos
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'xip_p:' . $found
+ );
+
+ $capital = 1 if $found eq 'NOUN';
+ };
+
+ # lemma
+ if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
+ my (@token) = split('#', $found);
+
+ my $full = '';
+ foreach (@token) {
+ $full .= $_;
+ $_ =~ s{/\w+$}{};
+ $mtt->add(term => 'xip_l:' . $_);
+ };
+ if (@token > 1) {
+ $full =~ s{/}{}g;
+ $full = lc $full;
+ $full = $capital ? ucfirst($full) : $full;
+ $mtt->add(term => 'xip_l:' . $full);
+ };
+ };
+ });
+
+
+ # Collect all spans and check for roots
+ my %xip_const;
+ my $xip_const_root = Set::Scalar->new;
+ my $xip_const_noroot = Set::Scalar->new;
+
+ # First run:
+ $tokens->add_spandata(
+ foundry => 'xip',
+ layer => 'constituency',
+ encoding => 'bytes',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ $xip_const{$span->id} = $span;
+ $xip_const_root->insert($span->id);
+
+ $span->content->find('rel[label=dominates][target]')->each(
+ sub {
+ my $rel = shift;
+ $xip_const_noroot->insert($rel->attr('target'));
+ }
+ );
+ }
+ );
+
+ my $stream = $tokens->stream;
+
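+ # Add a constituency span to the stream and recursively descend
+ # into all spans it dominates, increasing the nesting level.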
+ my $add_const = sub {
+ my $span = shift;
+ my $level = shift;
+ my $mtt = $stream->pos($span->p_start);
+
+ my $content = $span->content;
+ my $type = $content->at('f[name=const]');
+ if ($type && ($type = $type->text)) {
+ # $type is now NPA, NP, NUM
+ my %term = (
+ term => '<>:xip_const:' . $type,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+
+ $term{payload} = '<s>' . $level if $level;
+
+ $mtt->add(%term);
+
+ my $this = __SUB__;
+
+ $content->find('rel[label=dominates][target]')->each(
+ sub {
+ my $subspan = delete $xip_const{$_[0]->attr('target')} or return;
+ $this->($subspan, $level + 1);
+ }
+ );
+ };
+ };
+
+ my $diff = $xip_const_root->difference($xip_const_noroot);
+ foreach ($diff->members) {
+ my $obj = delete $xip_const{$_} or next;
+ $add_const->($obj, 0);
+ };
+
+ # Todo: Add mate-morpho
+ # Todo: Add mate-dependency
+ # Todo: Add xip-dependency
+
+ print $tokens->stream->to_string;
+};
+
+if ($ARGV[0]) {
+ parse_doc($ARGV[0]);
+};
+
+
+
+__END__