Lucene field indexer written in perl
diff --git a/script/prepare_index.pl b/script/prepare_index.pl
new file mode 100644
index 0000000..f0dc398
--- /dev/null
+++ b/script/prepare_index.pl
@@ -0,0 +1,405 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use v5.16;
+use lib 'lib', '../lib';
+use Set::Scalar;
+use Mojo::DOM;
+use Mojo::Util qw/encode decode/;
+use Mojo::ByteStream 'b';
+
+use Log::Log4perl;
+Log::Log4perl->init("script/log4perl.conf");
+
+use KorAP::Document;
+use KorAP::Tokenizer;
+
+
+# Call perl script/prepare_index.pl WPD/AAA/00001
+
+sub parse_doc {
+ my $doc = KorAP::Document->new(
+ path => shift . '/'
+ );
+
+ $doc->parse;
+
+ my $tokens = KorAP::Tokenizer->new(
+ path => $doc->path,
+ doc => $doc,
+ foundry => 'connexor',
+ layer => 'tokens'
+ );
+
+ $tokens->parse;
+
+ my $i = 0;
+ $tokens->add_spandata(
+ foundry => 'connexor',
+ layer => 'sentences',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:s',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ );
+
+ $tokens->stream->add_meta('s', '<i>' . $i);
+
+ $i = 0;
+ $tokens->add_spandata(
+ foundry => 'base',
+ layer => 'paragraph',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:p',
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ $i++;
+ }
+ );
+ $tokens->stream->add_meta('p', '<i>' . $i);
+
+ $tokens->add_tokendata(
+ foundry => 'opennlp',
+ layer => 'morpho',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # syntax
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'opennlp_p:' . $found
+ );
+ };
+ });
+
+
+ my $model = 'ne_dewac_175m_600';
+ $tokens->add_tokendata(
+ foundry => 'corenlp',
+ #skip => 1,
+ layer => $model,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'corenlp_' . $model . ':' . $found
+ );
+ };
+ });
+
+ $model = 'ne_hgc_175m_600';
+ $tokens->add_tokendata(
+ foundry => 'corenlp',
+ #skip => 1,
+ layer => $model,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ if (($found = $content->at('f[name=ne] f[name=ent]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'corenlp_' . $model . ':' . $found
+ );
+ };
+ });
+
+ $tokens->add_tokendata(
+ foundry => 'connexor',
+ layer => 'morpho',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # Lemma
+ if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
+ if (index($found, "\N{U+00a0}") >= 0) {
+ $found = b($found)->decode;
+ foreach (split(/\x{00A0}/, $found)) {
+ $mtt->add(
+ term => 'cnx_l:' . b($_)->encode
+ );
+ }
+ }
+ else {
+ $mtt->add(
+ term => 'cnx_l:' . $found # b($found)->encode
+ );
+ };
+ };
+
+ # POS
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'cnx_p:' . $found
+ );
+ };
+
+ # MSD
+ # Todo: Look in the description!
+ if (($found = $content->at('f[name="msd"]')) && ($found = $found->text)) {
+ foreach (split(':', $found)) {
+ $mtt->add(
+ term => 'cnx_m:' . $_
+ );
+ };
+ };
+ }
+ );
+
+ $tokens->add_tokendata(
+ foundry => 'connexor',
+ layer => 'syntax',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # syntax
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'cnx_syn:' . $found
+ );
+ };
+ });
+
+ $tokens->add_spandata(
+ foundry => 'connexor',
+ layer => 'phrase',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ my $type = $span->content->at('f[name=pos]');
+ if ($type && ($type = $type->text)) {
+ my $mtt = $stream->pos($span->p_start);
+ $mtt->add(
+ term => '<>:cnx_const:' . $type,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+ };
+ }
+ );
+
+ $tokens->add_tokendata(
+ foundry => 'tree_tagger',
+ #skip => 1,
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ # lemma
+ if (($found = $content->at('f[name="lemma"]')) &&
+ ($found = $found->text) && $found ne 'UNKNOWN') {
+ $mtt->add(
+ term => 'tt_l:' . $found
+ );
+ };
+
+ # pos
+ if (($found = $content->at('f[name="ctag"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'tt_p:' . $found
+ );
+ };
+ });
+
+ $tokens->add_tokendata(
+ foundry => 'mate',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ my $capital = 0;
+
+ # pos
+ if (($found = $content->at('f[name="pos"]')) &&
+ ($found = $found->text)) {
+ $mtt->add(term => 'mate_p:' . $found
+ );
+ };
+
+ # lemma
+ if (($found = $content->at('f[name="lemma"]'))
+ && ($found = $found->text)
+ && $found ne '--') {
+ $mtt->add(term => 'mate_l:' . b($found)->decode('latin-1')->encode->to_string);
+ };
+
+ # MSD
+ if (($found = $content->at('f[name="msd"]')) &&
+ ($found = $found->text) &&
+ ($found ne '_')) {
+ foreach (split '\|', $found) {
+ my ($x, $y) = split "=", $_;
+ # case, tense, number, mood, person, degree, gender
+ $mtt->add(term => 'mate_m:' . $x . ':' . $y);
+ };
+ };
+ });
+
+
+ $tokens->add_tokendata(
+ foundry => 'xip',
+ #skip => 1,
+ layer => 'morpho',
+ encoding => 'bytes',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+ my $content = $token->content;
+
+ my $found;
+
+ my $capital = 0;
+ # pos
+ if (($found = $content->at('f[name="pos"]')) && ($found = $found->text)) {
+ $mtt->add(
+ term => 'xip_p:' . $found
+ );
+
+ $capital = 1 if $found eq 'NOUN';
+ };
+
+ # lemma
+ if (($found = $content->at('f[name="lemma"]')) && ($found = $found->text)) {
+ my (@token) = split('#', $found);
+
+ my $full = '';
+ foreach (@token) {
+ $full .= $_;
+ $_ =~ s{/\w+$}{};
+ $mtt->add(term => 'xip_l:' . $_);
+ };
+ if (@token > 1) {
+ $full =~ s{/}{}g;
+ $full = lc $full;
+ $full = $capital ? ucfirst($full) : $full;
+ $mtt->add(term => 'xip_l:' . $full);
+ };
+ };
+ });
+
+
+ # Collect all spans and check for roots
+ my %xip_const;
+ my $xip_const_root = Set::Scalar->new;
+ my $xip_const_noroot = Set::Scalar->new;
+
+ # First run:
+ $tokens->add_spandata(
+ foundry => 'xip',
+ layer => 'constituency',
+ encoding => 'bytes',
+ #skip => 1,
+ cb => sub {
+ my ($stream, $span) = @_;
+
+ $xip_const{$span->id} = $span;
+ $xip_const_root->insert($span->id);
+
+ $span->content->find('rel[label=dominates][target]')->each(
+ sub {
+ my $rel = shift;
+ $xip_const_noroot->insert($rel->attr('target'));
+ }
+ );
+ }
+ );
+
+ my $stream = $tokens->stream;
+
+ my $add_const = sub {
+ my $span = shift;
+ my $level = shift;
+ my $mtt = $stream->pos($span->p_start);
+
+ my $content = $span->content;
+ my $type = $content->at('f[name=const]');
+ if ($type && ($type = $type->text)) {
+ # $type is now NPA, NP, NUM
+ my %term = (
+ term => '<>:xip_const:' . $type,
+ o_start => $span->o_start,
+ o_end => $span->o_end,
+ p_end => $span->p_end
+ );
+
+ $term{payload} = '<s>' . $level if $level;
+
+ $mtt->add(%term);
+
+ my $this = __SUB__;
+
+ $content->find('rel[label=dominates][target]')->each(
+ sub {
+ my $subspan = delete $xip_const{$_[0]->attr('target')} or return;
+ $this->($subspan, $level + 1);
+ }
+ );
+ };
+ };
+
+ my $diff = $xip_const_root->difference($xip_const_noroot);
+ foreach ($diff->members) {
+ my $obj = delete $xip_const{$_} or next;
+ $add_const->($obj, 0);
+ };
+
+ # Todo: Add mate-morpho
+ # Todo: Add mate-dependency
+ # Todo: Add xip-dependency
+
+ print $tokens->stream->to_string;
+};
+
+if ($ARGV[0]) {
+ parse_doc($ARGV[0]);
+};
+
+
+
+__END__