Lucene field indexer written in perl

commit: 2db9ad0d484214d641313937810225a635a997a4 [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Tue Oct 29 19:26:43 2013 +0000
committer: Nils Diewald <nils@diewald-online.de> Tue Oct 29 19:26:43 2013 +0000
tree: 46f404294a6ec09faf4d1a551a3a03599473dd61
diff --git a/script/prepare_index.pl b/script/prepare_index.pl
new file mode 100644
index 0000000..f0dc398
--- /dev/null
+++ b/script/prepare_index.pl

@@ -0,0 +1,405 @@
+#!/usr/bin/env perl
+use strict;
+use warnings;
+use v5.16;
+use lib 'lib', '../lib';
+use Set::Scalar;
+use Mojo::DOM;
+use Mojo::Util qw/encode decode/;
+use Mojo::ByteStream 'b';
+
+use Log::Log4perl;
+Log::Log4perl->init("script/log4perl.conf");
+
+use KorAP::Document;
+use KorAP::Tokenizer;
+
+
+# Usage (from the repository root): perl script/prepare_index.pl WPD/AAA/00001
+
+# Return the text of the first element matching $selector below $content,
+# or undef when no element matches or its text is empty.
+#
+# Emptiness is tested with length() rather than by truthiness, so that the
+# annotation value "0" (e.g. the lemma of the token "0") is not discarded.
+sub _feature_text {
+  my ($content, $selector) = @_;
+
+  my $node = $content->at($selector) or return;
+  my $text = $node->text;
+
+  return unless defined $text && length $text;
+  return $text;
+}
+
+# Build the term stream for a single document and print it to STDOUT.
+#
+# Expects the path of the document directory as its only argument
+# (e.g. WPD/AAA/00001); a trailing slash is appended before it is handed
+# to KorAP::Document. The tokenizer is set up on the connexor token layer,
+# and every further annotation layer is attached by a callback that adds
+# terms to the stream position of the respective token or span.
+sub parse_doc {
+  my $doc = KorAP::Document->new(
+    path => shift . '/'
+  );
+
+  $doc->parse;
+
+  my $tokens = KorAP::Tokenizer->new(
+    path => $doc->path,
+    doc => $doc,
+    foundry => 'connexor',
+    layer => 'tokens'
+  );
+
+  $tokens->parse;
+
+  # Structural spans: sentences and paragraphs are indexed as '<>:s' and
+  # '<>:p' spans, and the number of spans seen is stored as stream meta.
+  my @structures = (
+    ['connexor', 'sentences', 's'],
+    ['base', 'paragraph', 'p']
+  );
+
+  foreach my $structure (@structures) {
+    my ($foundry, $layer, $tag) = @$structure;
+
+    # One counter per layer - each callback increments its own lexical
+    my $count = 0;
+
+    $tokens->add_spandata(
+      foundry => $foundry,
+      layer => $layer,
+      #skip => 1,
+      cb => sub {
+        my ($stream, $span) = @_;
+        my $mtt = $stream->pos($span->p_start);
+        $mtt->add(
+          term => '<>:' . $tag,
+          o_start => $span->o_start,
+          o_end => $span->o_end,
+          p_end => $span->p_end
+        );
+        $count++;
+      }
+    );
+
+    $tokens->stream->add_meta($tag, '<i>' . $count);
+  };
+
+  $tokens->add_tokendata(
+    foundry => 'opennlp',
+    layer => 'morpho',
+    #skip => 1,
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+      my $content = $token->content;
+
+      # pos
+      if (defined(my $pos = _feature_text($content, 'f[name="pos"]'))) {
+        $mtt->add(
+          term => 'opennlp_p:' . $pos
+        );
+      };
+    });
+
+  # Both corenlp models are indexed the same way, each with its own model
+  # name as part of the term prefix.
+  foreach my $model (qw/ne_dewac_175m_600 ne_hgc_175m_600/) {
+    $tokens->add_tokendata(
+      foundry => 'corenlp',
+      #skip => 1,
+      layer => $model,
+      cb => sub {
+        my ($stream, $token) = @_;
+        my $mtt = $stream->pos($token->pos);
+        my $content = $token->content;
+
+        my $entity = _feature_text($content, 'f[name=ne] f[name=ent]');
+        if (defined $entity) {
+          $mtt->add(
+            term => 'corenlp_' . $model . ':' . $entity
+          );
+        };
+      });
+  };
+
+  $tokens->add_tokendata(
+    foundry => 'connexor',
+    layer => 'morpho',
+    #skip => 1,
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+      my $content = $token->content;
+
+      # Lemma
+      if (defined(my $lemma = _feature_text($content, 'f[name="lemma"]'))) {
+
+        # The value is split at non-breaking spaces and each part is
+        # added as its own term.
+        # NOTE(review): the decode/encode round trip presumably expects
+        # a UTF-8 byte string here - confirm against KorAP::Tokenizer
+        if (index($lemma, "\N{U+00a0}") >= 0) {
+          my $decoded = b($lemma)->decode;
+          foreach my $part (split(/\x{00A0}/, $decoded)) {
+            $mtt->add(
+              term => 'cnx_l:' . b($part)->encode
+            );
+          };
+        }
+        else {
+          $mtt->add(
+            term => 'cnx_l:' . $lemma
+          );
+        };
+      };
+
+      # POS
+      if (defined(my $pos = _feature_text($content, 'f[name="pos"]'))) {
+        $mtt->add(
+          term => 'cnx_p:' . $pos
+        );
+      };
+
+      # MSD
+      # Todo: Look in the description!
+      if (defined(my $msd = _feature_text($content, 'f[name="msd"]'))) {
+        foreach my $feature (split(':', $msd)) {
+          $mtt->add(
+            term => 'cnx_m:' . $feature
+          );
+        };
+      };
+    }
+  );
+
+  $tokens->add_tokendata(
+    foundry => 'connexor',
+    layer => 'syntax',
+    #skip => 1,
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+      my $content = $token->content;
+
+      # syntax
+      if (defined(my $syntax = _feature_text($content, 'f[name="pos"]'))) {
+        $mtt->add(
+          term => 'cnx_syn:' . $syntax
+        );
+      };
+    });
+
+  $tokens->add_spandata(
+    foundry => 'connexor',
+    layer => 'phrase',
+    #skip => 1,
+    cb => sub {
+      my ($stream, $span) = @_;
+
+      my $type = _feature_text($span->content, 'f[name=pos]');
+      if (defined $type) {
+        my $mtt = $stream->pos($span->p_start);
+        $mtt->add(
+          term => '<>:cnx_const:' . $type,
+          o_start => $span->o_start,
+          o_end => $span->o_end,
+          p_end => $span->p_end
+        );
+      };
+    }
+  );
+
+  $tokens->add_tokendata(
+    foundry => 'tree_tagger',
+    #skip => 1,
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+      my $content = $token->content;
+
+      # lemma - the value 'UNKNOWN' is not indexed
+      my $lemma = _feature_text($content, 'f[name="lemma"]');
+      if (defined $lemma && $lemma ne 'UNKNOWN') {
+        $mtt->add(
+          term => 'tt_l:' . $lemma
+        );
+      };
+
+      # pos
+      if (defined(my $pos = _feature_text($content, 'f[name="ctag"]'))) {
+        $mtt->add(
+          term => 'tt_p:' . $pos
+        );
+      };
+    });
+
+  $tokens->add_tokendata(
+    foundry => 'mate',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+      my $content = $token->content;
+
+      # pos
+      if (defined(my $pos = _feature_text($content, 'f[name="pos"]'))) {
+        $mtt->add(term => 'mate_p:' . $pos);
+      };
+
+      # lemma - the value '--' is not indexed
+      my $lemma = _feature_text($content, 'f[name="lemma"]');
+      if (defined $lemma && $lemma ne '--') {
+        $mtt->add(
+          term => 'mate_l:' . b($lemma)->decode('latin-1')->encode->to_string
+        );
+      };
+
+      # MSD - '|' separated key=value pairs; the value '_' is not indexed
+      my $msd = _feature_text($content, 'f[name="msd"]');
+      if (defined $msd && $msd ne '_') {
+        foreach my $feature (split '\|', $msd) {
+          my ($key, $value) = split '=', $feature;
+
+          # A missing key or value becomes an empty string, so the term
+          # is built without an "uninitialized value" warning.
+          # case, tense, number, mood, person, degree, gender
+          $mtt->add(term => 'mate_m:' . ($key // '') . ':' . ($value // ''));
+        };
+      };
+    });
+
+  $tokens->add_tokendata(
+    foundry => 'xip',
+    #skip => 1,
+    layer => 'morpho',
+    encoding => 'bytes',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+      my $content = $token->content;
+
+      # Set for nouns: their joined multi-part lemma gets an initial capital
+      my $capital = 0;
+
+      # pos
+      if (defined(my $pos = _feature_text($content, 'f[name="pos"]'))) {
+        $mtt->add(
+          term => 'xip_p:' . $pos
+        );
+
+        $capital = 1 if $pos eq 'NOUN';
+      };
+
+      # lemma - split at '#'; every part is indexed on its own
+      if (defined(my $lemma = _feature_text($content, 'f[name="lemma"]'))) {
+        my @parts = split('#', $lemma);
+
+        my $full = '';
+        foreach my $part (@parts) {
+          $full .= $part;
+
+          # Strip a trailing '/' plus word characters before indexing
+          (my $bare = $part) =~ s{/\w+$}{};
+          $mtt->add(term => 'xip_l:' . $bare);
+        };
+
+        # More than one part: also index the joined, lowercased form
+        if (@parts > 1) {
+          $full =~ s{/}{}g;
+          $full = lc $full;
+          $full = $capital ? ucfirst($full) : $full;
+          $mtt->add(term => 'xip_l:' . $full);
+        };
+      };
+    });
+
+  # Collect all spans and check for roots
+  my %xip_const;
+  my $xip_const_root = Set::Scalar->new;
+  my $xip_const_noroot = Set::Scalar->new;
+
+  # First run: remember every span by id and note all dominated targets
+  $tokens->add_spandata(
+    foundry => 'xip',
+    layer => 'constituency',
+    encoding => 'bytes',
+    #skip => 1,
+    cb => sub {
+      my ($stream, $span) = @_;
+
+      $xip_const{$span->id} = $span;
+      $xip_const_root->insert($span->id);
+
+      $span->content->find('rel[label=dominates][target]')->each(
+        sub {
+          my $rel = shift;
+          $xip_const_noroot->insert($rel->attr('target'));
+        }
+      );
+    }
+  );
+
+  my $stream = $tokens->stream;
+
+  # Second run: add a span term for the given constituent and descend
+  # into the spans it dominates. The nesting level is added as payload
+  # for every level below the root.
+  my $add_const = sub {
+    my $span = shift;
+    my $level = shift;
+    my $mtt = $stream->pos($span->p_start);
+
+    my $content = $span->content;
+    my $type = _feature_text($content, 'f[name=const]');
+    if (defined $type) {
+      # $type is now e.g. NPA, NP, NUM
+      my %term = (
+        term => '<>:xip_const:' . $type,
+        o_start => $span->o_start,
+        o_end => $span->o_end,
+        p_end => $span->p_end
+      );
+
+      $term{payload} = '<s>' . $level if $level;
+
+      $mtt->add(%term);
+
+      # Keep a handle on this sub - inside the nested callback below
+      # __SUB__ would refer to that callback instead
+      my $this = __SUB__;
+
+      $content->find('rel[label=dominates][target]')->each(
+        sub {
+          # A span is removed from the lookup when visited - added once
+          my $subspan = delete $xip_const{$_[0]->attr('target')} or return;
+          $this->($subspan, $level + 1);
+        }
+      );
+    };
+  };
+
+  # The roots are all spans that no 'dominates' relation targets
+  my $diff = $xip_const_root->difference($xip_const_noroot);
+  foreach my $id ($diff->members) {
+    my $obj = delete $xip_const{$id} or next;
+    $add_const->($obj, 0);
+  };
+
+  # Todo: Add mate-dependency
+  # Todo: Add xip-dependency
+
+  print $tokens->stream->to_string;
+}
+
+# Only run when a document path was given as the first argument
+my ($path) = @ARGV;
+parse_doc($path) if $path;
+
+
+
+__END__
commit	2db9ad0d484214d641313937810225a635a997a4	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Tue Oct 29 19:26:43 2013 +0000
committer	Nils Diewald <nils@diewald-online.de>	Tue Oct 29 19:26:43 2013 +0000
tree	46f404294a6ec09faf4d1a551a3a03599473dd61