blob: f0dc39891db767fad8438fe6b7d711a890697a26 [file] [log] [blame]
#!/usr/bin/env perl
use strict;
use warnings;
use v5.16;
use lib 'lib', '../lib';
use Set::Scalar;
use Mojo::DOM;
use Mojo::Util qw/encode decode/;
use Mojo::ByteStream 'b';
use Log::Log4perl;
Log::Log4perl->init("script/log4perl.conf");
use KorAP::Document;
use KorAP::Tokenizer;
# Usage: perl script/prepare_index.pl <document path>   (e.g. WPD/AAA/00001)
# Return the text of the first element below $content that matches the
# CSS selector $selector. Returns undef (in scalar context) when no such
# element exists; callers treat any false text the same as "missing".
sub _feature_text {
    my ($content, $selector) = @_;
    my $node = $content->at($selector) or return;
    return $node->text;
}

# Add a span term named $term at the span's start position, carrying the
# span's character offsets (o_start, o_end) and its end position (p_end).
sub _add_span_term {
    my ($stream, $span, $term) = @_;
    my $mtt = $stream->pos($span->p_start);
    $mtt->add(
        term    => $term,
        o_start => $span->o_start,
        o_end   => $span->o_end,
        p_end   => $span->p_end
    );
    return;
}

# Parse the document found under the given path, tokenize it on the
# connexor token layer, enrich the token stream with the annotation
# layers listed below and print the resulting stream to STDOUT.
#
# Parameter: the document path (a trailing '/' is appended here).
#
# Layers are added in this fixed order:
#   connexor/sentences, base/paragraph, opennlp/morpho,
#   corenlp NE models, connexor/morpho, connexor/syntax,
#   connexor/phrase, tree_tagger/morpho, mate/morpho, xip/morpho,
#   xip/constituency
sub parse_doc {
    my ($path) = @_;

    my $doc = KorAP::Document->new(
        path => $path . '/'
    );
    $doc->parse;

    my $tokens = KorAP::Tokenizer->new(
        path    => $doc->path,
        doc     => $doc,
        foundry => 'connexor',
        layer   => 'tokens'
    );
    $tokens->parse;

    # Sentences: one '<>:s' span per sentence; the total is stored as meta.
    # The counter is read directly after add_spandata returns, so the
    # callbacks are expected to have run by then.
    my $sentence_count = 0;
    $tokens->add_spandata(
        foundry => 'connexor',
        layer   => 'sentences',
        #skip => 1,
        cb      => sub {
            my ($stream, $span) = @_;
            _add_span_term($stream, $span, '<>:s');
            $sentence_count++;
        }
    );
    $tokens->stream->add_meta('s', '<i>' . $sentence_count);

    # Paragraphs: one '<>:p' span per paragraph; the total is stored as meta.
    my $paragraph_count = 0;
    $tokens->add_spandata(
        foundry => 'base',
        layer   => 'paragraph',
        #skip => 1,
        cb      => sub {
            my ($stream, $span) = @_;
            _add_span_term($stream, $span, '<>:p');
            $paragraph_count++;
        }
    );
    $tokens->stream->add_meta('p', '<i>' . $paragraph_count);

    # OpenNLP: part of speech.
    $tokens->add_tokendata(
        foundry => 'opennlp',
        layer   => 'morpho',
        #skip => 1,
        cb      => sub {
            my ($stream, $token) = @_;
            my $mtt     = $stream->pos($token->pos);
            my $content = $token->content;

            if (my $pos = _feature_text($content, 'f[name="pos"]')) {
                $mtt->add(term => 'opennlp_p:' . $pos);
            }
        }
    );

    # CoreNLP named entities, one pass per model. Each iteration has its
    # own $model lexical, so every callback tags with the model it was
    # registered for regardless of when it is invoked.
    for my $model (qw/ne_dewac_175m_600 ne_hgc_175m_600/) {
        $tokens->add_tokendata(
            foundry => 'corenlp',
            #skip => 1,
            layer   => $model,
            cb      => sub {
                my ($stream, $token) = @_;
                my $mtt     = $stream->pos($token->pos);
                my $content = $token->content;

                my $entity = _feature_text($content, 'f[name=ne] f[name=ent]');
                if ($entity) {
                    $mtt->add(term => 'corenlp_' . $model . ':' . $entity);
                }
            }
        );
    }

    # Connexor: lemma, part of speech and morphosyntactic description.
    $tokens->add_tokendata(
        foundry => 'connexor',
        layer   => 'morpho',
        #skip => 1,
        cb      => sub {
            my ($stream, $token) = @_;
            my $mtt     = $stream->pos($token->pos);
            my $content = $token->content;

            # Lemma
            if (my $lemma = _feature_text($content, 'f[name="lemma"]')) {
                if (index($lemma, "\N{U+00a0}") >= 0) {
                    # A lemma containing U+00A0 is split at that character
                    # and every part becomes its own lemma term.
                    $lemma = b($lemma)->decode;
                    for my $part (split(/\x{00A0}/, $lemma)) {
                        $mtt->add(term => 'cnx_l:' . b($part)->encode);
                    }
                }
                else {
                    $mtt->add(term => 'cnx_l:' . $lemma);
                }
            }

            # POS
            if (my $pos = _feature_text($content, 'f[name="pos"]')) {
                $mtt->add(term => 'cnx_p:' . $pos);
            }

            # MSD: colon separated list, one term per entry.
            # Todo: Look in the description!
            if (my $msd = _feature_text($content, 'f[name="msd"]')) {
                for my $entry (split(':', $msd)) {
                    $mtt->add(term => 'cnx_m:' . $entry);
                }
            }
        }
    );

    # Connexor: syntax (read from the "pos" feature of the syntax layer).
    $tokens->add_tokendata(
        foundry => 'connexor',
        layer   => 'syntax',
        #skip => 1,
        cb      => sub {
            my ($stream, $token) = @_;
            my $mtt     = $stream->pos($token->pos);
            my $content = $token->content;

            if (my $syntax = _feature_text($content, 'f[name="pos"]')) {
                $mtt->add(term => 'cnx_syn:' . $syntax);
            }
        }
    );

    # Connexor: phrase spans; spans without a type are ignored.
    $tokens->add_spandata(
        foundry => 'connexor',
        layer   => 'phrase',
        #skip => 1,
        cb      => sub {
            my ($stream, $span) = @_;

            if (my $type = _feature_text($span->content, 'f[name=pos]')) {
                _add_span_term($stream, $span, '<>:cnx_const:' . $type);
            }
        }
    );

    # TreeTagger: lemma (unless 'UNKNOWN') and part of speech ("ctag").
    $tokens->add_tokendata(
        foundry => 'tree_tagger',
        #skip => 1,
        layer   => 'morpho',
        cb      => sub {
            my ($stream, $token) = @_;
            my $mtt     = $stream->pos($token->pos);
            my $content = $token->content;

            # lemma
            my $lemma = _feature_text($content, 'f[name="lemma"]');
            if ($lemma && $lemma ne 'UNKNOWN') {
                $mtt->add(term => 'tt_l:' . $lemma);
            }

            # pos
            if (my $pos = _feature_text($content, 'f[name="ctag"]')) {
                $mtt->add(term => 'tt_p:' . $pos);
            }
        }
    );

    # Mate: part of speech, lemma (unless '--') and MSD (unless '_').
    $tokens->add_tokendata(
        foundry => 'mate',
        layer   => 'morpho',
        cb      => sub {
            my ($stream, $token) = @_;
            my $mtt     = $stream->pos($token->pos);
            my $content = $token->content;

            # pos
            if (my $pos = _feature_text($content, 'f[name="pos"]')) {
                $mtt->add(term => 'mate_p:' . $pos);
            }

            # lemma: re-coded from latin-1 before it is added
            my $lemma = _feature_text($content, 'f[name="lemma"]');
            if ($lemma && $lemma ne '--') {
                $mtt->add(
                    term => 'mate_l:'
                        . b($lemma)->decode('latin-1')->encode->to_string
                );
            }

            # MSD: '|' separated key=value pairs
            # (case, tense, number, mood, person, degree, gender)
            my $msd = _feature_text($content, 'f[name="msd"]');
            if ($msd && $msd ne '_') {
                for my $feature (split /\|/, $msd) {
                    # NOTE(review): assumes every entry has the form
                    # key=value; an entry without '=' leaves $value
                    # undefined - confirm against the mate data.
                    my ($key, $value) = split "=", $feature;
                    $mtt->add(term => 'mate_m:' . $key . ':' . $value);
                }
            }
        }
    );

    # XIP: part of speech and lemma. Lemma parts are separated by '#';
    # a trailing "/word" suffix is stripped from each part.
    $tokens->add_tokendata(
        foundry  => 'xip',
        #skip => 1,
        layer    => 'morpho',
        encoding => 'bytes',
        cb       => sub {
            my ($stream, $token) = @_;
            my $mtt     = $stream->pos($token->pos);
            my $content = $token->content;

            # Set when the token is tagged NOUN; the joined lemma built
            # below is then written with a leading capital.
            my $capital = 0;

            # pos
            if (my $pos = _feature_text($content, 'f[name="pos"]')) {
                $mtt->add(term => 'xip_p:' . $pos);
                $capital = 1 if $pos eq 'NOUN';
            }

            # lemma
            if (my $lemma = _feature_text($content, 'f[name="lemma"]')) {
                my @parts = split('#', $lemma);
                my $full  = '';

                for my $part (@parts) {
                    # The unstripped part goes into the joined form ...
                    $full .= $part;

                    # ... while the single term loses its "/word" suffix.
                    (my $stripped = $part) =~ s{/\w+$}{};
                    $mtt->add(term => 'xip_l:' . $stripped);
                }

                # For multi-part lemmas the joined form is added as well.
                if (@parts > 1) {
                    $full =~ s{/}{}g;
                    $full = lc $full;
                    $full = ucfirst($full) if $capital;
                    $mtt->add(term => 'xip_l:' . $full);
                }
            }
        }
    );

    # XIP constituency, first run: collect all spans by id and record
    # which ids are the target of a "dominates" relation (i.e. not roots).
    my %xip_const;
    my $xip_const_root   = Set::Scalar->new;
    my $xip_const_noroot = Set::Scalar->new;

    $tokens->add_spandata(
        foundry  => 'xip',
        layer    => 'constituency',
        encoding => 'bytes',
        #skip => 1,
        cb       => sub {
            my ($stream, $span) = @_;

            $xip_const{$span->id} = $span;
            $xip_const_root->insert($span->id);

            $span->content->find('rel[label=dominates][target]')->each(
                sub {
                    my $rel = shift;
                    $xip_const_noroot->insert($rel->attr('target'));
                }
            );
        }
    );

    # XIP constituency, second run: add a span and then, recursively, every
    # span it dominates. $level is the depth below the root; it is stored
    # as payload for every level except 0.
    my $stream    = $tokens->stream;
    my $add_const = sub {
        my ($span, $level) = @_;

        my $mtt     = $stream->pos($span->p_start);
        my $content = $span->content;

        # Spans without a constituent type are dropped together with
        # everything below them.
        my $type = _feature_text($content, 'f[name=const]') or return;

        # $type is now NPA, NP, NUM
        my %term = (
            term    => '<>:xip_const:' . $type,
            o_start => $span->o_start,
            o_end   => $span->o_end,
            p_end   => $span->p_end
        );
        $term{payload} = '<s>' . $level if $level;
        $mtt->add(%term);

        # Inside the nested callback __SUB__ would name that callback,
        # so keep a handle on this subroutine for the recursion.
        my $this = __SUB__;
        $content->find('rel[label=dominates][target]')->each(
            sub {
                # Deleting the entry ensures each span is added only once.
                my $subspan = delete $xip_const{$_[0]->attr('target')}
                    or return;
                $this->($subspan, $level + 1);
            }
        );
    };

    # Roots are all collected ids that no other span dominates.
    my $roots = $xip_const_root->difference($xip_const_noroot);
    for my $id ($roots->members) {
        my $span = delete $xip_const{$id} or next;
        $add_const->($span, 0);
    }

    # Todo: Add mate-morpho
    # Todo: Add mate-dependency
    # Todo: Add xip-dependency

    print $tokens->stream->to_string;
}
# Script entry point: process the document named by the first command
# line argument; do nothing when it is missing (or otherwise false).
parse_doc($ARGV[0]) if $ARGV[0];
__END__