Fix script for new index (including new foundries)
diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 5db3851..0d8742b 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm
@@ -24,6 +24,12 @@
return $mt;
};
+sub id_counter {  # Return a new term id: the counter starts at 1 and grows by one per call
+  my $self = shift;
+  $self->{id_counter} //= 1;
+  return $self->{id_counter}++;
+};
+
sub surface {
substr($_[0]->{mt}->[0]->term,2);
@@ -47,6 +53,7 @@
[uniq(map($_->to_string, @{$self->{mt}}))];
};
+
sub to_solr {
my $self = shift;
my @array = map { $_->to_solr(0) } @{$self->{mt}};
@@ -54,6 +61,7 @@
return \@array;
};
+
1;
diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index f0b6e6d..105bd59 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm
@@ -25,6 +25,10 @@
return 1;
};
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['base/s=spans'];
+};
+
1;
diff --git a/lib/KorAP/Index/Base/Sentences.pm b/lib/KorAP/Index/Base/Sentences.pm
index a9fcd25..f1ca6f9 100644
--- a/lib/KorAP/Index/Base/Sentences.pm
+++ b/lib/KorAP/Index/Base/Sentences.pm
@@ -39,4 +39,8 @@
return 1;
};
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['base/s=spans'];
+};
+
1;
diff --git a/lib/KorAP/Index/Connexor/Morpho.pm b/lib/KorAP/Index/Connexor/Morpho.pm
index 27fd7ee..a6970f5 100644
--- a/lib/KorAP/Index/Connexor/Morpho.pm
+++ b/lib/KorAP/Index/Connexor/Morpho.pm
@@ -1,6 +1,24 @@
package KorAP::Index::Connexor::Morpho;
use KorAP::Index::Base;
+our %MAP = (  # tag => category name; NOTE(review): not referenced in the hunks shown here - confirm where it is consumed
+ 'v_ind' => 'mood',
+ 'v_imp' => 'mood',
+ 'v_sub' => 'mood',
+ 'v_fin' => 'inf',
+ 'v_pcp' => 'inf',
+ 'v_pres' => 'tense',
+ 'v_past' => 'tense',
+ 'v_prog' => 'tense',
+ 'v_perf' => 'tense',
+ 'n_abbr' => 'type',
+ 'n_prop' => 'type',
+ 'n_pl' => 'type',
+ 'a_cmp' => 'degree',
+ 'a_sub' => 'degree',
+ 'num_ord' => 'type'
+);
+
sub parse {
my $self = shift;
@@ -43,7 +61,8 @@
}
# MSD
- # This could follow http://www.ids-mannheim.de/cosmas2/projekt/referenz/connexor/morph.html
+ # This could follow
+ # http://www.ids-mannheim.de/cosmas2/projekt/referenz/connexor/morph.html
elsif (($f->{-name} eq 'msd') && ($found = $f->{'#text'})) {
foreach (split(':', $found)) {
$mtt->add(
@@ -59,7 +78,7 @@
};
sub layer_info {
- ['cnx/l=lemma', 'cnx/p=pos', 'cnx/m=msd'];
+ ['cnx/l=tokens', 'cnx/p=tokens', 'cnx/m=tokens'];
};
diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
index f6dbe99..ed36de3 100644
--- a/lib/KorAP/Index/Connexor/Phrase.pm
+++ b/lib/KorAP/Index/Connexor/Phrase.pm
@@ -32,7 +32,7 @@
};
sub layer_info {
- ['cnx/c=const'];
+ ['cnx/c=spans'];
};
diff --git a/lib/KorAP/Index/Connexor/Sentences.pm b/lib/KorAP/Index/Connexor/Sentences.pm
index 04cee09..ac6f89f 100644
--- a/lib/KorAP/Index/Connexor/Sentences.pm
+++ b/lib/KorAP/Index/Connexor/Sentences.pm
@@ -26,4 +26,9 @@
return 1;
};
+
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['cnx/s=spans'];
+};
+
1;
diff --git a/lib/KorAP/Index/Connexor/Syntax.pm b/lib/KorAP/Index/Connexor/Syntax.pm
index 2e58316..c65c4f5 100644
--- a/lib/KorAP/Index/Connexor/Syntax.pm
+++ b/lib/KorAP/Index/Connexor/Syntax.pm
@@ -27,5 +27,8 @@
return 1;
};
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['cnx/syn=tokens'];
+};
1;
diff --git a/lib/KorAP/Index/CoreNLP/Morpho.pm b/lib/KorAP/Index/CoreNLP/Morpho.pm
index 5704cdc..2d3491d 100644
--- a/lib/KorAP/Index/CoreNLP/Morpho.pm
+++ b/lib/KorAP/Index/CoreNLP/Morpho.pm
@@ -28,7 +28,7 @@
};
sub layer_info {
- ['corenlp/p=pos'];
+ ['corenlp/p=tokens'];
};
1;
diff --git a/lib/KorAP/Index/CoreNLP/NamedEntities.pm b/lib/KorAP/Index/CoreNLP/NamedEntities.pm
index 9f0b492..942b5b6 100644
--- a/lib/KorAP/Index/CoreNLP/NamedEntities.pm
+++ b/lib/KorAP/Index/CoreNLP/NamedEntities.pm
@@ -2,12 +2,12 @@
use KorAP::Index::Base;
sub parse {
- my $self = shift;
- my $model = shift;
+ my $self = shift;
+ my $model = shift;
$$self->add_tokendata(
foundry => 'corenlp',
- layer => $model,
+ layer => $model // lc('NamedEntities'),
cb => sub {
my ($stream, $token) = @_;
my $mtt = $stream->pos($token->pos);
@@ -21,7 +21,7 @@
($found->{-name} eq 'ent') &&
($found = $found->{'#text'})) {
$mtt->add(
- term => 'corenlp/' . $model . ':' . $found
+ term => 'corenlp/ne:' . $found
);
};
}) or return;
@@ -29,4 +29,8 @@
return 1;
};
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['corenlp/ne=tokens'];
+};
+
1;
diff --git a/lib/KorAP/Index/CoreNLP/Sentences.pm b/lib/KorAP/Index/CoreNLP/Sentences.pm
index 1bd84e0..0f40213 100644
--- a/lib/KorAP/Index/CoreNLP/Sentences.pm
+++ b/lib/KorAP/Index/CoreNLP/Sentences.pm
@@ -26,4 +26,9 @@
return 1;
};
+
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['corenlp/s=spans'];
+};
+
1;
diff --git a/lib/KorAP/Index/Glemm/Morpho.pm b/lib/KorAP/Index/Glemm/Morpho.pm
new file mode 100644
index 0000000..b0bc589
--- /dev/null
+++ b/lib/KorAP/Index/Glemm/Morpho.pm
@@ -0,0 +1,56 @@
+package KorAP::Index::Glemm::Morpho;
+use KorAP::Index::Base;
+
+sub parse {  # Add 'glemm/l:' lemma terms to the token stream; returns 1, or nothing if add_tokendata returns false
+ my $self = shift;
+
+ $$self->add_tokendata(
+ foundry => 'glemm',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash->{fs}->{f} or return;  # token carries no features: nothing to add
+
+ # All interpretations (a single hash is handled like a one-element list)
+ foreach (ref $content eq 'ARRAY' ? @$content : $content) {
+
+ # All features of the current interpretation ($content is reused for them)
+ $content = $_->{fs}->{f};
+
+ my $lemma;
+ my ($composition, $derivation) = (0,0);  # set to 1 when the matching feature has the text 'true'
+
+ # Iterate over the features (a single hash is handled like a one-element list)
+ foreach (ref $content eq 'ARRAY' ? @$content : $content) {
+
+ # lemma
+ if (($_->{-name} eq 'lemma') && $_->{'#text'}) {
+ $lemma = $_->{'#text'};
+ }
+ elsif ($_->{-name} eq 'composition' && $_->{'#text'} eq 'true') {
+ $composition = 1;
+ }
+ elsif ($_->{-name} eq 'derivation' && $_->{'#text'} eq 'true') {
+ $derivation = 1;
+ };
+ };
+
+ $mtt->add(  # term layout: 'glemm/l:' . composition marker . derivation marker . lemma
+ term => 'glemm/l:' .
+ ($composition ? '+' : '_') .  # '+' = flag set, '_' = flag not set
+ ($derivation ? '+' : '_') .
+ $lemma
+ ) if $lemma;  # interpretations without a true-valued lemma add no term
+ };
+ }) or return;
+
+ return 1;
+};
+
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['glemm/l=tokens'];
+};
+
+1;
diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
index df8c95a..035bc37 100644
--- a/lib/KorAP/Index/Mate/Morpho.pm
+++ b/lib/KorAP/Index/Mate/Morpho.pm
@@ -13,6 +13,69 @@
my $content = $token->hash->{fs}->{f};
+ my ($found, $pos, $msd, $id);
+
+ my $capital = 0;
+
+ foreach my $f (@{$content->{fs}->{f}}) {
+ #pos
+ if (($f->{-name} eq 'pos') && ($found = $f->{'#text'})) {
+ $pos = $found;
+ }
+
+ # lemma
+ elsif (($f->{-name} eq 'lemma')
+ && ($found = $f->{'#text'})
+ && $found ne '--') {
+ $mtt->add(term => 'mate/l:' . $found);
+ }
+
+ # MSD
+ elsif (($f->{-name} eq 'msd') &&
+ ($found = $f->{'#text'}) &&
+ ($found ne '_')) {
+ $msd = $found;
+ $id = $mtt->id_counter;
+ };
+ };
+
+ $mtt->add(term => 'mate/m:' . $pos . ($id ? ('$<s>' . $id) : ''));
+
+ # MSD
+ if ($msd) {
+ foreach (split '\|', $msd) {
+ my ($x, $y) = split "=", $_;
+ # case, tense, number, mood, person, degree, gender
+ $mtt->add(term => '@:' . $x . ($y ? '=' . $y : '') . '$<s>' . $id);
+ };
+ };
+ }) or return;
+
+ return 1;
+};
+
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['mate/l=tokens', 'mate/m=tokens']
+};
+
+1;
+
+
+__END__
+
+
+sub parse {
+ my $self = shift;
+
+ $$self->add_tokendata(
+ foundry => 'mate',
+ layer => 'morpho',
+ cb => sub {
+ my ($stream, $token) = @_;
+ my $mtt = $stream->pos($token->pos);
+
+ my $content = $token->hash->{fs}->{f};
+
my $found;
my $capital = 0;
@@ -50,7 +113,7 @@
};
sub layer_info {
- ['mate/l=lemma', 'mate/p=pos', 'mate/m=msd']
+ ['mate/l=tokens', 'mate/p=tokens', 'mate/m=tokens']
}
1;
diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
index 7ebdd96..c72b4c2 100644
--- a/lib/KorAP/Index/OpenNLP/Morpho.pm
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm
@@ -28,7 +28,7 @@
};
sub layer_info {
- ['opennlp/p=pos'];
+ ['opennlp/p=tokens'];
};
1;
diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index fd0c9d3..8710763 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm
@@ -26,4 +26,8 @@
return 1;
};
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['opennlp/s=spans'];
+};
+
1;
diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
index d83267a..81766c7 100644
--- a/lib/KorAP/Index/TreeTagger/Morpho.pm
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm
@@ -1,5 +1,6 @@
package KorAP::Index::TreeTagger::Morpho;
use KorAP::Index::Base;
+use POSIX 'floor';
sub parse {
my $self = shift;
@@ -19,22 +20,34 @@
foreach my $fs (@$content) {
$content = $fs->{fs}->{f};
- foreach (@$content) {
+      my @val;                # all features except 'certainty'
+      my $certainty = '';     # term suffix; stays '' unless a non-zero certainty is found
+      foreach (@$content) {
+        if ($_->{-name} eq 'certainty') {
+          $certainty = floor((($_->{'#text'} // 0) * 255));  # scale by 255, drop the fraction; missing text counts as 0
+          $certainty = $certainty ? '$<b>' . $certainty : '';  # a zero result must not leak into the terms as a bare "0"
+        }
+        else {
+          push @val, $_
+        };
+      };
+
+ foreach (@val) {
# lemma
if (($_->{-name} eq 'lemma') &&
($found = $_->{'#text'}) &&
($found ne 'UNKNOWN') &&
($found ne '?')) {
$mtt->add(
- term => 'tt/l:' . $found
+ term => 'tt/l:' . $found . $certainty
);
};
# pos
if (($_->{-name} eq 'ctag') && ($found = $_->{'#text'})) {
$mtt->add(
- term => 'tt/p:' . $found
+ term => 'tt/p:' . $found . $certainty
);
};
};
@@ -45,7 +58,7 @@
};
sub layer_info {
- ['tt/p=pos', 'tt/l=lemma']
+ ['tt/p=tokens', 'tt/l=tokens']
};
1;
diff --git a/lib/KorAP/Index/TreeTagger/Sentences.pm b/lib/KorAP/Index/TreeTagger/Sentences.pm
index d96d96e..06669ea 100644
--- a/lib/KorAP/Index/TreeTagger/Sentences.pm
+++ b/lib/KorAP/Index/TreeTagger/Sentences.pm
@@ -26,4 +26,9 @@
return 1;
};
+sub layer_info {  # array ref with this module's layer descriptor string(s)
+  return ['tt/s=spans'];
+};
+
+
1;
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 5acae26..c242b1e 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -445,26 +445,45 @@
sub to_data {
my $self = shift;
my $primary = defined $_[0] ? $_[0] : 1;
+ my $legacy = defined $_[1] ? $_[1] : 0;
+
my %data = %{$self->doc->to_hash};
-
my @fields;
- push(@fields, { primaryData => $self->doc->primary->data }) if $primary;
- push(@fields, {
- name => $self->name,
- data => $self->stream->to_array,
- tokenization => lc($self->foundry) . '#' . lc($self->layer),
- foundries => $self->support,
- layerInfo => $self->layer_info
- });
+ if ($legacy) {
+ push(@fields, { primaryData => $self->doc->primary->data }) if $primary;
- $data{fields} = \@fields;
+ push(@fields, {
+ name => $self->name,
+ data => $self->stream->to_array,
+ tokenization => lc($self->foundry) . '#' . lc($self->layer),
+ foundries => $self->support,
+ layerInfo => $self->layer_info
+ });
+
+ $data{fields} = \@fields;
+ }
+
+ else {
+ $data{primaryData} = $self->doc->primary->data if $primary;
+ $data{tokenName} = $self->name;
+ $data{data} = $self->stream->to_array;
+ $data{tokenSource} = lc($self->foundry) . '#' . lc($self->layer);
+ $data{foundries} = $self->support;
+ $data{layerInfos} = $self->layer_info;
+ $data{version} = '0.02';
+ };
+
\%data;
};
+sub to_json_legacy {  # JSON-encode to_data() with the legacy flag set to 1
+  return encode_json(shift->to_data(shift, 1));
+};
+
sub to_json {
- encode_json($_[0]->to_data($_[1]));
+ encode_json($_[0]->to_data($_[1], 0));
};
@@ -566,6 +585,25 @@
Start the tokenization process.
+=head2 to_json_legacy
+
+ print $tokens->to_json_legacy;
+ print $tokens->to_json_legacy(1);
+
+Return the token data in legacy JSON format.
+An optional boolean parameter indicates
+whether primary data should be included (it is by default).
+
+=head2 to_json
+
+ print $tokens->to_json;
+ print $tokens->to_json(1);
+
+Return the token data in JSON format.
+An optional boolean parameter indicates
+whether primary data should be included (it is by default).
+
+
=head2 add_subtokens
$tokens->split_tokens;
diff --git a/lib/KorAP/Tokenizer/Token.pm b/lib/KorAP/Tokenizer/Token.pm
index 5dcdb1d..a2ac486 100644
--- a/lib/KorAP/Tokenizer/Token.pm
+++ b/lib/KorAP/Tokenizer/Token.pm
@@ -7,6 +7,7 @@
bless [], shift;
};
+# Get or set token position
sub pos {
if (defined $_[1]) {
$_[0]->[0] = $_[1];
@@ -15,6 +16,7 @@
};
+# Get or set token content
sub content {
if (defined $_[1]) {
$_[0]->[1] = $_[1];
@@ -24,6 +26,8 @@
};
};
+
+# Get or set token id
sub id {
if ($_[1]) {
$_[0]->[2] = $_[1];
@@ -33,6 +37,7 @@
};
};
+
sub dom {
if ($_[0]->[3]) {
return $_[0]->[3];
diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
index e3336a3..3b5355b 100644
--- a/lib/KorAP/Tokenizer/Tokens.pm
+++ b/lib/KorAP/Tokenizer/Tokens.pm
@@ -13,9 +13,23 @@
sub parse {
my $self = shift;
+
my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
- return unless -e $path;
+ # Legacy data support
+ unless (-e $path) {
+ if ($self->layer eq 'namedentities') {
+ $path = $self->path . $self->foundry . '/ne_combined.xml';
+ return unless -e $path;
+ }
+ elsif ($self->layer eq 'morpho' && $self->foundry eq 'glemm') {
+ $path = $self->path . $self->foundry . '/glemm.xml';
+ return unless -e $path;
+ }
+ else {
+ return;
+ };
+ };
my $file = b($path)->slurp;