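Change the JSON output format for easier parsing

- Document metadata keys are camelCased (e.g. text_class -> textClass, a trailing "id" becomes "ID"); author and text_class are emitted as comma-joined strings.
- Index terms separate foundry and layer with "/" instead of "_" (e.g. cnx_l: -> cnx/l:, mate_d: -> mate/d:).
- The XIP constituency level payload is marked with <b> instead of <s>.
- Tokenizer to_data puts the metadata at the top level instead of under "meta", and the primary data moves into the "fields" list as "primaryData".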
diff --git a/lib/KorAP/Document.pm b/lib/KorAP/Document.pm
index 4a07f8e..200258a 100644
--- a/lib/KorAP/Document.pm
+++ b/lib/KorAP/Document.pm
@@ -160,26 +160,35 @@
return $string;
};
+sub _k {
+ my $x = $_[0];
+ $x =~ s/_(\w)/\U$1\E/g;
+ $x =~ s/id$/ID/gi;
+ return $x;
+};
+
sub to_hash {
my $self = shift;
my %hash;
- foreach (@ATTR, 'author', 'text_class') {
+ foreach (@ATTR) {
if (my $att = $self->$_) {
$att =~ s/\n/ /g;
$att =~ s/\s\s+/ /g;
- $hash{$_} = $att;
+ $hash{_k($_)} = $att;
};
};
+ foreach ('author', 'text_class') {
+ $hash{_k($_)} = join(',', @{ $self->$_ });
+ };
+
return \%hash;
};
-
-
1;
diff --git a/lib/KorAP/Index/Connexor/Morpho.pm b/lib/KorAP/Index/Connexor/Morpho.pm
index 74533bf..be20d33 100644
--- a/lib/KorAP/Index/Connexor/Morpho.pm
+++ b/lib/KorAP/Index/Connexor/Morpho.pm
@@ -24,13 +24,13 @@
if (index($found, "\N{U+00a0}") >= 0) {
foreach (split(/\x{00A0}/, $found)) {
$mtt->add(
- term => 'cnx_l:' . $_
+ term => 'cnx/l:' . $_
);
}
}
else {
$mtt->add(
- term => 'cnx_l:' . $found
+ term => 'cnx/l:' . $found
);
};
}
@@ -38,7 +38,7 @@
# POS
elsif (($f->{-name} eq 'pos') && ($found = $f->{'#text'})) {
$mtt->add(
- term => 'cnx_p:' . $found
+ term => 'cnx/p:' . $found
);
}
@@ -47,7 +47,7 @@
elsif (($f->{-name} eq 'msd') && ($found = $f->{'#text'})) {
foreach (split(':', $found)) {
$mtt->add(
- term => 'cnx_m:' . $_
+ term => 'cnx/m:' . $_
);
};
};
diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
index 60fcc8e..e540169 100644
--- a/lib/KorAP/Index/Connexor/Phrase.pm
+++ b/lib/KorAP/Index/Connexor/Phrase.pm
@@ -19,7 +19,7 @@
if ($type) {
my $mtt = $stream->pos($span->p_start);
$mtt->add(
- term => '<>:cnx_const:' . $type,
+ term => '<>:cnx/const:' . $type,
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end
diff --git a/lib/KorAP/Index/Connexor/Syntax.pm b/lib/KorAP/Index/Connexor/Syntax.pm
index 35f5079..2e58316 100644
--- a/lib/KorAP/Index/Connexor/Syntax.pm
+++ b/lib/KorAP/Index/Connexor/Syntax.pm
@@ -18,7 +18,7 @@
foreach (@$spans) {
if (($_->{-name} eq 'pos') && ($found = $_->{'#text'})) {
$mtt->add(
- term => 'cnx_syn:' . $found
+ term => 'cnx/syn:' . $found
);
};
};
diff --git a/lib/KorAP/Index/CoreNLP/NamedEntities.pm b/lib/KorAP/Index/CoreNLP/NamedEntities.pm
index 8596783..9f0b492 100644
--- a/lib/KorAP/Index/CoreNLP/NamedEntities.pm
+++ b/lib/KorAP/Index/CoreNLP/NamedEntities.pm
@@ -21,7 +21,7 @@
($found->{-name} eq 'ent') &&
($found = $found->{'#text'})) {
$mtt->add(
- term => 'corenlp_' . $model . ':' . $found
+ term => 'corenlp/' . $model . ':' . $found
);
};
}) or return;
diff --git a/lib/KorAP/Index/Mate/Dependency.pm b/lib/KorAP/Index/Mate/Dependency.pm
index d300363..430dff5 100644
--- a/lib/KorAP/Index/Mate/Dependency.pm
+++ b/lib/KorAP/Index/Mate/Dependency.pm
@@ -26,7 +26,7 @@
if ($_->{-type} && $_->{-type} eq 'unary') {
next if $_->{-label} eq '--';
$mtt->add(
- term => 'mate_d:' . $label
+ term => 'mate/d:' . $label
);
}
else {
@@ -37,12 +37,12 @@
my $rel_token = $tokens->token($from, $to) or next;
$mtt->add(
- term => '>:mate_d:' . $label,
+ term => '>:mate/d:' . $label,
payload => '<i>' . $rel_token->pos
);
$stream->pos($rel_token->pos)->add(
- term => '<:mate_d:' . $label,
+ term => '<:mate/d:' . $label,
payload => '<i>' . $token->pos
);
};
diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
index b1a6803..d6d673e 100644
--- a/lib/KorAP/Index/Mate/Morpho.pm
+++ b/lib/KorAP/Index/Mate/Morpho.pm
@@ -22,8 +22,7 @@
# pos
if (($f->{-name} eq 'pos') &&
($found = $f->{'#text'})) {
- $mtt->add(term => 'mate_p:' . $found
- );
+ $mtt->add(term => 'mate/p:' . $found);
}
# lemma
@@ -31,7 +30,7 @@
&& ($found = $f->{'#text'})
&& $found ne '--') {
# b($found)->decode('latin-1')->encode->to_string
- $mtt->add(term => 'mate_l:' . $found);
+ $mtt->add(term => 'mate/l:' . $found);
}
# MSD
@@ -41,7 +40,7 @@
foreach (split '\|', $found) {
my ($x, $y) = split "=", $_;
# case, tense, number, mood, person, degree, gender
- $mtt->add(term => 'mate_m:' . $x . ':' . $y);
+ $mtt->add(term => 'mate/m:' . $x . ':' . $y);
};
};
};
diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
index 20716b0..f03b17f 100644
--- a/lib/KorAP/Index/OpenNLP/Morpho.pm
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm
@@ -19,7 +19,7 @@
# syntax
if (($content->{-name} eq 'pos') && ($content->{'#text'})) {
$mtt->add(
- term => 'opennlp_p:' . $content->{'#text'}
+ term => 'opennlp/p:' . $content->{'#text'}
);
};
}) or return;
diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
index 67994ac..e4de00e 100644
--- a/lib/KorAP/Index/TreeTagger/Morpho.pm
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm
@@ -27,14 +27,14 @@
($found ne 'UNKNOWN') &&
($found ne '?')) {
$mtt->add(
- term => 'tt_l:' . $found
+ term => 'tt/l:' . $found
);
};
# pos
if (($_->{-name} eq 'ctag') && ($found = $_->{'#text'})) {
$mtt->add(
- term => 'tt_p:' . $found
+ term => 'tt/p:' . $found
);
};
};
diff --git a/lib/KorAP/Index/XIP/Constituency.pm b/lib/KorAP/Index/XIP/Constituency.pm
index 61c2a5e..7b70424 100644
--- a/lib/KorAP/Index/XIP/Constituency.pm
+++ b/lib/KorAP/Index/XIP/Constituency.pm
@@ -48,13 +48,13 @@
# $type is now NPA, NP, NUM ...
my %term = (
- term => '<>:xip_const:' . $type,
+ term => '<>:xip/const:' . $type,
o_start => $span->o_start,
o_end => $span->o_end,
p_end => $span->p_end
);
- $term{payload} = '<s>' . $level if $level;
+ $term{payload} = '<b>' . $level if $level;
$mtt->add(%term);
diff --git a/lib/KorAP/Index/XIP/Dependency.pm b/lib/KorAP/Index/XIP/Dependency.pm
index ca889d8..98e121e 100644
--- a/lib/KorAP/Index/XIP/Dependency.pm
+++ b/lib/KorAP/Index/XIP/Dependency.pm
@@ -25,7 +25,7 @@
if ($_->{-type} && $_->{-type} eq 'unary') {
$mtt->add(
- term => 'xip_d:' . $label
+ term => 'xip/d:' . $label
);
}
else {
@@ -37,12 +37,12 @@
# die $token->pos . ' -' . $label . '-> ' . $rel_token->pos;
$mtt->add(
- term => '>:xip_d:' . $label,
+ term => '>:xip/d:' . $label,
payload => '<i>' . $rel_token->pos
);
$stream->pos($rel_token->pos)->add(
- term => '<:xip_d:' . $label,
+ term => '<:xip/d:' . $label,
payload => '<i>' . $token->pos
);
};
diff --git a/lib/KorAP/Index/XIP/Morpho.pm b/lib/KorAP/Index/XIP/Morpho.pm
index d568afe..83b484e 100644
--- a/lib/KorAP/Index/XIP/Morpho.pm
+++ b/lib/KorAP/Index/XIP/Morpho.pm
@@ -21,7 +21,7 @@
# pos
if (($_->{-name} eq 'pos') && ($found = $_->{'#text'})) {
$mtt->add(
- term => 'xip_p:' . $found
+ term => 'xip/p:' . $found
);
$capital = 1 if $found eq 'NOUN';
@@ -42,13 +42,13 @@
foreach (@token) {
$full .= $_;
$_ =~ s{/\w+$}{};
- $mtt->add(term => 'xip_l:' . $_);
+ $mtt->add(term => 'xip/l:' . $_);
};
if (@token > 1) {
$full =~ s{/}{}g;
$full = lc $full;
$full = $capital ? ucfirst($full) : $full;
- $mtt->add(term => 'xip_l:' . $full);
+ $mtt->add(term => 'xip/l:' . $full);
};
};
};
diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 4920215..27705fb 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm
@@ -280,15 +280,19 @@
sub to_data {
my $self = shift;
my $primary = defined $_[0] ? $_[0] : 1;
- my %data;
- $data{meta} = $self->doc->to_hash;
- $data{primary} = $self->doc->primary->data if $primary;
- $data{fields} = [ {
+ my %data = %{$self->doc->to_hash};
+
+ my @fields;
+ push(@fields, { primaryData => $self->doc->primary->data }) if $primary;
+
+ push(@fields, {
name => $self->name,
data => $self->stream->to_array,
tokenization => [lc($self->foundry), lc($self->layer)],
support => $self->support
- }];
+ });
+
+ $data{fields} = \@fields;
\%data;
};