Fix script for new index (including new foundries)

commit: 32e30f07a3ddd06b4af06d74190de0699bfab5b8 [log] [tgz]
author: Nils Diewald <nils@diewald-online.de> Thu Oct 30 00:52:36 2014 +0000
committer: Nils Diewald <nils@diewald-online.de> Thu Oct 30 00:52:36 2014 +0000
tree: 020651e696992a7cb82bdbd4e0c8b83992d629b8
parent: 840c92497503e9f5cf6066d7c3d3bd1b46bd04c1 [diff]
diff --git a/lib/KorAP/Field/MultiTermToken.pm b/lib/KorAP/Field/MultiTermToken.pm
index 5db3851..0d8742b 100644
--- a/lib/KorAP/Field/MultiTermToken.pm
+++ b/lib/KorAP/Field/MultiTermToken.pm

@@ -24,6 +24,12 @@
   return $mt;
 };
 
+# Return a new term id
+sub id_counter {
+  $_[0]->{id_counter} //= 1;
+  return $_[0]->{id_counter}++;
+};
+
 
 sub surface {
   substr($_[0]->{mt}->[0]->term,2);
@@ -47,6 +53,7 @@
   [uniq(map($_->to_string, @{$self->{mt}}))];
 };
 
+
 sub to_solr {
   my $self = shift;
   my @array = map { $_->to_solr(0) } @{$self->{mt}};
@@ -54,6 +61,7 @@
   return \@array;
 };
 
+
 1;
 
 

diff --git a/lib/KorAP/Index/Base/Paragraphs.pm b/lib/KorAP/Index/Base/Paragraphs.pm
index f0b6e6d..105bd59 100644
--- a/lib/KorAP/Index/Base/Paragraphs.pm
+++ b/lib/KorAP/Index/Base/Paragraphs.pm

@@ -25,6 +25,10 @@
   return 1;
 };
 
+sub layer_info {
+    ['base/s=spans'];
+};
+
 
 
 1;

diff --git a/lib/KorAP/Index/Base/Sentences.pm b/lib/KorAP/Index/Base/Sentences.pm
index a9fcd25..f1ca6f9 100644
--- a/lib/KorAP/Index/Base/Sentences.pm
+++ b/lib/KorAP/Index/Base/Sentences.pm

@@ -39,4 +39,8 @@
   return 1;
 };
 
+sub layer_info {
+    ['base/s=spans'];
+};
+
 1;

diff --git a/lib/KorAP/Index/Connexor/Morpho.pm b/lib/KorAP/Index/Connexor/Morpho.pm
index 27fd7ee..a6970f5 100644
--- a/lib/KorAP/Index/Connexor/Morpho.pm
+++ b/lib/KorAP/Index/Connexor/Morpho.pm

@@ -1,6 +1,24 @@
 package KorAP::Index::Connexor::Morpho;
 use KorAP::Index::Base;
 
+our %MAP = (
+  'v_ind'   => 'mood',
+  'v_imp'   => 'mood',
+  'v_sub'   => 'mood',
+  'v_fin'   => 'inf',
+  'v_pcp'   => 'inf',
+  'v_pres'  => 'tense',
+  'v_past'  => 'tense',
+  'v_prog'  => 'tense',
+  'v_perf'  => 'tense',
+  'n_abbr'  => 'type',
+  'n_prop'  => 'type',
+  'n_pl'    => 'type',
+  'a_cmp'   => 'degree',
+  'a_sub'   => 'degree',
+  'num_ord' => 'type'
+);
+
 sub parse {
   my $self = shift;
 
@@ -43,7 +61,8 @@
 
 	}
 	# MSD
-	# This could follow http://www.ids-mannheim.de/cosmas2/projekt/referenz/connexor/morph.html
+	# This could follow
+	# http://www.ids-mannheim.de/cosmas2/projekt/referenz/connexor/morph.html
 	elsif (($f->{-name} eq 'msd') && ($found = $f->{'#text'})) {
 	  foreach (split(':', $found)) {
 	    $mtt->add(
@@ -59,7 +78,7 @@
 };
 
 sub layer_info {
-    ['cnx/l=lemma', 'cnx/p=pos', 'cnx/m=msd'];
+    ['cnx/l=tokens', 'cnx/p=tokens', 'cnx/m=tokens'];
 };
 
 

diff --git a/lib/KorAP/Index/Connexor/Phrase.pm b/lib/KorAP/Index/Connexor/Phrase.pm
index f6dbe99..ed36de3 100644
--- a/lib/KorAP/Index/Connexor/Phrase.pm
+++ b/lib/KorAP/Index/Connexor/Phrase.pm

@@ -32,7 +32,7 @@
 };
 
 sub layer_info {
-    ['cnx/c=const'];
+    ['cnx/c=spans'];
 };
 
 

diff --git a/lib/KorAP/Index/Connexor/Sentences.pm b/lib/KorAP/Index/Connexor/Sentences.pm
index 04cee09..ac6f89f 100644
--- a/lib/KorAP/Index/Connexor/Sentences.pm
+++ b/lib/KorAP/Index/Connexor/Sentences.pm

@@ -26,4 +26,9 @@
   return 1;
 };
 
+
+sub layer_info {
+    ['cnx/s=spans'];
+};
+
 1;

diff --git a/lib/KorAP/Index/Connexor/Syntax.pm b/lib/KorAP/Index/Connexor/Syntax.pm
index 2e58316..c65c4f5 100644
--- a/lib/KorAP/Index/Connexor/Syntax.pm
+++ b/lib/KorAP/Index/Connexor/Syntax.pm

@@ -27,5 +27,8 @@
   return 1;
 };
 
+sub layer_info {
+    ['cnx/syn=tokens'];
+};
 
 1;

diff --git a/lib/KorAP/Index/CoreNLP/Morpho.pm b/lib/KorAP/Index/CoreNLP/Morpho.pm
index 5704cdc..2d3491d 100644
--- a/lib/KorAP/Index/CoreNLP/Morpho.pm
+++ b/lib/KorAP/Index/CoreNLP/Morpho.pm

@@ -28,7 +28,7 @@
 };
 
 sub layer_info {
-    ['corenlp/p=pos'];
+  ['corenlp/p=tokens'];
 };
 
 1;

diff --git a/lib/KorAP/Index/CoreNLP/NamedEntities.pm b/lib/KorAP/Index/CoreNLP/NamedEntities.pm
index 9f0b492..942b5b6 100644
--- a/lib/KorAP/Index/CoreNLP/NamedEntities.pm
+++ b/lib/KorAP/Index/CoreNLP/NamedEntities.pm

@@ -2,12 +2,12 @@
 use KorAP::Index::Base;
 
 sub parse {
-  my $self = shift;
-  my $model = shift;
+  my $self   = shift;
+  my $model  = shift;
 
   $$self->add_tokendata(
     foundry => 'corenlp',
-    layer => $model,
+    layer => $model // lc('NamedEntities'),
     cb => sub {
       my ($stream, $token) = @_;
       my $mtt = $stream->pos($token->pos);
@@ -21,7 +21,7 @@
 		($found->{-name} eq 'ent') &&
 		  ($found = $found->{'#text'})) {
 	$mtt->add(
-	  term => 'corenlp/' . $model . ':' . $found
+	  term => 'corenlp/ne:' . $found
 	);
       };
     }) or return;
@@ -29,4 +29,8 @@
   return 1;
 };
 
+sub layer_info {
+    ['corenlp/ne=tokens'];
+};
+
 1;

diff --git a/lib/KorAP/Index/CoreNLP/Sentences.pm b/lib/KorAP/Index/CoreNLP/Sentences.pm
index 1bd84e0..0f40213 100644
--- a/lib/KorAP/Index/CoreNLP/Sentences.pm
+++ b/lib/KorAP/Index/CoreNLP/Sentences.pm

@@ -26,4 +26,9 @@
   return 1;
 };
 
+
+sub layer_info {
+    ['corenlp/s=spans'];
+};
+
 1;

diff --git a/lib/KorAP/Index/Glemm/Morpho.pm b/lib/KorAP/Index/Glemm/Morpho.pm
new file mode 100644
index 0000000..b0bc589
--- /dev/null
+++ b/lib/KorAP/Index/Glemm/Morpho.pm

@@ -0,0 +1,56 @@
+package KorAP::Index::Glemm::Morpho;
+use KorAP::Index::Base;
+
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'glemm',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f} or return;
+
+      # All interpretations
+      foreach (ref $content eq 'ARRAY' ? @$content : $content) {
+
+	# All features
+	$content = $_->{fs}->{f};
+
+	my $lemma;
+	my ($composition, $derivation) = (0,0);
+
+	# Iterate over
+	foreach (ref $content eq 'ARRAY' ? @$content : $content) {
+
+	  # syntax
+	  if (($_->{-name} eq 'lemma') && $_->{'#text'}) {
+	    $lemma = $_->{'#text'};
+	  }
+	  elsif ($_->{-name} eq 'composition' && $_->{'#text'} eq 'true') {
+	    $composition = 1;
+	  }
+	  elsif ($_->{-name} eq 'derivation' && $_->{'#text'} eq 'true') {
+	    $derivation = 1;
+	  };
+	};
+
+	$mtt->add(
+	  term => 'glemm/l:' .
+	    ($composition ? '+' : '_') .
+	      ($derivation ? '+' : '_') .
+		$lemma
+	) if $lemma;
+      };
+    }) or return;
+
+  return 1;
+};
+
+sub layer_info {
+    ['glemm/l=tokens'];
+};
+
+1;

diff --git a/lib/KorAP/Index/Mate/Morpho.pm b/lib/KorAP/Index/Mate/Morpho.pm
index df8c95a..035bc37 100644
--- a/lib/KorAP/Index/Mate/Morpho.pm
+++ b/lib/KorAP/Index/Mate/Morpho.pm

@@ -13,6 +13,69 @@
 
       my $content = $token->hash->{fs}->{f};
 
+      my ($found, $pos, $msd, $id);
+
+      my $capital = 0;
+
+      foreach my $f (@{$content->{fs}->{f}}) {
+	#pos
+	if (($f->{-name} eq 'pos') && ($found = $f->{'#text'})) {
+	  $pos = $found;
+	}
+
+	# lemma
+	elsif (($f->{-name} eq 'lemma')
+		 && ($found = $f->{'#text'})
+		   && $found ne '--') {
+	  $mtt->add(term => 'mate/l:' . $found);
+	}
+
+	# MSD
+	elsif (($f->{-name} eq 'msd') &&
+		 ($found = $f->{'#text'}) &&
+		   ($found ne '_')) {
+	  $msd = $found;
+	  $id = $mtt->id_counter;
+	};
+      };
+
+      $mtt->add(term => 'mate/m:' . $pos . ($id ? ('$<s>' . $id) : ''));
+
+      # MSD
+      if ($msd) {
+	foreach (split '\|', $msd) {
+	  my ($x, $y) = split "=", $_;
+	  # case, tense, number, mood, person, degree, gender
+	  $mtt->add(term => '@:' . $x . ($y ? '=' . $y : '') . '$<s>' . $id);
+	};
+      };
+    }) or return;
+
+  return 1;
+};
+
+sub layer_info {
+    ['mate/l=tokens', 'mate/m=tokens']
+};
+
+1;
+
+
+__END__
+
+
+sub parse {
+  my $self = shift;
+
+  $$self->add_tokendata(
+    foundry => 'mate',
+    layer => 'morpho',
+    cb => sub {
+      my ($stream, $token) = @_;
+      my $mtt = $stream->pos($token->pos);
+
+      my $content = $token->hash->{fs}->{f};
+
       my $found;
 
       my $capital = 0;
@@ -50,7 +113,7 @@
 };
 
 sub layer_info {
-    ['mate/l=lemma', 'mate/p=pos', 'mate/m=msd']
+    ['mate/l=tokens', 'mate/p=tokens', 'mate/m=tokens']
 }
 
 1;

diff --git a/lib/KorAP/Index/OpenNLP/Morpho.pm b/lib/KorAP/Index/OpenNLP/Morpho.pm
index 7ebdd96..c72b4c2 100644
--- a/lib/KorAP/Index/OpenNLP/Morpho.pm
+++ b/lib/KorAP/Index/OpenNLP/Morpho.pm

@@ -28,7 +28,7 @@
 };
 
 sub layer_info {
-    ['opennlp/p=pos'];
+    ['opennlp/p=tokens'];
 };
 
 1;

diff --git a/lib/KorAP/Index/OpenNLP/Sentences.pm b/lib/KorAP/Index/OpenNLP/Sentences.pm
index fd0c9d3..8710763 100644
--- a/lib/KorAP/Index/OpenNLP/Sentences.pm
+++ b/lib/KorAP/Index/OpenNLP/Sentences.pm

@@ -26,4 +26,8 @@
   return 1;
 };
 
+sub layer_info {
+    ['opennlp/s=spans'];
+};
+
 1;

diff --git a/lib/KorAP/Index/TreeTagger/Morpho.pm b/lib/KorAP/Index/TreeTagger/Morpho.pm
index d83267a..81766c7 100644
--- a/lib/KorAP/Index/TreeTagger/Morpho.pm
+++ b/lib/KorAP/Index/TreeTagger/Morpho.pm

@@ -1,5 +1,6 @@
 package KorAP::Index::TreeTagger::Morpho;
 use KorAP::Index::Base;
+use POSIX 'floor';
 
 sub parse {
   my $self = shift;
@@ -19,22 +20,34 @@
 
       foreach my $fs (@$content) {
 	$content = $fs->{fs}->{f};
-	foreach (@$content) {
 
+	my @val;
+	my $certainty = '';
+	foreach (@$content) {
+	  if ($_->{-name} eq 'certainty') {
+	    $certainty = floor(($_->{'#text'} * 255));
+	    $certainty = '$<b>' . $certainty if $certainty;
+	  }
+	  else {
+	    push @val, $_
+	  };
+	};
+
+	foreach (@val) {
 	  # lemma
 	  if (($_->{-name} eq 'lemma') &&
 		($found = $_->{'#text'}) &&
 		  ($found ne 'UNKNOWN') &&
 		    ($found ne '?')) {
 	    $mtt->add(
-	      term => 'tt/l:' . $found
+	      term => 'tt/l:' . $found . $certainty
 	    );
 	  };
 
 	  # pos
 	  if (($_->{-name} eq 'ctag') && ($found = $_->{'#text'})) {
 	    $mtt->add(
-	      term => 'tt/p:' . $found
+	      term => 'tt/p:' . $found . $certainty
 	    );
 	  };
 	};
@@ -45,7 +58,7 @@
 };
 
 sub layer_info {
-    ['tt/p=pos', 'tt/l=lemma']
+    ['tt/p=tokens', 'tt/l=tokens']
 };
 
 1;

diff --git a/lib/KorAP/Index/TreeTagger/Sentences.pm b/lib/KorAP/Index/TreeTagger/Sentences.pm
index d96d96e..06669ea 100644
--- a/lib/KorAP/Index/TreeTagger/Sentences.pm
+++ b/lib/KorAP/Index/TreeTagger/Sentences.pm

@@ -26,4 +26,9 @@
   return 1;
 };
 
+sub layer_info {
+    ['tt/s=spans'];
+};
+
+
 1;

diff --git a/lib/KorAP/Tokenizer.pm b/lib/KorAP/Tokenizer.pm
index 5acae26..c242b1e 100644
--- a/lib/KorAP/Tokenizer.pm
+++ b/lib/KorAP/Tokenizer.pm

@@ -445,26 +445,45 @@
 sub to_data {
   my $self = shift;
   my $primary = defined $_[0] ? $_[0] : 1;
+  my $legacy =  defined $_[1] ? $_[1] : 0;
+
   my %data = %{$self->doc->to_hash};
-
   my @fields;
-  push(@fields, { primaryData => $self->doc->primary->data }) if $primary;
 
-  push(@fields, {
-    name => $self->name,
-    data => $self->stream->to_array,
-    tokenization => lc($self->foundry) . '#' . lc($self->layer),
-    foundries => $self->support,
-    layerInfo => $self->layer_info
-  });
+  if ($legacy) {
+    push(@fields, { primaryData => $self->doc->primary->data }) if $primary;
 
-  $data{fields} = \@fields;
+    push(@fields, {
+      name => $self->name,
+      data => $self->stream->to_array,
+      tokenization => lc($self->foundry) . '#' . lc($self->layer),
+      foundries => $self->support,
+      layerInfo => $self->layer_info
+    });
+
+    $data{fields} = \@fields;
+  }
+
+  else {
+    $data{primaryData} = $self->doc->primary->data if $primary;
+    $data{tokenName}   = $self->name;
+    $data{data}        = $self->stream->to_array;
+    $data{tokenSource} = lc($self->foundry) . '#' . lc($self->layer);
+    $data{foundries}   = $self->support;
+    $data{layerInfos}  = $self->layer_info;
+    $data{version}     = '0.02';
+  };
+
   \%data;
 };
 
 
+sub to_json_legacy {
+  encode_json($_[0]->to_data($_[1], 1));
+};
+
 sub to_json {
-  encode_json($_[0]->to_data($_[1]));
+  encode_json($_[0]->to_data($_[1], 0));
 };
 
 
@@ -566,6 +585,25 @@
 Start the tokenization process.
 
 
+=head2 to_json_legacy
+
+  print $tokens->to_json_legacy;
+  print $tokens->to_json_legacy(1);
+
+Return the token data in legacy JSON format.
+An optional boolean parameter indicates whether
+primary data should be included (defaults to true).
+
+=head2 to_json
+
+  print $tokens->to_json;
+  print $tokens->to_json(1);
+
+Return the token data in JSON format.
+An optional boolean parameter indicates whether
+primary data should be included (defaults to true).
+
+
 =head2 add_subtokens
 
   $tokens->split_tokens;

diff --git a/lib/KorAP/Tokenizer/Token.pm b/lib/KorAP/Tokenizer/Token.pm
index 5dcdb1d..a2ac486 100644
--- a/lib/KorAP/Tokenizer/Token.pm
+++ b/lib/KorAP/Tokenizer/Token.pm

@@ -7,6 +7,7 @@
   bless [], shift;
 };
 
+# Get or set token position
 sub pos {
   if (defined $_[1]) {
     $_[0]->[0] = $_[1];
@@ -15,6 +16,7 @@
 };
 
 
+# Get or set token content
 sub content {
   if (defined $_[1]) {
     $_[0]->[1] = $_[1];
@@ -24,6 +26,8 @@
   };
 };
 
+
+# Get or set token id
 sub id {
   if ($_[1]) {
     $_[0]->[2] = $_[1];
@@ -33,6 +37,7 @@
   };
 };
 
+
 sub dom {
   if ($_[0]->[3]) {
     return $_[0]->[3];

diff --git a/lib/KorAP/Tokenizer/Tokens.pm b/lib/KorAP/Tokenizer/Tokens.pm
index e3336a3..3b5355b 100644
--- a/lib/KorAP/Tokenizer/Tokens.pm
+++ b/lib/KorAP/Tokenizer/Tokens.pm

@@ -13,9 +13,23 @@
 
 sub parse {
   my $self = shift;
+
   my $path = $self->path . $self->foundry . '/' . $self->layer . '.xml';
 
-  return unless -e $path;
+  # Legacy data support
+  unless (-e $path) {
+    if ($self->layer eq 'namedentities') {
+      $path = $self->path . $self->foundry . '/ne_combined.xml';
+      return unless -e $path;
+    }
+    elsif ($self->layer eq 'morpho' && $self->foundry eq 'glemm') {
+      $path = $self->path . $self->foundry . '/glemm.xml';
+      return unless -e $path;
+    }
+    else {
+      return;
+    };
+  };
 
   my $file = b($path)->slurp;
commit	32e30f07a3ddd06b4af06d74190de0699bfab5b8	[log] [tgz]
author	Nils Diewald <nils@diewald-online.de>	Thu Oct 30 00:52:36 2014 +0000
committer	Nils Diewald <nils@diewald-online.de>	Thu Oct 30 00:52:36 2014 +0000
tree	020651e696992a7cb82bdbd4e0c8b83992d629b8
parent	840c92497503e9f5cf6066d7c3d3bd1b46bd04c1 [diff]