w2v-server: new param: dedupe. Removes all words that contain the target word, or that are contained in the target word, from similars and collocators.

commit: d91212fc4200520eae750c37a840448f11780fcc [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Nov 13 10:05:09 2017 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Mon Nov 13 10:05:09 2017 +0100
tree: dff2dd0fe57336a970075b171616bbbef04df043
parent: e28409265b81664231b0d95cf3c4ca4861a5597f [diff]
diff --git a/w2v-server.pl b/w2v-server.pl
index 1173a18..a0e9e49 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl

@@ -93,6 +93,7 @@
   my $sort=$c->param('sort') || 0;
   my $json=$c->param('json') || 0;
   my $cutoff=$c->param('cutoff') || 1000000;
+  my $dedupe=$c->param('dedupe') || 0;
   my $res;
 	my @lists;
 	my @collocations;
@@ -100,15 +101,15 @@
 		$c->inactivity_timeout(300);
 		$word =~ s/\s+/ /g;
     for my $w (split(' *\| *', $word)) {
-      if ($cache{$w.$cutoff.$no_nbs.$sort}) {
+      if ($cache{$w.$cutoff.$no_nbs.$sort.$dedupe}) {
         $c->app->log->info("Getting $w results from cache");
-        $res = $cache{$w.$cutoff.$no_nbs.$sort}
+        $res = $cache{$w.$cutoff.$no_nbs.$sort.$dedupe}
       } else {
         $c->app->log->info('Looking for neighbours of '.$w);
         if($opt_i) {
-          $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff);
+          $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe);
         } else {
-          $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff);
+          $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe);
         }
         $cache{$w} = $res;
       }
@@ -119,7 +120,7 @@
   if($json) {
     return $c->render(json => {word => $word, list => \@lists, collocators=>$res->{syntagmatic}});
   } else {
-    $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
+    $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
   }
 };
 
@@ -592,7 +593,7 @@
 }
 
 
-SV *get_neighbours(char *st1, int N, int sort_by, int search_backw, long cutoff) {
+SV *get_neighbours(char *st1, int N, int sort_by, int search_backw, long cutoff, int dedupe) {
   HV *result = newHV();
 	float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
 	long long old_words;
@@ -675,11 +676,23 @@
 		}
 	}
 
+  char *chosen[600];
   AV* array = newAV();
-  int i;
+  int i, j;
   int l1_words=0, l2_words=0;
   for (a = 0, i = 0; i < N && a < 600; a++) {
+    int filtered=0;
     long long c = besti[a];
+    if (dedupe && i > 0) {
+      for (j=0; j<i; j++)
+        if (strcasestr(&vocab[c * max_w], chosen[j]) ||
+            strcasestr(chosen[j], &vocab[c * max_w])) {
+              printf("filtering %s %s\n", chosen[j], &vocab[c * max_w]);
+              filtered = 1;
+        }
+      if(filtered)
+        continue;
+    }
     if(merge_words > 0) {
         if(c >= merge_words) {
             if(l1_words > N / 2)
@@ -698,6 +711,7 @@
  
     HV* hash = newHV();
     SV* word = newSVpvf(&vocab[c * max_w], 0);
+		chosen[i] = &vocab[c * max_w];
     if(latin_enc == 0) SvUTF8_on(word);
     fflush(stdout);
     hv_store(hash, "word", strlen("word"), word , 0);
@@ -780,7 +794,20 @@
       }
     }
     array = newAV();
-    for (a = 0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
+    for (a = 0, i=0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
+      long long c = besti[a];
+      if (dedupe) {
+	  		int filtered=0;
+        for (j=0; j<i; j++)
+          if (strcasestr(&vocab[c * max_w], chosen[j]) ||
+              strcasestr(chosen[j], &vocab[c * max_w])) {
+                printf("filtering %s %s\n", chosen[j], &vocab[c * max_w]);
+								filtered = 1;
+							}
+				if(filtered)
+					continue;
+			}
+			chosen[i++]=&vocab[c * max_w];
       HV* hash = newHV();
       SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
       if(latin_enc == 0) SvUTF8_on(word);
commit	d91212fc4200520eae750c37a840448f11780fcc	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Nov 13 10:05:09 2017 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Nov 13 10:05:09 2017 +0100
tree	dff2dd0fe57336a970075b171616bbbef04df043
parent	e28409265b81664231b0d95cf3c4ca4861a5597f [diff]