w2v-server: add cutoff parameter (consider only the n most frequent forms)
WARNING: this might break mapped multi-lingual vector-spaces
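
Example: the new parameter travels as a plain GET query parameter, so the
restricted search can be tried directly against a running instance. Below is
a minimal sketch with Mojo::UserAgent; the host/port, the route, the example
word and the dump of the result structure are assumptions for illustration,
not part of this patch:

    #!/usr/bin/perl
    use strict;
    use warnings;
    use Mojo::UserAgent;
    use Data::Dumper;

    # Ask for the 10 nearest neighbours of "Haus", but only among the
    # 100000 most frequent word forms; json=1 selects the JSON rendering.
    my $ua  = Mojo::UserAgent->new;
    my $url = 'http://localhost:3000/?word=Haus&n=10&cutoff=100000&json=1';
    my $res = $ua->get($url)->result;
    die 'request failed: ' . $res->code unless $res->is_success;

    my $data = $res->json;
    print Dumper($data->{list});         # nearest-neighbour lists
    print Dumper($data->{collocators});  # syntagmatic collocators

The cutoff works as an index limit: word2vec vocabularies are stored sorted
by descending frequency, so searching only the first cutoff entries amounts
to searching the cutoff most frequent forms. The warning above presumably
follows from this: a merged, mapped multi-lingual vocabulary is no longer a
single frequency-ordered list, so an index cutoff can cut into the mapped
part of the space.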
diff --git a/templates/index.html.ep b/templates/index.html.ep
index 7270997..d42a282 100644
--- a/templates/index.html.ep
+++ b/templates/index.html.ep
@@ -310,11 +310,13 @@
<form method="GET">
word(s):
<input id="word" type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
+ cut-off:
+ <input id="cutoff" type="text" name="cutoff" size="10" value="<%= $cutoff %>" title="Only consider the most frequent x word forms.">
% if($mergedEnd > 0) {
backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked, the base vocabulary will be searched first. Otherwise the merged vocabulary will be searched first.">
% }
- max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
- max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
+ max. neighbours: <input type="text" size="4" name="n" value="<%= $no_nbs %>">
+ max. iterations: <input type="text" name="N" size="4" value="<%= $no_iterations %>">
SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
% if($collocators) {
<span> </span>sort collocators by
diff --git a/w2v-server.pl b/w2v-server.pl
index c864774..402a0d5 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -92,6 +92,7 @@
my $searchBaseVocabFirst=$c->param('sbf') || 0;
my $sort=$c->param('sort') || 0;
my $json=$c->param('json') || 0;
+ my $cutoff=$c->param('cutoff') || 1000000;
my $res;
my @lists;
my @collocations;
@@ -105,9 +106,9 @@
} else {
$c->app->log->info('Looking for neighbours of '.$w);
if($opt_i) {
- $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst);
+ $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff);
} else {
- $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst);
+ $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff);
}
$cache{$w} = $res;
}
@@ -118,7 +119,7 @@
if($json) {
return $c->render(json => {word => $word, list => \@lists, collocators=>$res->{syntagmatic}});
} else {
- $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
+ $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
}
};
@@ -165,6 +166,7 @@
} wordlist;
typedef struct {
+ long cutoff;
wordlist *wl;
char *token;
int N;
@@ -388,7 +390,7 @@
if(M2 == NULL || cc == -1)
return NULL;
- a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
+ a = posix_memalign((void **) &target_sums, 128, pars->cutoff * sizeof(float));
besti = malloc(N * sizeof(long long));
bestp = malloc(N * sizeof(long long));
bestf = malloc(N * sizeof(float));
@@ -396,7 +398,7 @@
worstbest = MIN_RESP;
- for (b = 0; b < words; b++)
+ for (b = 0; b < pars->cutoff; b++)
target_sums[b]=0;
for (b = 0; b < N; b++) {
besti[b] = -1;
@@ -418,7 +420,8 @@
window_offset = a * size;
if (a > window)
window_offset -= size;
- for(target = 0; target < words; target ++) {
+ for(target = 0; target < pars->cutoff; target ++) {
+ if(garbage && garbage[target]) continue;
if(target == d)
continue;
f = 0;
@@ -463,7 +466,7 @@
}
}
- for (b = 0; b < words; b++)
+ for (b = 0; b < pars->cutoff; b++)
pars->target_sums[b] += (target_sums[b] / wpos_sum ) / (window * 2);
free(target_sums);
for(b=0; b<N && besti[b] >= 0; b++) // THIS LOOP IS NEEDED (b...)
@@ -588,7 +591,7 @@
}
-SV *get_neighbours(char *st1, int N, int sort_by, int search_backw) {
+SV *get_neighbours(char *st1, int N, int sort_by, int search_backw, long cutoff) {
HV *result = newHV();
float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
long long old_words;
@@ -603,32 +606,36 @@
if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
-
+ if(cutoff < 1)
+ cutoff=words;
+
wl = getTargetWords(st1, search_backw);
if(wl->length < 1)
goto end;
- old_words = words;
+ old_words = cutoff;
if(merge_words > 0)
- words = merge_words * 1.25; /* HACK */
- slice = words / para_threads;
+ cutoff = merge_words * 1.25; /* HACK */
+ slice = cutoff / para_threads;
- a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
- for(a = 0; a < words; a++)
+ a = posix_memalign((void **) &target_sums, 128, cutoff * sizeof(float));
+ for(a = 0; a < cutoff; a++)
target_sums[a] = 0;
printf("Starting %d threads\n", para_threads);
fflush(stdout);
for(a=0; a < para_threads; a++) {
+ pars[a].cutoff = cutoff;
pars[a].token = st1;
pars[a].wl = wl;
pars[a].N = N;
pars[a].from = a*slice;
- pars[a].upto = ((a+1)*slice > words? words:(a+1)*slice);
+ pars[a].upto = ((a+1)*slice > cutoff? cutoff:(a+1)*slice);
pthread_create(&pt[a], NULL, _get_neighbours, (void *) &pars[a]);
}
if(M2) {
for(a=0; a < syn_threads; a++) {
+ pars[a + para_threads].cutoff = cutoff;
pars[a + para_threads].target_sums = target_sums;
pars[a + para_threads].wl = wl;
pars[a + para_threads].N = N;