w2v-server: bug-fixes and hacks for merged/comparable vecs

commit: a5f60048de0a93a6524b16a1c0196c14fdc72cf9 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu May 04 10:38:12 2017 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu May 04 10:38:12 2017 +0200
tree: b0f40d22d67cbca663904f1e7b3e7d48e422c3ce
parent: b613b05a41994df38585b38cb9abe680f8876181 [diff]
diff --git a/w2v-server.pl b/w2v-server.pl
index df93317..6ed83a8 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl

@@ -1,7 +1,8 @@
 #!/usr/local/bin/perl
 use Inline C;
-#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
 use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -O4";
+#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
+#use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -Ofast -march k8 -mtune k8 ";
 use Mojolicious::Lite;
 use Mojo::JSON qw(decode_json encode_json to_json);
 use Encode qw(decode encode);
@@ -22,7 +23,7 @@
 my $training_args="";
 my $mergedEnd=0;
 
-getopts('d:Gil:p:m:M:');
+getopts('d:Gil:p:m:n:M:');
 
 if($opt_M) {
   open my $handle, '<:encoding(UTF-8)', $opt_M
@@ -68,6 +69,7 @@
 
 get '/' => sub {
   my $c    = shift;
+  $c->app->log->info("get: ".$c->req->url->to_abs);
 	my $word=$c->param('word');
   my $no_nbs=$c->param('n') || 100;
   my $no_iterations=$c->param('N') || 2000;
@@ -84,7 +86,7 @@
 		$c->inactivity_timeout(300);
 		$word =~ s/\s+/ /g;
     for my $w (split(' *\| *', $word)) {
-			$c->app->log->debug('Looking for neighbours of '.$w);
+			$c->app->log->info('Looking for neighbours of '.$w);
       if($opt_i) {
         $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst);
       } else {
@@ -157,6 +159,7 @@
 char *garbage = NULL;
 
 long long words, size, merged_end;
+long long merge_words = 0;
 int num_threads=20;
 int latin_enc=0;
 int window;
@@ -281,7 +284,8 @@
 	float len;
   float *merge_vecs;
   char *merge_vocab;
-  long long merge_words, merge_size;
+ /*  long long merge_words, merge_size; */
+  long long merge_size;
 
 	char binvecs_fname[256], binwords_fname[256];
 	strcpy(binwords_fname, file_name);
@@ -568,6 +572,7 @@
 SV *get_neighbours(char *st1, int N, int sort_by, int search_backw) {
   HV *result = newHV();
 	float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
+	long long old_words;
 	long besti[MAX_NEIGHBOURS], bestp[MAX_NEIGHBOURS], a, b, c, d, slice;
 	knn *para_nbs[MAX_THREADS];
 	knn *syn_nbs[MAX_THREADS];
@@ -579,12 +584,16 @@
   
   if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
 	
-	slice = words / para_threads;
   
   wl = getTargetWords(st1, search_backw);
   if(wl->length < 1)
     goto end;
 
+	old_words = words;
+  if(merge_words > 0)
+		words = merge_words * 1.25;  /* HACK */
+	slice = words / para_threads;
+
 	a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
   for(a = 0; a < words; a++)
     target_sums[a] = 0;
@@ -625,7 +634,7 @@
 
 	for(a=1; a < para_threads; a++) {
 		for(b=0; b < para_nbs[a]->length && para_nbs[a]->index[b] >= 0; b++) {
-			for(c=0; c < N; c++) {
+			for(c=0; c < N * para_threads; c++) {
 				if(para_nbs[a]->dist[b] > bestd[c]) {
 					for(d=N-1; d>c; d--) {
 						bestd[d] = bestd[d-1];
@@ -640,10 +649,30 @@
 	}
 
   AV* array = newAV();
-  for (a = 0; a < N; a++) {
+  int i;
+  int l1_words=0, l2_words=0;
+  for (a = 0, i = 0; i < N && a < 600; a++) {
+    long long c = besti[a];
+    if(merge_words > 0) {
+        if(c >= merge_words) {
+            if(l1_words > N / 2)
+                continue;
+            else
+                l1_words++;
+        } else {
+            if(l2_words > N / 2)
+                continue;
+            else
+                l2_words++;
+        }
+    }
+    fflush(stdout);
+    printf("%s l1:%d l2:%d i:%d a:%ld\n", &vocab[c * max_w], l1_words, l2_words, i, a);
+ 
     HV* hash = newHV();
-    SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
+    SV* word = newSVpvf(&vocab[c * max_w], 0);
     if(latin_enc == 0) SvUTF8_on(word);
+    fflush(stdout);
     hv_store(hash, "word", strlen("word"), word , 0);
     hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
     hv_store(hash, "rank", strlen("rank"), newSVuv(besti[a]), 0);
@@ -653,6 +682,7 @@
     }
     hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV*)vector), 0);
     av_push(array, newRV_noinc((SV*)hash));
+    i++;
   }
   hv_store(result, "paradigmatic", strlen("paradigmatic"), newRV_noinc((SV*)array), 0);
   
@@ -737,6 +767,7 @@
     hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
   }
 end:
+	words = old_words;
 	return newRV_noinc((SV*)result);
 }
commit	a5f60048de0a93a6524b16a1c0196c14fdc72cf9	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu May 04 10:38:12 2017 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu May 04 10:38:12 2017 +0200
tree	b0f40d22d67cbca663904f1e7b3e7d48e422c3ce
parent	b613b05a41994df38585b38cb9abe680f8876181 [diff]