w2v-server: add basic functionalities for joint vec spaces * new option -d dumps vector file to txt vector file * .txt vector inputs are automatically parsed and converted to mmappable structs * slice size bug for paradigmatic neighbours fixed * fixes for paradigmatic only output

commit: 43ee87e6c0f0f4b8d7a45e22ecd7966e3102e299 [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Mon Apr 25 10:50:08 2016 +0200
committer: Marc Kupietz <kupietz@ids-mannheim.de> Mon Apr 25 10:50:08 2016 +0200
tree: f212316476d43a36a2fea7fcd28616616383cb06
parent: c47b390822625a93893cf01f94715503ee2fbce7 [diff]
diff --git a/w2v-server.pl b/w2v-server.pl
index a44df71..df47adc 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl

@@ -1,5 +1,7 @@
 #!/usr/local/bin/perl
+#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -g";
 use Inline C;
+use Inline C => Config => CLEAN_AFTER_BUILD => 0; #, ccflags => $Config{ccflags}." -g";
 use Mojolicious::Lite;
 use Mojo::JSON qw(decode_json encode_json to_json);
 use Encode qw(decode encode);
@@ -10,11 +12,12 @@
 our $opt_i = 0; # latin1-input?
 our $opt_l = undef;
 our $opt_p = 5676;
-our $opt_n = undef;
+our $opt_n = '';
+our $opt_d;
 
 my $training_args="";
 
-getopt('il:p:n:'); 
+getopt('d:il:p:n:'); 
 
 # -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
 if(!$ARGV[0]) {
@@ -27,6 +30,11 @@
   close(FILE);
 }
 
+if($opt_d) { # -d: dump  vecs and exit
+	dump_vecs($opt_d);
+	exit;
+}
+
 my $daemon = Mojo::Server::Daemon->new(
     app    => app,
     listen => ['http://'.($opt_l ? $opt_l : '*').":$opt_p"]
@@ -112,7 +120,7 @@
   float *target_sums;
 } knnpars;
 
-float *M, *M2, *syn1neg_window, *expTable;
+float *M, *M2=0L, *syn1neg_window, *expTable;
 char *vocab;
 
 long long words, size;
@@ -125,6 +133,7 @@
 	int binwords_fd, binvecs_fd, net_fd, i;
 	long long a, b, c, d, cn;
 	float len;
+  double val;
 
 	char binvecs_fname[256], binwords_fname[256];
 	strcpy(binwords_fname, file_name);
@@ -149,20 +158,40 @@
 			printf("Cannot allocate memory: %lld MB    %lld  %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
 			return -1;
 		}
-		for (b = 0; b < words; b++) {
-			a = 0;
-			while (1) {
-				vocab[b * max_w + a] = fgetc(f);
-				if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
-				if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
-			}
-			vocab[b * max_w + a] = 0;
-			fread(&M[b * size], sizeof(float), size, f);
-			len = 0;
-			for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
-			len = sqrt(len);
-			for (a = 0; a < size; a++) M[a + b * size] /= len;
-		}
+    if(strstr(file_name, ".txt")) {
+      for (b = 0; b < words; b++) {
+        a = 0;
+        while (1) {
+          vocab[b * max_w + a] = fgetc(f);
+          if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
+          if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
+        }
+        vocab[b * max_w + a] = 0;
+        len = 0;
+        for (a = 0; a < size; a++) {
+          fscanf(f, "%lf", &val);
+          M[a + b * size] = val;
+          len += val * val;
+        } 
+        len = sqrt(len);
+        for (a = 0; a < size; a++) M[a + b * size] /= len;
+      }
+    } else {
+      for (b = 0; b < words; b++) {
+        a = 0;
+        while (1) {
+          vocab[b * max_w + a] = fgetc(f);
+          if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
+          if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
+        }
+        vocab[b * max_w + a] = 0;
+        fread(&M[b * size], sizeof(float), size, f);
+        len = 0;
+        for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
+        len = sqrt(len);
+        for (a = 0; a < size; a++) M[a + b * size] /= len;
+      }
+    }
 		if( (binvecs = fopen(binvecs_fname, "wb")) != NULL && (binwords = fopen(binwords_fname, "wb")) != NULL) {
 			fwrite(M, sizeof(float), (long long)words * (long long)size, binvecs);
 			fclose(binvecs);
@@ -185,13 +214,13 @@
 	}
   fclose(f);
 
-  if(net_name) {
+  if(net_name && strlen(net_name) > 0) {
     if( (net_fd = open(net_name, O_RDONLY)) >= 0) {
       window = (lseek(net_fd, 0, SEEK_END) -  sizeof(float) * words * size) / words / size / sizeof(float) / 2;
       //      lseek(net_fd, sizeof(float) * words * size, SEEK_SET);
       // munmap(M,  sizeof(float) * words * size);
       M2 = mmap(0, sizeof(float) * words * size + sizeof(float) * 2 * window * size * words, PROT_READ, MAP_SHARED, net_fd, 0);
-      if (M == MAP_FAILED) {
+      if (M2 == MAP_FAILED) {
         close(net_fd);
         fprintf(stderr, "Cannot mmap %s\n", net_name);
         exit(-1);
@@ -222,7 +251,7 @@
 	float *target_sums, *bestf, *bestn, worstbest, wpos_sum;
 	long long *besti, *bestp;
 
-  if(cc == -1)
+  if(M2 == NULL || cc == -1)
     return NULL;
 
 	a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
@@ -372,7 +401,6 @@
   sep = wl->sep;
 	b = bi[0];
 	c = 0;
-
 	if (b == -1) {
     N = 0;
 	  goto end;
@@ -429,14 +457,13 @@
 	knnpars pars[MAX_THREADS];
   pthread_t *pt = (pthread_t *)malloc((num_threads+1) * sizeof(pthread_t));
   wordlist *wl;
-  int para_threads = num_threads - window * 2;
-  int syn_threads = window * 2;
-  num_threads = para_threads+syn_threads;
+  int syn_threads = (M2? window * 2 : 0);
+  int para_threads = num_threads - syn_threads;
   
   if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
 	
-	slice = words / syn_threads;
-
+	slice = words / para_threads;
+  
   wl = getTargetWords(st1);
   if(wl->length < 1)
     goto end;
@@ -445,6 +472,8 @@
   for(a = 0; a < words; a++)
     target_sums[a] = 0;
 
+  printf("Starting %d threads\n", para_threads);
+  fflush(stdout);
 	for(a=0; a < para_threads; a++) {
 		pars[a].token = st1;
 		pars[a].wl = wl;
@@ -453,13 +482,15 @@
 		pars[a].upto = ((a+1)*slice > words? words:(a+1)*slice);
 		pthread_create(&pt[a], NULL, _get_neighbours, (void *) &pars[a]);
 	}
-	for(a=0; a < syn_threads; a++) {
-    pars[a + para_threads].target_sums = target_sums;
-    pars[a + para_threads].wl = wl;
-    pars[a + para_threads].N = N;
-    pars[a + para_threads].from = a;
-    pars[a + para_threads].upto = a+1;
-    pthread_create(&pt[a + para_threads], NULL, getCollocators, (void *) &pars[a + para_threads]);
+  if(M2) {
+    for(a=0; a < syn_threads; a++) {
+      pars[a + para_threads].target_sums = target_sums;
+      pars[a + para_threads].wl = wl;
+      pars[a + para_threads].N = N;
+      pars[a + para_threads].from = a;
+      pars[a + para_threads].upto = a+1;
+      pthread_create(&pt[a + para_threads], NULL, getCollocators, (void *) &pars[a + para_threads]);
+    }
   }
   printf("Waiting for para threads to join\n");
   fflush(stdout);
@@ -467,8 +498,8 @@
   printf("Para threads joint\n");
   fflush(stdout);
 
-	if(!syn_nbs[0])
-		goto end;
+	/* if(!syn_nbs[0]) */
+	/* 	goto end; */
 
 	for(b=0; b < N; b++) {
 		besti[b] = para_nbs[0]->index[b];
@@ -516,82 +547,102 @@
     bests[b] = 0;
   }
 
-  printf("Waiting for syn threads to join\n");
-  fflush(stdout);
-  for (a = 0; a < syn_threads; a++) pthread_join(pt[a+para_threads], &syn_nbs[a]);
-  printf("syn threads joint\n");
-  fflush(stdout);
-  
+  if (M2) {
+    printf("Waiting for syn threads to join\n");
+    fflush(stdout);
+    for (a = 0; a < syn_threads; a++) pthread_join(pt[a+para_threads], &syn_nbs[a]);
+    printf("syn threads joint\n");
+    fflush(stdout);
 
-  for(b=0; b < syn_nbs[0]->length; b++) {
-    besti[b] = syn_nbs[0]->index[b];
-    bestd[b] = syn_nbs[0]->dist[b];
-    bestn[b] = syn_nbs[0]->norm[b];
-    bestp[b] = syn_nbs[0]->pos[b];
-    bests[b] = target_sums[syn_nbs[0]->index[b]];
-  }
-  
-  if(sort_by != 1) { // sort by responsiveness
-    for(a=1; a < syn_threads; a++) {
-      for(b=0; b < syn_nbs[a]->length; b++) {
-        for(c=0; c < MAX_NEIGHBOURS; c++) {
-          if(syn_nbs[a]->dist[b] > bestd[c]) {
-            for(d=MAX_NEIGHBOURS-1; d>c; d--) {
-              bestd[d] = bestd[d-1];
-              besti[d] = besti[d-1];
-              bestn[d] = bestn[d-1];
-              bestp[d] = bestp[d-1];
+    for(b=0; b < syn_nbs[0]->length; b++) {
+      besti[b] = syn_nbs[0]->index[b];
+      bestd[b] = syn_nbs[0]->dist[b];
+      bestn[b] = syn_nbs[0]->norm[b];
+      bestp[b] = syn_nbs[0]->pos[b];
+      bests[b] = target_sums[syn_nbs[0]->index[b]];
+    }
+    
+    if(sort_by != 1) { // sort by responsiveness
+      for(a=1; a < syn_threads; a++) {
+        for(b=0; b < syn_nbs[a]->length; b++) {
+          for(c=0; c < MAX_NEIGHBOURS; c++) {
+            if(syn_nbs[a]->dist[b] > bestd[c]) {
+              for(d=MAX_NEIGHBOURS-1; d>c; d--) {
+                bestd[d] = bestd[d-1];
+                besti[d] = besti[d-1];
+                bestn[d] = bestn[d-1];
+                bestp[d] = bestp[d-1];
+              }
+              besti[c] = syn_nbs[a]->index[b];
+              bestd[c] = syn_nbs[a]->dist[b];
+              bestn[c] = syn_nbs[a]->norm[b];
+              bestp[c] = syn_nbs[a]->pos[b];
+              break;
             }
-            besti[c] = syn_nbs[a]->index[b];
-            bestd[c] = syn_nbs[a]->dist[b];
-            bestn[c] = syn_nbs[a]->norm[b];
-            bestp[c] = syn_nbs[a]->pos[b];
-            break;
+          }
+        }
+      }
+    } else { // sort by mean p
+      for(a=1; a < syn_threads; a++) {
+        for(b=0; b < syn_nbs[a]->length; b++) {
+          for(c=0; c < MAX_NEIGHBOURS; c++) {
+            if(target_sums[syn_nbs[a]->index[b]] > bests[c]) {
+              for(d=MAX_NEIGHBOURS-1; d>c; d--) {
+                bestd[d] = bestd[d-1];
+                besti[d] = besti[d-1];
+                bestn[d] = bestn[d-1];
+                bestp[d] = bestp[d-1];
+                bests[d] = bests[d-1];
+              }
+              besti[c] = syn_nbs[a]->index[b];
+              bestd[c] = syn_nbs[a]->dist[b];
+              bestn[c] = syn_nbs[a]->norm[b];
+              bestp[c] = syn_nbs[a]->pos[b];
+              bests[c] = target_sums[syn_nbs[a]->index[b]];
+              break;
+            }
           }
         }
       }
     }
-  } else { // sort by mean p
-    for(a=1; a < syn_threads; a++) {
-      for(b=0; b < syn_nbs[a]->length; b++) {
-        for(c=0; c < MAX_NEIGHBOURS; c++) {
-          if(target_sums[syn_nbs[a]->index[b]] > bests[c]) {
-            for(d=MAX_NEIGHBOURS-1; d>c; d--) {
-              bestd[d] = bestd[d-1];
-              besti[d] = besti[d-1];
-              bestn[d] = bestn[d-1];
-              bestp[d] = bestp[d-1];
-              bests[d] = bests[d-1];
-            }
-            besti[c] = syn_nbs[a]->index[b];
-            bestd[c] = syn_nbs[a]->dist[b];
-            bestn[c] = syn_nbs[a]->norm[b];
-            bestp[c] = syn_nbs[a]->pos[b];
-            bests[c] = target_sums[syn_nbs[a]->index[b]];
-            break;
-          }
-        }
-      }
+    array = newAV();
+    for (a = 0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
+      HV* hash = newHV();
+      SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
+      if(latin_enc == 0) SvUTF8_on(word);
+      hv_store(hash, "word", strlen("word"), word , 0);
+      hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
+      hv_store(hash, "norm", strlen("norm"), newSVnv(bestn[a]), 0);
+      hv_store(hash, "sum", strlen("sum"), newSVnv(target_sums[besti[a]]), 0);
+      hv_store(hash, "pos", strlen("pos"), newSVnv(bestp[a]), 0);
+      av_push(array, newRV_noinc((SV*)hash));
     }
+    hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
   }
-  array = newAV();
-  for (a = 0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
-    HV* hash = newHV();
-    SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
-    if(latin_enc == 0) SvUTF8_on(word);
-    hv_store(hash, "word", strlen("word"), word , 0);
-    hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
-    hv_store(hash, "norm", strlen("norm"), newSVnv(bestn[a]), 0);
-    hv_store(hash, "sum", strlen("sum"), newSVnv(target_sums[besti[a]]), 0);
-    hv_store(hash, "pos", strlen("pos"), newSVnv(bestp[a]), 0);
-    av_push(array, newRV_noinc((SV*)hash));
-  }
-  hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
 end:
 	return newRV_noinc((SV*)result);
 }
 
+int dump_vecs(char *fname) { /* Write the loaded vocabulary and vectors to the text file `fname`; returns 0 on success, -1 if the file cannot be opened. */
+	long i, j; /* i: word (row) index, j: vector component (column) index */
+	FILE *f;
+	/* if(words>200000) */
+	/* 	words=200000; */

+	if((f=fopen(fname, "w")) == NULL) { /* "w" mode: an existing file of that name is truncated */
+			fprintf(stderr, "cannot open %s for writing\n", fname);
+			return(-1);
+	}
+	fprintf(f, "%lld %lld\n", words, size); /* header line: the global word count and vector size */
+	for (i=0; i < words; i++) {
+		fprintf(f, "%s ", &vocab[i * max_w]); /* word strings sit in `vocab` at a fixed stride of max_w chars */
+		for(j=0; j < size - 1; j++) /* all components but the last, each followed by a space */
+			fprintf(f, "%f ", M[i*size + j]);
+		fprintf(f, "%f\n", M[i*size + j]); /* j == size - 1 here: the last component ends the line */
+	}
+	fclose(f); /* NOTE(review): neither the fprintf results nor the fclose result is checked, so a failed write still returns 0 */
+	return(0);
+}
 __DATA__
 
 @@ index.html.ep
@@ -867,11 +918,13 @@
 		max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
 		max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
     SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
+		% if($collocators) {
     <span>  </span>sort collocators by
     <select name="sort">
       <option value="0"  <%= ($sort!=1? "selected":"") %>>responsiveness</option>
       <option value="1" <%= ($sort==1? "selected":"") %>>mean p</option>
     </select>
+		% }
 		<span>  </span><input type="submit" value="Show">
 	</form>
 	<br>
@@ -879,12 +932,15 @@
 	<div id="wrapper">
 		<table id="first">
 			<tr>
-				<th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th><th title="Position in winodw around target word. Absolute value can be too low because of sub-sampling frequent words.">@</th><th align="right" title="&#34;Responsivenes&#34; of the collocator at the relative position @. Approximation of the probability that the combination of the target word and the collocator at the relative position @ come from the corpus.">resp.</th><th title="Probability of the collocator at window location @."align="right">p(c<sub><small>@</small></sub>)</th><th align="right">Σp(c<sub><small>@</small></sub>)/|w|</th><th align="left">syntagmatic</th>
+				<th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th>
+        % if($collocators) {
+				<th title="Position in winodw around target word. Absolute value can be too low because of sub-sampling frequent words.">@</th><th align="right" title="&#34;Responsivenes&#34; of the collocator at the relative position @. Approximation of the probability that the combination of the target word and the collocator at the relative position @ come from the corpus.">resp.</th><th title="Probability of the collocator at window location @."align="right">p(c<sub><small>@</small></sub>)</th><th align="right">Σp(c<sub><small>@</small></sub>)/|w|</th><th align="left">syntagmatic</th>
+				% }
 			</tr>
 			% my $j=0; my @words; my @vecs; my @ranks; for my $list (@$lists) {
 			% my $i=0; while($list) {
       % my $item = (@$list)[$i];
-      % my $c = (@$collocators)[$i];
+      % my $c = ($collocators? (@$collocators)[$i] : 0);
       % last if(!$c && !$item);
 			<tr>
 				<td align="right">
commit	43ee87e6c0f0f4b8d7a45e22ecd7966e3102e299	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Apr 25 10:50:08 2016 +0200
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Mon Apr 25 10:50:08 2016 +0200
tree	f212316476d43a36a2fea7fcd28616616383cb06
parent	c47b390822625a93893cf01f94715503ee2fbce7 [diff]