w2v-server.pl: show collocators (still hacky)
diff --git a/w2v-server.pl b/w2v-server.pl
index b515c6b..4b43c49 100644
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -10,15 +10,15 @@
 our $opt_i = 0; # latin1-input?
 our $opt_l = undef;
 our $opt_p = 5676;
+our $opt_n = undef;
 
-getopt('il:p:'); 
+getopt('il:p:n:'); 
 
-print STDERR $ARGV[1];
 # -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
 if(!$ARGV[0]) {
-  init_net("vectors15.bin", ($opt_i? 1 : 0));
+  init_net("vectors15.bin", $opt_n, ($opt_i? 1 : 0));
 } else {
-  init_net($ARGV[0], ($opt_i? 1 : 0));
+  init_net($ARGV[0], $opt_n, ($opt_i? 1 : 0));
 }
 
 my $daemon = Mojo::Server::Daemon->new(
@@ -34,22 +34,24 @@
   my $perplexity=$c->param('perplexity') || 20;
   my $epsilon=$c->param('epsilon') || 5;
   my $som=$c->param('som') || 0;
-
+  my $res;
 	my @lists;
+	my @collocations;
 	if(defined($word) && $word !~ /^\s*$/) {
 		$c->inactivity_timeout(300);
 		$word =~ s/\s+/ /g;
     for my $w (split(' *\| *', $word)) {
 			$c->app->log->debug('Looking for neighbours of '.$w);
       if($opt_i) {
-        push(@lists, get_neighbours(encode("iso-8859-1", $w), $no_nbs));
+        $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs);
       } else {
-        push(@lists, get_neighbours($w, $no_nbs));
+        $res = get_neighbours($w, $no_nbs);
       }
+      push(@lists, $res->{paradigmatic});
 		}
 	}
 	$word =~ s/ *\| */ | /g;
-  $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, lists=> \@lists);
+  $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, lists=> \@lists, collocators=> $res->{syntagmatic});
 };
 
 $daemon->run; # app->start;
@@ -72,6 +74,9 @@
 #define MAX_NEIGHBOURS 1000
 #define MAX_WORDS -1
 #define MAX_THREADS 100
+#define MAX_CC 50
+#define EXP_TABLE_SIZE 1000
+#define MAX_EXP 6
 
 //the thread function
 void *connection_handler(void *);
@@ -79,6 +84,7 @@
 typedef struct {
 	long long *index;
 	float *dist;
+  long long *pos;
 	unsigned int length;
 } knn;
 
@@ -86,19 +92,21 @@
 typedef struct {
 	char *token;
 	int N;
-	unsigned long from;
+	long from;
 	unsigned long upto;
 } knnpars;
 
-float *M;
+float *M, *syn1neg_window, *expTable;
 char *vocab;
+
 long long words, size;
 int num_threads=20;
 int latin_enc=0;
+int window;
 
-int init_net(char *file_name, int latin) {
+int init_net(char *file_name, char *net_name, int latin) {
   FILE *f, *binvecs, *binwords;
-	int binwords_fd, binvecs_fd;
+	int binwords_fd, binvecs_fd, net_fd, i;
 	long long a, b, c, d, cn;
 	float len;
 
@@ -160,14 +168,134 @@
     exit(-1);
 	}
   fclose(f);
+
+  if(net_name) {
+    if( (net_fd = open(net_name, O_RDONLY)) >= 0) {
+      window = (lseek(net_fd, 0, SEEK_END) -  sizeof(float) * words * size) / words / size / sizeof(float) / 2;
+      //      lseek(net_fd, sizeof(float) * words * size, SEEK_SET);
+      munmap(M,  sizeof(float) * words * size);
+      M = mmap(0, sizeof(float) * words * size + sizeof(float) * 2 * window * size * words, PROT_READ, MAP_SHARED, net_fd, 0);
+      if (M == MAP_FAILED) {
+        close(net_fd);
+        fprintf(stderr, "Cannot mmap %s\n", net_name);
+        exit(-1);
+      }
+      syn1neg_window =  M + words * size;
+    } else {
+      fprintf(stderr, "Cannot open %s\n", net_name);
+      exit(-1);
+    }
+    fprintf(stderr, "Successfully memmaped %s. Determined window size: %d\n", net_name, window);
+  }    
+
+	expTable = (float *) malloc((EXP_TABLE_SIZE + 1) * sizeof(float));
+	for (i = 0; i < EXP_TABLE_SIZE; i++) {
+		expTable[i] = exp((i / (float) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
+		expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
+	}
 	return 0;
 }
 
 
+/*
+ * Collect the N (= 10) strongest collocators of vocabulary word `cc`: for each
+ * context slot of the window, the vector of `cc` in M is multiplied with every
+ * other word's syn1neg_window weights for that slot and mapped through
+ * expTable. Progress and the winners are printed to stdout.
+ * Returns a malloc'ed knn whose N slots are sorted by descending value
+ * (index = vocabulary index, dist = expTable value, pos = window - slot).
+ * Unfilled slots keep dist -1 and index/pos 0; that is the whole result when
+ * `cc` is out of range or no window net is loaded. NULL if allocation fails.
+ */
+knn *getCollocators(int cc) {
+  int N = 10;
+  long window_layer_size = size * window * 2;
+  long a, b, c, d = cc, e, window_offset, target, max_target = 0, maxmax_target = 0;
+  float f, max_f, maxmax_f = -1, worstbest = -1;
+  float *target_sums = NULL, *bestf = malloc(N * sizeof(float));
+  long long *besti = calloc(N, sizeof(long long));  // zeroed: unfilled slots stay valid indices
+  long long *bestp = calloc(N, sizeof(long long));
+  knn *nbs = malloc(sizeof(knn));
+  if (!bestf || !besti || !bestp || !nbs ||
+      posix_memalign((void **) &target_sums, 128, words * sizeof(float)) != 0) {
+    fprintf(stderr, "getCollocators: out of memory\n");
+    free(bestf); free(besti); free(bestp); free(nbs);
+    return NULL;
+  }
+  for (b = 0; b < words; b++) target_sums[b] = 0;
+  for (b = 0; b < N; b++) bestf[b] = -1;
+  // Slots 0..2*window minus the centre map onto the 2*window weight blocks of a
+  // word, so start at 2*window; skip the scan for an unknown word or missing net.
+  a = (cc < 0 || cc >= words || syn1neg_window == NULL) ? -1 : window * 2;
+  for (; a >= 0; a--) {
+    printf("window pos: %ld\n", a);
+    if (a != window) {
+      max_f = -1;
+      window_offset = (a > window ? a - 1 : a) * size;  // slots right of the centre sit one block earlier
+      for (target = 0; target < words; target++) {
+        if (target == d) continue;
+        f = 0;
+        for (c = 0; c < size; c++)
+          f += M[d * size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
+        if (f < -MAX_EXP || f > MAX_EXP) continue;
+        f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+        if (f > max_f) {
+          max_f = f;
+          max_target = target;
+        }
+        target_sums[target] += (1 - target_sums[target]) * f;
+        if (f > worstbest) {
+          for (b = 0; b < N; b++) {  // keep the top-N list sorted, best first
+            if (f > bestf[b]) {
+              for (e = N - 1; e > b; e--) {
+                bestf[e] = bestf[e - 1];
+                besti[e] = besti[e - 1];
+                bestp[e] = bestp[e - 1];
+              }
+              bestf[b] = f;
+              besti[b] = target;
+              bestp[b] = window - a;
+              break;
+            }
+          }
+          worstbest = bestf[N - 1];
+        }
+      }
+      printf("%ld %.2f\n", max_target, max_f);
+      printf("%s (%.2f) ", &vocab[max_target * max_w], max_f);
+      if (max_f > maxmax_f) {
+        maxmax_f = max_f;
+        maxmax_target = max_target;
+      }
+    } else {
+      printf("\x1b[1m%s\x1b[0m ", &vocab[d * max_w]);
+    }
+  }
+  max_f = -1;
+  for (b = 0; b < words; b++) {
+    if (target_sums[b] > max_f) {
+      max_f = target_sums[b];
+      max_target = b;
+    }
+  }
+  printf(" -- max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
+         &vocab[max_target * max_w], max_f,
+         &vocab[maxmax_target * max_w], maxmax_f);
+  for (b = 0; b < N && bestf[b] > -1; b++)
+    printf("%-32s %.2f %lld\n", &vocab[besti[b] * max_w], bestf[b], bestp[b]);
+  printf("\n");
+  free(target_sums);
+  nbs->index = besti;
+  nbs->dist = bestf;
+  nbs->pos = bestp;
+  nbs->length = N;
+  return nbs;
+}
+
 void *_get_neighbours(knnpars *pars) {
 	char *st1 = pars->token;
 	int N = pars->N;
-	unsigned long from = pars -> from;
+	long from = pars -> from;
 	unsigned long upto = pars -> upto;
 	char file_name[max_size], st[100][max_size], sep[100];
 	float dist, len, *bestd, vec[max_size];
@@ -203,6 +331,10 @@
 		if (b == words) b = -1;
 		bi[a] = b;
 		fprintf(stderr, "Word: \"%s\"  Position in vocabulary: %lld\n", st[a], bi[a]);
+    if(from < 0) {
+      nbs = getCollocators(b);
+      pthread_exit(nbs);
+    }
 		if (b == -1) {
 			fprintf(stderr, "Out of dictionary word!\n");
 			cn--;
@@ -257,19 +389,26 @@
 	pthread_exit(nbs);
 }
 
+
 SV *get_neighbours(char *st1, int N) {
+  HV *result = newHV();
 	float bestd[MAX_NEIGHBOURS], vec[max_size];
-	long long besti[MAX_NEIGHBOURS], a, b, c, d, slice;
+	long long besti[MAX_NEIGHBOURS], bestp[MAX_NEIGHBOURS], a, b, c, d, slice;
 	char *bestw[MAX_NEIGHBOURS];
 	knn *nbs[MAX_THREADS];
 	knnpars pars[MAX_THREADS];
-  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
-	AV* array = newAV();
+  pthread_t *pt = (pthread_t *)malloc((num_threads+1) * sizeof(pthread_t));
 
   if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
 	
 	slice = words / num_threads;
 
+  a = num_threads;
+  pars[a].token = st1;
+  pars[a].N = N;
+  pars[a].from = -1;
+  pthread_create(&pt[a], NULL, _get_neighbours, (void *) &pars[a]);
+
 	for(a=0; a < num_threads; a++) {
 		pars[a].token = st1;
 		pars[a].N = N;
@@ -279,6 +418,8 @@
 	}
   for (a = 0; a < num_threads; a++) pthread_join(pt[a], &nbs[a]);
 
+  pthread_join(pt[a], &nbs[a]);
+
 	if(!nbs[0])
 		goto end;
 
@@ -303,7 +444,9 @@
 		}
 	}
 
+  
 	if(nbs) {
+    AV* array = newAV();
 		for (a = 0; a < N; a++) {
 			bestw[a] = (char *)malloc(max_size * sizeof(char));
 		}
@@ -322,11 +465,31 @@
 			hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV*)vector), 0);
 			av_push(array, newRV_noinc((SV*)hash));
 		}
+    hv_store(result, "paradigmatic", strlen("paradigmatic"), newRV_noinc((SV*)array), 0);
+
+    for(b=0; b < 10; b++) {
+      besti[b] = nbs[num_threads]->index[b];
+      bestd[b] = nbs[num_threads]->dist[b];
+      bestp[b] = nbs[num_threads]->pos[b];
+    }
+    array = newAV();
+		for (a = 0; a < 10; a++) {
+			strcpy(bestw[a], &vocab[besti[a] * max_w]);
+			HV* hash = newHV();
+      SV* word = newSVpvf(bestw[a], 0);
+      if(latin_enc == 0) SvUTF8_on(word);
+			hv_store(hash, "word", strlen("word"), word , 0);
+			hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
+			hv_store(hash, "pos", strlen("pos"), newSVnv(bestp[a]), 0);
+			av_push(array, newRV_noinc((SV*)hash));
+		}
+    hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
 	}
 end:
-	return newRV_noinc((SV*)array);
+	return newRV_noinc((SV*)result);
 }
 
+
 __DATA__
 
 @@ index.html.ep
@@ -366,6 +529,10 @@
 	colour: #222222;
 }
 
+#collocators {
+    margin-bottom: 15px;
+}
+  
 #wrapper {
     width: 100%;
 //   border: 1px solid red; 
@@ -601,6 +768,13 @@
 		<span>  </span><input type="submit" value="Show">
 	</form>
 	<br>
+	% if($collocators) {
+  <div id="collocators">
+  % for my $item (@$collocators) {
+    <i><%= $item->{word} %></i> (<%= $item->{pos} %>: <%= sprintf("%.2f", $item->{dist}) %>) 
+      % }
+  </div>
+    % }
 	% if($lists) {
 	<div id="wrapper">
 		<table id="first">