w2v-server: load and serve similar profiles if present
diff --git a/w2v-server.pl b/w2v-server.pl
index a8ef559..b1e542f 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -31,6 +31,7 @@
 my $mergedEnd=0;
 my %cache;
 my %cccache; # classic collocator cache
+my %spcache; # similar profile cache
 
 getopts('d:Gil:p:m:n:M:');
 
@@ -56,6 +57,8 @@
   close(FILE);
 }
 
+my $have_sprofiles = load_sprofiles($ARGV[0]);
+
 if($opt_m) {
   $mergedEnd = mergeVectors($opt_m);
 }
@@ -102,6 +105,17 @@
   return $cccache{$word};
 }
 
+sub getSimilarProfilesCached {
+  my ($c, $word) = @_;
+  # Cache hit: log it, echo the stored JSON to STDOUT and hand it back.
+  if($spcache{$word}) {
+    $c->app->log->info("Getting similar profiles for $word from cache:");
+    print $spcache{$word};
+    return $spcache{$word};
+  }
+  return $spcache{$word} = getSimilarProfiles($word); # miss: compute once, remember
+}
+
 post '/derekovecs/getVecsByRanks' => sub {
   my $self = shift;
   my $vec = getVecs($self->req->json);
@@ -118,6 +132,11 @@
   $self->render(data => getClassicCollocatorsCached($self, $self->param("w") ? $self->param("w") : $self->req->json), format=>'json');
 };
 
+any '*/getSimilarProfiles' => sub { # JSON similar profiles for param "w", else for the JSON request body
+  my ($c) = @_;
+  $c->render(format => 'json', data => getSimilarProfilesCached($c, $c->param("w") ? $c->param("w") : $c->req->json));
+};
+
 get '*/img/*' => sub {
 	my $c = shift;
 	my $url = $c->req->url;
@@ -179,7 +198,7 @@
     $csv_data .= "\n";
     return $c->render(text=>$csv_data);
   } else {
-    $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
+    $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, haveSProfiles=> $have_sprofiles, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
   }
 };
 
@@ -258,11 +277,23 @@
   float *window_sums;
 } knnpars;
 
+typedef struct {       /* one neighbour entry inside a profile_t */
+  uint32_t index;      /* row of the neighbour in vocab: read as vocab[max_w * index] */
+  float value;         /* emitted as the "v" field by getSimilarProfiles */
+} sparse_t;
+
+typedef struct {       /* one record of the mmapped .sprofiles.bin file (count = file size / sizeof) */
+  uint32_t len;        /* how many nbr[] slots the reader walks (it also stops at 100) */
+  sparse_t nbr[100];   /* neighbour slots; layout must match the file, so do not resize */
+} profile_t;
+
 float *M, *M2=0L, *syn1neg_window, *expTable;
 float *window_sums;
 char *vocab;
 char *garbage = NULL;
 COLLOCATORDB *cdb = NULL;
+profile_t *sprofiles = NULL;  /* read-only mapping set by load_sprofiles; indexed by node in getSimilarProfiles */
+size_t sprofiles_qty = 0;     /* number of whole profile_t records in the mapping */
 
 long long words, size, merged_end;
 long long merge_words = 0;
@@ -270,6 +301,39 @@
 int latin_enc=0;
 int window;
 
+/* Map "<vecsname without .vecs>.sprofiles.bin", an array of profile_t, into
+ * sprofiles. Returns 1 on success, 0 if the file is missing or unusable. */
+int load_sprofiles(char *vecsname) {
+  char binsprofiles_fname[4096];
+  /* Build the name in one bounded step: no strdup to leak, no overflow. */
+  char *pos = strstr(vecsname, ".vecs");
+  int stem_len = (int) (pos ? (size_t) (pos - vecsname) : strlen(vecsname));
+  int n = snprintf(binsprofiles_fname, sizeof(binsprofiles_fname), "%.*s.sprofiles.bin", stem_len, vecsname);
+  if (n < 0 || (size_t) n >= sizeof(binsprofiles_fname)) {
+    fprintf(stderr, "Name of similar profiles file for %s is too long\n", vecsname);
+    return 0;
+  }
+  /* A single descriptor is used for sizing and mapping, then released. */
+  int fd = open(binsprofiles_fname, O_RDONLY);
+  if (fd < 0) {
+    printf("Collocation profiles %s not found. No problem.\n", binsprofiles_fname);
+    return 0;
+  }
+  off_t sz = lseek(fd, 0, SEEK_END);
+  void *map = sz > 0 ? mmap(0, (size_t) sz, PROT_READ, MAP_SHARED, fd, 0) : MAP_FAILED;
+  close(fd); /* the mapping, if any, outlives the descriptor */
+  if (map == MAP_FAILED) {
+    fprintf(stderr, "Cannot mmap %s\n", binsprofiles_fname);
+    sprofiles = NULL;
+    sprofiles_qty = 0; /* keep the count consistent with the NULL table */
+    return 0;
+  }
+  sprofiles = map;
+  sprofiles_qty = (size_t) sz / sizeof(profile_t);
+  fprintf(stderr, "Successfully mmaped %s containing similar profiles for %zu word forms.\n", binsprofiles_fname, sprofiles_qty);
+  return 1;
+}
+
 int init_net(char *file_name, char *net_name, int latin) {
   FILE *f, *binvecs, *binwords;
 	int binwords_fd, binvecs_fd, net_fd, i;
@@ -393,7 +457,8 @@
 		expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
 	}
 	window_sums = malloc(sizeof(float) * (window+1) * 2);
-	return 0;
+
+  return 0;
 }
 
 long mergeVectors(char *file_name){
@@ -463,7 +528,7 @@
     previous = 0;
     if(strncmp("quot", w, 4) == 0) {
       garbage[i]=1;
-      printf("Gargabe: %s\n", vocab + i * max_w);
+//      printf("Gargabe: %s\n", vocab + i * max_w);
     } else {
       while((c = *w++) && !garbage[i]) {
         if( ((c <= 90 && c >= 65) && (previous >= 97 && previous <= 122)) || 
@@ -597,6 +662,29 @@
   return result;
 }
 
+/* Return a malloc'ed JSON array of {"w":word,"v":value} pairs holding the stored
+ * profile neighbours of vocabulary entry node, or a placeholder if out of range. */
+char *getSimilarProfiles(long node) {
+  char buffer[120000];
+  size_t used = 1, room = sizeof(buffer) - 3; /* reserve space for "]\n" + NUL */
+  uint32_t i;
+  buffer[0]='[';
+  if(node < 0 || (size_t) node >= sprofiles_qty) {
+    printf("Not available in precomputed profile\n");
+    return(strdup("[{\"w\":\"not available\", \"v\":0}]\n"));
+  }
+
+  printf("******* %s ******\n", &vocab[max_w * node]);
+  for(i=0; i < 100 && i < sprofiles[node].len; i++) {
+    int n = snprintf(buffer + used, room - used, "%s{\"w\":\"%s\", \"v\":%f}", i ? "," : "", &vocab[max_w * (sprofiles[node].nbr[i].index)], sprofiles[node].nbr[i].value);
+    if(n < 0 || (size_t) n >= room - used) break; /* entry did not fit: drop it */
+    used += n;
+  }
+  strcpy(buffer + used, "]\n"); /* also yields a valid "[]" for an empty profile */
+  fputs(buffer, stdout); /* not printf(buffer): a word may contain '%' */
+  return(strdup(buffer));
+}
+
 char *getClassicCollocators(long node) {
 	char *res = (cdb? strdup(get_collocators_as_json(cdb, node)) : "[]");
 	return res;