w2v-server: add option -m to merge secondary models
diff --git a/w2v-server.pl b/w2v-server.pl
index df47adc..a255b2f 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -1,7 +1,7 @@
 #!/usr/local/bin/perl
-#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -g";
 use Inline C;
-use Inline C => Config => CLEAN_AFTER_BUILD => 0; #, ccflags => $Config{ccflags}." -g";
+#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
+use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -O4";
 use Mojolicious::Lite;
 use Mojo::JSON qw(decode_json encode_json to_json);
 use Encode qw(decode encode);
@@ -12,12 +12,14 @@
 our $opt_i = 0; # latin1-input?
 our $opt_l = undef;
 our $opt_p = 5676;
+our $opt_m;
 our $opt_n = '';
 our $opt_d;
 
 my $training_args="";
+my $mergedEnd=0;
 
-getopt('d:il:p:n:'); 
+getopt('d:il:p:m:n:'); # -m is new; -n is kept because it was accepted before and $opt_n is still declared
 
 # -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
 if(!$ARGV[0]) {
@@ -30,6 +32,10 @@
   close(FILE);
 }
 
+if($opt_m) {
+  $mergedEnd = mergeVectors($opt_m);
+}
+
 if($opt_d) { # -d: dump  vecs and exit
 	dump_vecs($opt_d);
 	exit;
@@ -48,6 +54,7 @@
   my $perplexity=$c->param('perplexity') || 20;
   my $epsilon=$c->param('epsilon') || 5;
   my $som=$c->param('som') || 0;
+	my $searchBaseVocabFirst=$c->param('sbf') || 0;
   my $sort=$c->param('sort') || 0;
   my $res;
 	my @lists;
@@ -58,15 +65,15 @@
     for my $w (split(' *\| *', $word)) {
 			$c->app->log->debug('Looking for neighbours of '.$w);
       if($opt_i) {
-        $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort);
+        $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst);
       } else {
-        $res = get_neighbours($w, $no_nbs, $sort);
+        $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst);
       }
       push(@lists, $res->{paradigmatic});
 		}
 	}
 	$word =~ s/ *\| */ | /g;
-	$c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, sort=>$sort, training_args=>$training_args, lists=> \@lists, collocators=> $res->{syntagmatic});
+	$c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, lists=> \@lists, collocators=> $res->{syntagmatic});
 };
 
 $daemon->run; # app->start;
@@ -123,7 +130,7 @@
 float *M, *M2=0L, *syn1neg_window, *expTable;
 char *vocab;
 
-long long words, size;
+long long words, size, merged_end;
 int num_threads=20;
 int latin_enc=0;
 int window;
@@ -241,6 +248,62 @@
 	return 0;
 }
 
+/* Loads the secondary model file_name (.vecs/.words), stores it in front of the
+ * base model in M/vocab and returns the number of merged words; errors are fatal. */
+long mergeVectors(char *file_name){
+  FILE *f;
+  int fd, k;
+  char fname[2][256], *buf[2], *merge_vocab;
+  float *merge_vecs;
+  long long merge_words, merge_size, want[2], got, r;
+
+  snprintf(fname[0], sizeof(fname[0]), "%s.vecs", file_name); /* bounded, unlike strcpy/strcat */
+  snprintf(fname[1], sizeof(fname[1]), "%s.words", file_name);
+  f = fopen(file_name, "rb");
+  if (f == NULL) {
+    printf("Input file %s not found\n", file_name);
+    exit(-1); /* was "exit -1;", an expression that never calls exit() */
+  }
+  if (fscanf(f, "%lld %lld", &merge_words, &merge_size) != 2 || merge_words < 1 || merge_size != size) {
+    fprintf(stderr, "Bad header in %s or vectors must have the same length\n", file_name);
+    exit(-1);
+  }
+  fclose(f);
+  merge_vecs = malloc(sizeof(float) * (words + merge_words) * size);
+  merge_vocab = malloc(sizeof(char) * (words + merge_words) * max_w);
+  if (merge_vecs == NULL || merge_vocab == NULL) {
+    fprintf(stderr, "Cannot reserve memory for %s or %s\n", fname[1], fname[0]);
+    exit(-1);
+  }
+  buf[0] = (char *) merge_vecs; want[0] = merge_words * size * sizeof(float);
+  buf[1] = merge_vocab;         want[1] = merge_words * max_w;
+  for (k = 0; k < 2; k++) {
+    if ((fd = open(fname[k], O_RDONLY)) < 0) {
+      fprintf(stderr, "Cannot open %s or %s\n", fname[1], fname[0]);
+      exit(-1);
+    }
+    for (got = 0; got < want[k]; got += r) /* read() may deliver less than requested */
+      if ((r = read(fd, buf[k] + got, want[k] - got)) <= 0) {
+        fprintf(stderr, "Cannot read %lld bytes from %s\n", want[k], fname[k]);
+        exit(-1);
+      }
+    close(fd);
+  }
+  printf("Successfully reallocated memory\nMerging...\n");
+  fflush(stdout);
+  memcpy(merge_vecs + merge_words * size, M, words * size * sizeof(float));
+  memcpy(merge_vocab + merge_words * max_w, vocab, words * max_w);
+  /* NOTE(review): assumes M and vocab were obtained via mmap() - confirm in the loader */
+  munmap(M, words * size * sizeof(float));
+  munmap(vocab, words * max_w);
+  M = merge_vecs;
+  vocab = merge_vocab;
+  merged_end = merge_words;
+  words += merge_words;
+  printf("merged_end: %lld, words: %lld\n", merged_end, words);
+  return((long) merged_end);
+}
+
 void *getCollocators(knnpars *pars) {
 	int N = pars->N;
   int cc = pars->wl->wordi[0];
@@ -344,11 +407,12 @@
   pthread_exit(nbs);
 }
 
-wordlist *getTargetWords(char *st1) {
+wordlist *getTargetWords(char *st1, int search_backw) {
   wordlist *wl = malloc(sizeof(wordlist));
   char st[100][max_size], sep[100];
 	long a, b=0, c=0, cn=0;
-  
+  int unmerged;
+
 	while (1) {
 		st[cn][b] = st1[c];
 		b++;
@@ -363,7 +427,11 @@
 	}
 	cn++;
 	for (a = 0; a < cn; a++) {
-		for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
+		if(search_backw) {
+			for (b = words - 1; b >= 0; b--) if (!strcmp(&vocab[b * max_w], st[a])) break;
+		}  else {
+			for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
+		}
 		if (b == words) b = -1;
 		wl->wordi[a] = b;
 		fprintf(stderr, "Word: \"%s\"  Position in vocabulary: %lld\n", st[a], wl->wordi[a]);
@@ -448,7 +516,7 @@
 }
 
 
-SV *get_neighbours(char *st1, int N, int sort_by) {
+SV *get_neighbours(char *st1, int N, int sort_by, int search_backw) {
   HV *result = newHV();
 	float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
 	long besti[MAX_NEIGHBOURS], bestp[MAX_NEIGHBOURS], a, b, c, d, slice;
@@ -464,7 +532,7 @@
 	
 	slice = words / para_threads;
   
-  wl = getTargetWords(st1);
+  wl = getTargetWords(st1, search_backw);
   if(wl->length < 1)
     goto end;
 
@@ -674,12 +742,27 @@
 
 .ui-tooltip-content {
 	font-size: 9pt;
-	colour: #222222;
+	color: #222222;
 }
 
 svg > .ui-tooltip-content {
 	font-size: 8pt;
-	colour: #222222;
+	color: #222222;
+}
+
+a.merged {
+  color: green;
+  fill: green;
+}
+
+a.marked {
+  color: orange;
+  fill: orange;
+}
+
+a.target {
+  color: red;
+  fill: red;
 }
 
 #collocators {
@@ -790,19 +873,29 @@
 	
   g.append("a")
 	 .attr("xlink:href", function(word) {return "/?word="+word;})
+   .attr("class", function(d, i) {
+			if(data.target.indexOf(" "+d+" ") >= 0) {
+				return "target";
+			} else if(data.mergedEnd > 0 && data.ranks[i] < data.mergedEnd) {
+			  return "merged";
+			} else {
+				return ""
+			}
+	 })
 	 .attr("title", function(d, i) {
-		 return "rank: "+i +"  "+"freq. rank: "+data.ranks[i].toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
+		 if(data.mergedEnd > 0) {
+       if(data.ranks[i] >= data.mergedEnd) {	
+         	return "rank: "+i +"  "+"freq. rank: "+(data.ranks[i]).toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
+       } else {
+         	return "rank: "+i +"  "+"freq. rank: "+data.ranks[i].toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",") + " (merged vocab)";
+       }  
+	   } else {
+		   return "rank: "+i +"  "+"freq. rank: "+data.ranks[i].toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
+		 }
 	 })
 		.append("text")
    .attr("text-anchor", "top")
    .attr("font-size", 12)
-   .attr("fill", function(d) {
-			if(data.target.indexOf(" "+d+" ") >= 0) {
-				return "red";
-			} else {
-				return "#333"
-			}
-	 })
    .text(function(d) { return d; });
 	
   var zoomListener = d3.behavior.zoom()
@@ -915,6 +1008,9 @@
 	<form action="<%=url_for('/')->to_abs%>" method="GET">
 		word(s): 
     <input type="text" name="word" size="20"  value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word."> 
+		% if($mergedEnd > 0) { # only offered when a secondary model was merged (-m)
+     backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked, the base vocabulary will be searched first. Otherwise the merged vocabulary will be searched first.">
+    % }
 		max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
 		max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
     SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
@@ -956,9 +1052,15 @@
   				<%= sprintf("%.3f", $item->{dist}) %>
 				</td>
 				<td>
-  				<a title="freq. rank: <%= $item->{rank} %>" href="/?word=<%= $item->{word} %>">
-						<%= $item->{word} %>
-					</a>
+          % my $class = ""; # CSS class of the link; "merged" marks words below $mergedEnd
+          % my $r = $item->{rank}; # NOTE(review): assumed to be the 0-based position in the combined vocabulary - confirm
+          % if($r < $mergedEnd) {
+          %   $class="merged";
+          %   $r .= " (merged vocab)";
+          % } elsif($mergedEnd != 0 && $r >= $mergedEnd) { # >= so that the word at position $mergedEnd is rebased too
+          %   $r -= $mergedEnd;
+          % }
+  				<a class="<%= $class %>" title="freq. rank: <%= $r %>" href="/?word=<%= $item->{word} %>"><%= $item->{word} %></a>
 				</td>
         % } else {
           <td colspan="2"/>
@@ -990,7 +1092,7 @@
 		<script>
 		 % use Mojo::ByteStream 'b';
 		 $(window).load(function() {
-			 showMap(<%= b(Mojo::JSON::to_json({target => " $word ", words => \@words, vecs => \@vecs, ranks => \@ranks})); %>);
+			 showMap(<%= b(Mojo::JSON::to_json({target => " $word ", mergedEnd=> $mergedEnd, words => \@words, vecs => \@vecs, ranks => \@ranks})); %>);
 		 });
     </script>
 		% }