w2vserver: print words with largest difference between main and merged vectors
diff --git a/w2v-server.pl b/w2v-server.pl
index d0084fb..01b49a7 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -527,6 +527,7 @@
words += merge_words;
fclose(f);
printf("merged_end: %lld, words: %lld\n", merged_end, words);
+ printBiggestMergedDifferences();
return((long) merged_end);
}
@@ -773,6 +774,47 @@
return(wl);
}
+/* Print up to N words whose main vector (row c of M) and merged vector (row
+ * c + merged_end) have the smallest dot product, scanning c = 0..9999 and
+ * skipping flagged garbage entries. Uses file-scope M, size, vocab, max_w. */
+void printBiggestMergedDifferences() {
+  const int N = 100;
+  const float unset = 1000000;  /* sentinel: slot not yet filled */
+  float dist, worstbest = unset;
+  long long a, c;
+
+  printf("Looking for biggest distances between main and merged vectors ...\n");
+  collocator *best = calloc(N, sizeof *best);
+  if (best == NULL) {
+    fprintf(stderr, "Cannot allocate memory for merged-difference ranking\n");
+    return;
+  }
+  for (a = 0; a < N; a++) best[a].activation = unset;
+
+  for (c = 0; c < 10000; c++) {
+    if (garbage && garbage[c]) continue;
+    dist = 0;
+    for (a = 0; a < size; a++) dist += M[a + c * size] * M[a + (c + merged_end) * size];
+    if (dist >= worstbest) continue;
+    /* Insert into the ascending list, shifting the tail and dropping the last entry. */
+    for (a = 0; a < N; a++) {
+      if (dist < best[a].activation) {
+        memmove(best + a + 1, best + a, (N - a - 1) * sizeof(collocator));
+        best[a].activation = dist;
+        best[a].wordi = c;
+        break;
+      }
+    }
+    worstbest = best[N - 1].activation;
+  }
+
+  printf("Most distant vectors for:\n ");
+  /* Stop at the first unfilled slot so placeholder entries are not printed. */
+  for (a = 0; a < N && best[a].activation < unset; a++) printf("%s ", &vocab[best[a].wordi * max_w]);
+  printf("\n");
+  free(best);
+}
+
void *_get_neighbours(void *arg) {
knnpars *pars = arg;
char *st1 = pars->token;