w2v-server: use conormalized activation sum for auto-focus
Best results after trying all alternatives:
* sum of globally normalized probabilities
* sum of column-set normalized probabilities (also co-normalized)
"best" meant as a combination of complementarity to max(activation)
and usefulness, particularly for low-frequency words
diff --git a/w2v-server.pl b/w2v-server.pl
index a649820..d7ec900 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -199,9 +199,11 @@
long from;
unsigned long upto;
float *target_sums;
+ float *window_sums;
} knnpars;
float *M, *M2=0L, *syn1neg_window, *expTable;
+float *window_sums;
char *vocab;
char *garbage = NULL;
@@ -321,6 +323,7 @@
expTable[i] = exp((i / (float) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
}
+ window_sums = malloc(sizeof(float) * (window+1) * 2);
return 0;
}
@@ -491,14 +494,14 @@
} else {
printf("\x1b[1m%s\x1b[0m ", &vocab[d*max_w]);
}
-
+ pars->window_sums[a] = wpos_sum;
}
for (b = 0; b < pars->cutoff; b++)
pars->target_sums[b] += (target_sums[b] / wpos_sum ) / (window * 2);
free(target_sums);
- for(b=0; b<N && besti[b] >= 0; b++); // THIS LOOP IS NEEDED (b...)
-// printf("%s %.2f %d * ", &vocab[besti[b]*max_w], bestf[b], bestp[b]);
-// printf("\n");
+ for(b=0; b<N && besti[b] >= 0; b++);; // THIS LOOP IS NEEDED (b...)
+// printf("%d: best syn: %s %.2f %.5f\n", b, &vocab[besti[b]*max_w], bestf[b], bestn[b]);
+// printf("\n");
nbs = malloc(sizeof(knn));
nbs->index = besti;
nbs->dist = bestf;
@@ -666,6 +669,7 @@
for(a=0; a < syn_threads; a++) {
pars[a + para_threads].cutoff = cutoff;
pars[a + para_threads].target_sums = target_sums;
+ pars[a + para_threads].window_sums = window_sums;
pars[a + para_threads].wl = wl;
pars[a + para_threads].N = N;
pars[a + para_threads].from = a;
@@ -766,6 +770,7 @@
printf("Waiting for syn threads to join\n");
fflush(stdout);
for (a = 0; a < syn_threads; a++) pthread_join(pt[a+para_threads], &syn_nbs[a]);
+ for (a = 0; a <= syn_threads; a++) printf("window pos: %d, sum: %f\n", a, window_sums[a]);
printf("syn threads joint\n");
fflush(stdout);
@@ -780,7 +785,7 @@
float best_window_sum[MAX_NEIGHBOURS];
int found_index=0, i=0, j, w;
if(sort_by != 1 && sort_by != 2) { // sort by auto focus mean
- for(a=1; a < syn_threads; a++) {
+ for(a=0; a < syn_threads; a++) {
for(b=0; b < syn_nbs[a]->length; b++) {
for(i=0; i < found_index; i++)
if(besti[i] == syn_nbs[a]->index[b])
@@ -792,30 +797,54 @@
}
}
printf("window: %d - syn_threads: %d, %d\n", window, syn_threads, (1 << syn_threads) -1);
+ int wpos;
for(i=0; i < found_index; i++) {
+ bestd[i] = 0; bestn[i] = 0;
for(w=1; w < (1 << syn_threads); w++) { // loop through all possible windows
- float word_window_sum = 0;
+ float word_window_sum = 0, word_activation_sum = 0, total_window_sum = 0;
int bits_set = 0;
- for(a=1; a < syn_threads; a++) {
+ for(a=0; a < syn_threads; a++) {
if((1 << a) & w) {
- bits_set++;
- for(b=0; b < syn_nbs[a]->length; b++)
- if(besti[i] == syn_nbs[a]->index[b])
- word_window_sum += syn_nbs[a]->dist[b];
+ wpos = (a >= window? a+1 : a);
+ total_window_sum += window_sums[wpos];
}
}
- if(bits_set)
- word_window_sum /= bits_set;
- if(word_window_sum > bestd[i]) {
- bestd[i] = word_window_sum;
+// printf("%d window-sum %f\n", w, total_window_sum);
+ for(a=0; a < syn_threads; a++) {
+ if((1 << a) & w) {
+ wpos = (a >= window? a+1 : a);
+ bits_set++;
+ for(b=0; b < syn_nbs[a]->length; b++)
+ if(besti[i] == syn_nbs[a]->index[b]) {
+// word_window_sum += syn_nbs[a]->dist[b] * syn_nbs[a]->norm[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
+// word_window_sum += syn_nbs[a]->dist[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
+// word_window_sum += syn_nbs[a]->norm[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
+// word_window_sum = (word_window_sum + syn_nbs[a]->norm[b]) - (word_window_sum * syn_nbs[a]->norm[b]); // syn_nbs[a]->norm[b];
+ word_window_sum += syn_nbs[a]->dist[b] - word_window_sum * syn_nbs[a]->dist[b]; // conormalied activation sum
+ word_activation_sum += syn_nbs[a]->dist[b];
+ }
+ }
+ }
+// if(bits_set) {
+// word_activation_sum /= bits_set;
+// word_window_sum /= bits_set;
+// }
+// word_window_sum /= total_window_sum;
+
+ if(word_window_sum > bestn[i]) {
+ bestn[i] = word_window_sum;
+ bestd[i] = word_activation_sum;
bestp[i] = w;
}
}
}
for(i=0; i<found_index;i++) {
for(j=0;j<found_index-1;j++) {
- if(bestd[j]<bestd[j+1]) {
- float tempd=bestd[j];
+ if(bestn[j]<bestn[j+1]) {
+ float tempd=bestn[j];
+ bestn[j]=bestn[j+1];
+ bestn[j+1]=tempd;
+ tempd=bestd[j];
bestd[j]=bestd[j+1];
bestd[j+1]=tempd;
int tempi=besti[j];