w2v-server: add column and overall normalized sums
diff --git a/templates/index.html.ep b/templates/index.html.ep
index 78ec921..9e78ea6 100644
--- a/templates/index.html.ep
+++ b/templates/index.html.ep
@@ -59,11 +59,10 @@
"orderable": false,
"targets": 0
},
- { "orderSequence": [ "desc" ], "targets": [ 2, 3, 4 ] },
- { "orderSequence": [ "asc", "desc" ], "targets": [ 1, 5 ] },
- { "orderSequence": [ "desc" ], "targets": [ 3 ] }
+ { "orderSequence": [ "desc" ], "targets": [ 2, 3, 4, 5, 6 ] },
+ { "orderSequence": [ "asc", "desc" ], "targets": [ 1, 7 ] },
],
- "order": [[ 3, 'desc' ]],
+ "order": [[ 4, 'desc' ]],
} );
t.on( 'order.dt search.dt', function () {
t.column(0, {order:'applied'}).nodes().each( function (cell, i) {
@@ -626,8 +625,8 @@
<label for="sortby">window/sort</label>
<select id="sortby" name="sort">
<option value="0" <%= ($sort!=1 && $sort!=2? "selected":"") %>>auto focus</option>
- <option value="1" <%= ($sort==1? "selected":"") %>>any single position</option>
- <option value="2" <%= ($sort==2? "selected":"") %>>whole window</option>
+ <!-- <option value="1" <%= ($sort==1? "selected":"") %>>any single position</option>
+ <option value="2" <%= ($sort==2? "selected":"") %>>whole window</option> -->
</select>
% }
<input type="button" value="→ KorAP" onclick="queryKorAP();" title="query word with KorAP"/>
@@ -734,11 +733,13 @@
<tr>
% if($collocators) {
<th>#</th>
- <th align="right" title="The window around the target word that is considered for summation.">w'</th>
+ <th align="right" title="The columns (c) around the target that are considered for summation are marked with *.">w'</th>
<th align="right" title="Raw (max.) activation of the collocator in the output layers.">max(a)</th>
- <th title="(c<sub><small>@</small></sub>) – Sum of the probability approximations that the combination of the target word and the collocator at the relative position @ come from the training corpus. Single approximations can be distorted because of sub-sampling frequent words and the sum cannot itself be interpreted as probability." align="right">⊥Σa</th>
- <th align="right">Σa/Σw</th>
- <th title="c" align="left">collocator</th>
+ <th title="Co-normalized raw activation sum of the collocator in the selected columns." align="right">⊥Σa</th>
+ <th title="Sum of activations over the selected columns normalized by the total activation sum of the selected columns." align="right">Σa/Σc</th>
+ <th title="Sum of the column-normalized activations over the selected columns." align="right">Σ(a/c)</th>
+ <th title="Sum of the activations over the whole window normalized by the total window sum (no auto-focus)." align="right">Σa/Σw</th>
+ <th align="left">collocator</th>
% }
</tr>
</thead>
@@ -762,6 +763,12 @@
<td align="right">
<%= sprintf("%.3e", $c->{prob}) %>
</td>
+ <td align="right">
+ <%= sprintf("%.3e", $c->{cprob}) %>
+ </td>
+ <td align="right">
+ <%= sprintf("%.3e", $c->{overall}) %>
+ </td>
<td align="left">
<a onclick="<%= sprintf("queryKorAPCII('%s /w5 %s')", $c->{word}, $word) =%>"
title="freq. rank: <%= $c->{rank} =%>">
diff --git a/w2v-server.pl b/w2v-server.pl
index 38b8cb8..9c02dc4 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -189,6 +189,7 @@
long long wordi;
long position;
float activation;
+ float cprobability; // column wise probability
float probability;
float activation_sum;
float conorm;
@@ -500,14 +501,15 @@
}
for (b = 0; b < N; b++)
if(best[b].position == window-a)
- best[b].probability = best[b].activation / wpos_sum;
+ best[b].cprobability = best[b].activation / wpos_sum;
} else {
printf("\x1b[1m%s\x1b[0m ", &vocab[d*max_w]);
}
pars->window_sums[a] = wpos_sum;
}
for (b = 0; b < pars->cutoff; b++)
- pars->target_sums[b] += (target_sums[b] / wpos_sum ) / (window * 2);
+ pars->target_sums[b] += target_sums[b]; //(target_sums[b] / wpos_sum ) / (window * 2);
+ printf("Target-Summe von 0: %f\n", pars->target_sums[150298]);
free(target_sums);
for(b=0; b<N && best[b].wordi >= 0; b++);; // THIS LOOP IS NEEDED (b...)
// printf("%d: best syn: %s %.2f %.5f\n", b, &vocab[best[b].wordi*max_w], best[b].activation, best[b].probability);
@@ -783,11 +785,16 @@
best[b].activation_sum = 0;
}
+ float total_activation = 0;
+
if (M2) {
printf("Waiting for syn threads to join\n");
fflush(stdout);
for (a = 0; a < syn_threads; a++) pthread_join(pt[a+para_threads], (void *) &syn_nbs[a]);
- for (a = 0; a <= syn_threads; a++) printf("window pos: %d, sum: %f\n", a, window_sums[a]);
+ for (a = 0; a <= syn_threads; a++) {
+ total_activation += window_sums[a];
+ printf("window pos: %d, sum: %f\n", a, window_sums[a]);
+ }
printf("syn threads joint\n");
fflush(stdout);
@@ -798,6 +805,7 @@
best[b].max_activation = 0.0;
best[b].conorm = 0.0;
best[b].probability = 0.0;
+ best[b].cprobability = syn_nbs[0]->best[b].cprobability;
}
float best_window_sum[MAX_NEIGHBOURS];
@@ -811,11 +819,14 @@
best[found_index].max_activation = 0.0;
best[found_index].conorm = 0.0;
best[found_index].probability = 0.0;
+ best[found_index].cprobability = syn_nbs[a]->best[b].cprobability;
+ best[found_index].activation_sum = target_sums[syn_nbs[a]->best[b].wordi]; // syn_nbs[a]->best[b].activation_sum;
best[found_index++].wordi = syn_nbs[a]->best[b].wordi;
// printf("found: %s\n", &vocab[syn_nbs[a]->index[b] * max_w]);
}
}
}
+ sort_by =0; // ALWAYS AUTO-FOCUS
if(sort_by != 1 && sort_by != 2) { // sort by auto focus mean
printf("window: %d - syn_threads: %d, %d\n", window, syn_threads, (1 << syn_threads) -1);
int wpos;
@@ -837,15 +848,18 @@
bits_set++;
for(b=0; b < syn_nbs[a]->length; b++)
if(best[i].wordi == syn_nbs[a]->best[b].wordi) {
+// float acti = syn_nbs[a]->best[b].activation / total_window_sum;
// word_window_sum += syn_nbs[a]->dist[b] * syn_nbs[a]->norm[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
// word_window_sum += syn_nbs[a]->norm[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
// word_window_sum = (word_window_sum + syn_nbs[a]->norm[b]) - (word_window_sum * syn_nbs[a]->norm[b]); // syn_nbs[a]->norm[b];
+
word_window_sum += syn_nbs[a]->best[b].activation; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
- word_window_conorm += syn_nbs[a]->best[b].activation - word_window_sum * syn_nbs[a]->best[b].activation; // conormalied activation sum
+// word_window_sum += acti - (word_window_sum * acti); syn_nbs[a]->best[b].activation; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
+
+ word_window_conorm += syn_nbs[a]->best[b].activation - word_window_conorm * syn_nbs[a]->best[b].activation; // conormalied activation sum
word_activation_sum += syn_nbs[a]->best[b].activation;
if(syn_nbs[a]->best[b].activation > best[i].max_activation)
best[i].max_activation = syn_nbs[a]->best[b].activation;
- word_activation_sum += syn_nbs[a]->best[b].activation;
}
}
}
@@ -853,7 +867,8 @@
// word_activation_sum /= bits_set;
// word_window_sum /= bits_set;
// }
- word_window_sum /= total_window_sum;
+
+ word_window_sum /= total_window_sum;
if(word_window_sum > best[i].probability) {
best[i].probability = word_window_sum;
@@ -924,17 +939,19 @@
array = newAV();
for (a = 0, i=0; a < MAX_NEIGHBOURS && best[a].wordi >= 0; a++) {
long long c = best[a].wordi;
+/*
if (dedupe) {
int filtered=0;
for (j=0; j<i; j++)
if (strcasestr(&vocab[c * max_w], chosen[j]) ||
strcasestr(chosen[j], &vocab[c * max_w])) {
- printf("filtering %s %s\n", chosen[j], &vocab[c * max_w]);
- filtered = 1;
- }
+ printf("filtering %s %s\n", chosen[j], &vocab[c * max_w]);
+ filtered = 1;
+ }
if(filtered)
continue;
}
+*/
chosen[i++]=&vocab[c * max_w];
HV* hash = newHV();
SV* word = newSVpvf(&vocab[best[a].wordi * max_w], 0);
@@ -943,7 +960,9 @@
hv_store(hash, "rank", strlen("rank"), newSVuv(best[a].wordi), 0);
hv_store(hash, "conorm", strlen("conorm"), newSVnv(best[a].conorm), 0);
hv_store(hash, "prob", strlen("prob"), newSVnv(best[a].probability), 0);
+ hv_store(hash, "cprob", strlen("cprob"), newSVnv(best[a].cprobability), 0);
hv_store(hash, "max", strlen("max"), newSVnv(best[a].max_activation), 0); // newSVnv(target_sums[best[a].wordi]), 0);
+ hv_store(hash, "overall", strlen("overall"), newSVnv(best[a].activation_sum/total_activation), 0); // newSVnv(target_sums[best[a].wordi]), 0);
hv_store(hash, "pos", strlen("pos"), newSVnv(best[a].position), 0);
av_push(array, newRV_noinc((SV*)hash));
}