w2v-server: refactor variable names (replace parallel best* arrays with an array of collocator structs)
diff --git a/w2v-server.pl b/w2v-server.pl
index d7ec900..cc17cf3 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -178,10 +178,16 @@
void *connection_handler(void *);
typedef struct {
- long long *index;
- float *dist;
- float *norm;
- long long *pos;
+ long long wordi; // word index (scaled by max_w to address vocab); -1 marks an unused slot
+ long position; // window position (window-a) in worker results; get_neighbours later stores a window bit mask here
+ float activation; // raw score the candidate lists are ordered by; exported as "dist"
+ float activation_sum; // copy of target_sums[wordi], filled in by get_neighbours
+ float probability_sum; // NOTE(review): neither written nor read anywhere in this patch - confirm intended use
+ float probability; // activation / wpos_sum in the syntagmatic worker; exported as "norm"
+} collocator; // one ranked neighbour candidate; replaces the former parallel index/dist/norm/pos arrays
+
+typedef struct { // result object a worker thread hands back through pthread_exit()
+ collocator *best; // malloc'ed candidate array, highest activation first
int length;
} knn;
@@ -414,26 +420,22 @@
long window_layer_size = size * window * 2;
long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
float f, max_f, maxmax_f;
- float *target_sums, *bestf, *bestn, worstbest, wpos_sum;
- long long *besti, *bestp;
+ float *target_sums, worstbest, wpos_sum;
+ collocator *best;
if(M2 == NULL || cc == -1)
return NULL;
a = posix_memalign((void **) &target_sums, 128, pars->cutoff * sizeof(float));
- besti = malloc(N * sizeof(long long));
- bestp = malloc(N * sizeof(long long));
- bestf = malloc(N * sizeof(float));
- bestn = malloc(N * sizeof(float));
-
+ best = malloc(N * sizeof(collocator));
worstbest = MIN_RESP;
for (b = 0; b < pars->cutoff; b++)
target_sums[b]=0;
for (b = 0; b < N; b++) {
- besti[b] = -1;
- bestn[b] = 1;
- bestf[b] = worstbest;
+ best[b].wordi = -1;
+ best[b].probability = 1;
+ best[b].activation = worstbest;
}
d = cc;
@@ -468,18 +470,16 @@
target_sums[target] += f;
if(f > worstbest) {
for (b = 0; b < N; b++) {
- if (f > bestf[b]) {
- memmove(bestf + b + 1, bestf + b, (N - b -1) * sizeof(float));
- memmove(besti + b + 1, besti + b, (N - b -1) * sizeof(long long));
- memmove(bestp + b + 1, bestp + b, (N - b -1) * sizeof(long long));
- bestf[b] = f;
- besti[b] = target;
- bestp[b] = window-a;
+ if (f > best[b].activation) {
+ memmove(best + b + 1, best + b, (N - b -1) * sizeof(collocator));
+ best[b].activation = f;
+ best[b].wordi = target;
+ best[b].position = window-a;
break;
}
}
if(b == N - 1)
- worstbest = bestf[N-1];
+ worstbest = best[N-1].activation;
}
}
printf("%d %.2f\n", max_target, max_f);
@@ -489,8 +489,8 @@
maxmax_target = max_target;
}
for (b = 0; b < N; b++)
- if(bestp[b] == window-a)
- bestn[b] = bestf[b] / wpos_sum;
+ if(best[b].position == window-a)
+ best[b].probability = best[b].activation / wpos_sum;
} else {
printf("\x1b[1m%s\x1b[0m ", &vocab[d*max_w]);
}
@@ -499,14 +499,11 @@
for (b = 0; b < pars->cutoff; b++)
pars->target_sums[b] += (target_sums[b] / wpos_sum ) / (window * 2);
free(target_sums);
- for(b=0; b<N && besti[b] >= 0; b++);; // THIS LOOP IS NEEDED (b...)
-// printf("%d: best syn: %s %.2f %.5f\n", b, &vocab[besti[b]*max_w], bestf[b], bestn[b]);
+ for(b=0; b<N && best[b].wordi >= 0; b++);; // THIS LOOP IS NEEDED (b...)
+// printf("%d: best syn: %s %.2f %.5f\n", b, &vocab[best[b].wordi*max_w], best[b].activation, best[b].probability);
// printf("\n");
nbs = malloc(sizeof(knn));
- nbs->index = besti;
- nbs->dist = bestf;
- nbs->norm = bestn;
- nbs->pos = bestp;
+ nbs->best = best;
nbs->length = b-1;
pthread_exit(nbs);
}
@@ -557,18 +554,17 @@
long from = pars -> from;
unsigned long upto = pars -> upto;
char file_name[max_size], st[100][max_size], *sep;
- float dist, len, *bestd, vec[max_size];
- long long a, b, c, d, cn, *bi, *besti;
+ float dist, len, vec[max_size];
+ long long a, b, c, d, cn, *bi;
char ch;
knn *nbs = NULL;
wordlist *wl = pars->wl;
- besti = malloc(N * sizeof(long long));
- bestd = malloc(N * sizeof(float));
+ collocator *best = malloc(N * sizeof(collocator));
float worstbest=-1;
- for (a = 0; a < N; a++) bestd[a] = 0;
+ for (a = 0; a < N; a++) best[a].activation = 0;
a = 0;
bi = wl->wordi;
cn = wl->length;
@@ -591,7 +587,7 @@
for (a = 0; a < size; a++) len += vec[a] * vec[a];
len = sqrt(len);
for (a = 0; a < size; a++) vec[a] /= len;
- for (a = 0; a < N; a++) bestd[a] = -1;
+ for (a = 0; a < N; a++) best[a].activation = -1;
for (c = from; c < upto; c++) {
if(garbage && garbage[c]) continue;
a = 0;
@@ -602,21 +598,19 @@
for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
if(dist > worstbest) {
for (a = 0; a < N; a++) {
- if (dist > bestd[a]) {
- memmove(bestd + a + 1, bestd + a, (N - a -1) * sizeof(float));
- memmove(besti + a + 1, besti + a, (N - a -1) * sizeof(long long));
- bestd[a] = dist;
- besti[a] = c;
+ if (dist > best[a].activation) {
+ memmove(best + a + 1, best + a, (N - a -1) * sizeof(collocator));
+ best[a].activation = dist;
+ best[a].wordi = c;
break;
}
}
- worstbest = bestd[N-1];
+ worstbest = best[N-1].activation;
}
}
nbs = malloc(sizeof(knn));
- nbs->index = besti;
- nbs->dist = bestd;
+ nbs->best = best;
nbs->length = N;
end:
pthread_exit(nbs);
@@ -625,9 +619,9 @@
SV *get_neighbours(char *st1, int N, int sort_by, int search_backw, long cutoff, int dedupe) {
HV *result = newHV();
- float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
+ float *target_sums, vec[max_size];
long long old_words;
- long besti[MAX_NEIGHBOURS], bestp[MAX_NEIGHBOURS], a, b, c, d, slice;
+ long a, b, c, d, slice;
knn *para_nbs[MAX_THREADS];
knn *syn_nbs[MAX_THREADS];
knnpars pars[MAX_THREADS];
@@ -635,7 +629,9 @@
wordlist *wl;
int syn_threads = (M2? window * 2 : 0);
int para_threads = num_threads - syn_threads;
-
+
+ collocator *best = malloc(MAX_NEIGHBOURS * sizeof(collocator));
+
if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
if(cutoff < 1)
@@ -687,20 +683,18 @@
/* goto end; */
for(b=0; b < N; b++) {
- besti[b] = para_nbs[0]->index[b];
- bestd[b] = para_nbs[0]->dist[b];
+ best[b].wordi = para_nbs[0]->best[b].wordi;
+ best[b].activation = para_nbs[0]->best[b].activation;
}
for(a=1; a < para_threads; a++) {
- for(b=0; b < para_nbs[a]->length && para_nbs[a]->index[b] >= 0; b++) {
+ for(b=0; b < para_nbs[a]->length && para_nbs[a]->best[b].wordi >= 0; b++) {
for(c=0; c < N * para_threads; c++) {
- if(para_nbs[a]->dist[b] > bestd[c]) {
+ if(para_nbs[a]->best[b].activation > best[c].activation) {
for(d=N-1; d>c; d--) {
- bestd[d] = bestd[d-1];
- besti[d] = besti[d-1];
+ memmove(best + d, best + d - 1, sizeof(collocator));
}
- besti[c] = para_nbs[a]->index[b];
- bestd[c] = para_nbs[a]->dist[b];
+ memcpy(best + c, ¶_nbs[a]->best[b], sizeof(collocator));
break;
}
}
@@ -713,7 +707,7 @@
int l1_words=0, l2_words=0;
for (a = 0, i = 0; i < N && a < 600; a++) {
int filtered=0;
- long long c = besti[a];
+ long long c = best[a].wordi;
if (dedupe && i > 0) {
for (j=0; j<i; j++)
if (strcasestr(&vocab[c * max_w], chosen[j]) ||
@@ -746,11 +740,11 @@
if(latin_enc == 0) SvUTF8_on(word);
fflush(stdout);
hv_store(hash, "word", strlen("word"), word , 0);
- hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
- hv_store(hash, "rank", strlen("rank"), newSVuv(besti[a]), 0);
+ hv_store(hash, "dist", strlen("dist"), newSVnv(best[a].activation), 0);
+ hv_store(hash, "rank", strlen("rank"), newSVuv(best[a].wordi), 0);
AV *vector = newAV();
for (b = 0; b < size; b++) {
- av_push(vector, newSVnv(M[b + besti[a] * size]));
+ av_push(vector, newSVnv(M[b + best[a].wordi * size]));
}
hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV*)vector), 0);
av_push(array, newRV_noinc((SV*)hash));
@@ -759,11 +753,11 @@
hv_store(result, "paradigmatic", strlen("paradigmatic"), newRV_noinc((SV*)array), 0);
for(b=0; b < MAX_NEIGHBOURS; b++) {
- besti[b] = -1L;
- bestd[b] = 0;
- bestn[b] = 0;
- bestp[b] = 0;
- bests[b] = 0;
+ best[b].wordi = -1L;
+ best[b].activation = 0;
+ best[b].probability = 0;
+ best[b].position = 0;
+ best[b].activation_sum = 0;
}
if (M2) {
@@ -775,11 +769,9 @@
fflush(stdout);
for(b=0; b < syn_nbs[0]->length; b++) {
- besti[b] = syn_nbs[0]->index[b];
- bestd[b] = syn_nbs[0]->dist[b];
- bestn[b] = syn_nbs[0]->norm[b];
- bestp[b] = -1; // syn_nbs[0]->pos[b];
- bests[b] = target_sums[syn_nbs[0]->index[b]];
+ memcpy(best + b, &syn_nbs[0]->best[b], sizeof(collocator));
+ best[b].position = -1; // syn_nbs[0]->pos[b];
+ best[b].activation_sum = target_sums[syn_nbs[0]->best[b].wordi];
}
float best_window_sum[MAX_NEIGHBOURS];
@@ -788,10 +780,10 @@
for(a=0; a < syn_threads; a++) {
for(b=0; b < syn_nbs[a]->length; b++) {
for(i=0; i < found_index; i++)
- if(besti[i] == syn_nbs[a]->index[b])
+ if(best[i].wordi == syn_nbs[a]->best[b].wordi)
break;
if(i >= found_index) {
- besti[found_index++] = syn_nbs[a]->index[b];
+ best[found_index++].wordi = syn_nbs[a]->best[b].wordi;
// printf("found: %s\n", &vocab[syn_nbs[a]->index[b] * max_w]);
}
}
@@ -799,7 +791,7 @@
printf("window: %d - syn_threads: %d, %d\n", window, syn_threads, (1 << syn_threads) -1);
int wpos;
for(i=0; i < found_index; i++) {
- bestd[i] = 0; bestn[i] = 0;
+ best[i].activation = 0; best[i].probability = 0;
for(w=1; w < (1 << syn_threads); w++) { // loop through all possible windows
float word_window_sum = 0, word_activation_sum = 0, total_window_sum = 0;
int bits_set = 0;
@@ -815,13 +807,13 @@
wpos = (a >= window? a+1 : a);
bits_set++;
for(b=0; b < syn_nbs[a]->length; b++)
- if(besti[i] == syn_nbs[a]->index[b]) {
+ if(best[i].wordi == syn_nbs[a]->best[b].wordi) {
// word_window_sum += syn_nbs[a]->dist[b] * syn_nbs[a]->norm[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
// word_window_sum += syn_nbs[a]->dist[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
// word_window_sum += syn_nbs[a]->norm[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
// word_window_sum = (word_window_sum + syn_nbs[a]->norm[b]) - (word_window_sum * syn_nbs[a]->norm[b]); // syn_nbs[a]->norm[b];
- word_window_sum += syn_nbs[a]->dist[b] - word_window_sum * syn_nbs[a]->dist[b]; // conormalied activation sum
- word_activation_sum += syn_nbs[a]->dist[b];
+ word_window_sum += syn_nbs[a]->best[b].activation - word_window_sum * syn_nbs[a]->best[b].activation; // conormalied activation sum
+ word_activation_sum += syn_nbs[a]->best[b].activation;
}
}
}
@@ -831,50 +823,38 @@
// }
// word_window_sum /= total_window_sum;
- if(word_window_sum > bestn[i]) {
- bestn[i] = word_window_sum;
- bestd[i] = word_activation_sum;
- bestp[i] = w;
+ if(word_window_sum > best[i].probability) {
+ best[i].probability = word_window_sum;
+ best[i].activation = word_activation_sum;
+ best[i].position = w;
}
}
}
+ collocator *tmp = malloc(sizeof(collocator));
for(i=0; i<found_index;i++) {
for(j=0;j<found_index-1;j++) {
- if(bestn[j]<bestn[j+1]) {
- float tempd=bestn[j];
- bestn[j]=bestn[j+1];
- bestn[j+1]=tempd;
- tempd=bestd[j];
- bestd[j]=bestd[j+1];
- bestd[j+1]=tempd;
- int tempi=besti[j];
- besti[j]=besti[j+1];
- besti[j+1]=tempi;
- int tempp=bestp[j];
- bestp[j]=bestp[j+1];
- bestp[j+1]=tempp;
+ if(best[j].probability < best[j+1].probability) {
+ memmove(tmp, best + j, sizeof(collocator));
+ memmove(best + j, best + j + 1, sizeof(collocator));
+ memmove(best + j + 1, tmp, sizeof(collocator));
}
}
}
+ free(tmp);
// for(i=0; i < found_index; i++) {
-// printf("found: %s - sum: %f - window: %d\n", &vocab[besti[i] * max_w], bestd[i], bestp[i]);
+// printf("found: %s - sum: %f - window: %d\n", &vocab[best[i].wordi * max_w], best[i].activation, best[i].position);
// }
} else if(sort_by ==1) { // single window position
for(a=1; a < syn_threads; a++) {
for(b=0; b < syn_nbs[a]->length; b++) {
for(c=0; c < MAX_NEIGHBOURS; c++) {
- if(syn_nbs[a]->dist[b] > bestd[c]) {
+ if(syn_nbs[a]->best[b].activation > best[c].activation) {
for(d=MAX_NEIGHBOURS-1; d>c; d--) {
- bestd[d] = bestd[d-1];
- besti[d] = besti[d-1];
- bestn[d] = bestn[d-1];
- bestp[d] = bestp[d-1];
+ memmove(best + d, best + d - 1, sizeof(collocator));
}
- besti[c] = syn_nbs[a]->index[b];
- bestd[c] = syn_nbs[a]->dist[b];
- bestn[c] = syn_nbs[a]->norm[b];
- bestp[c] = 1 << (-syn_nbs[a]->pos[b]+window - (syn_nbs[a]->pos[b] < 0 ? 1:0));
+ memcpy(best + c, &syn_nbs[a]->best[b], sizeof(collocator));
+ best[c].position = 1 << (-syn_nbs[a]->best[b].position+window - (syn_nbs[a]->best[b].position < 0 ? 1:0));
break;
}
}
@@ -884,19 +864,13 @@
for(a=1; a < syn_threads; a++) {
for(b=0; b < syn_nbs[a]->length; b++) {
for(c=0; c < MAX_NEIGHBOURS; c++) {
- if(target_sums[syn_nbs[a]->index[b]] > bests[c]) {
+ if(target_sums[syn_nbs[a]->best[b].wordi] > best[c].activation_sum) {
for(d=MAX_NEIGHBOURS-1; d>c; d--) {
- bestd[d] = bestd[d-1];
- besti[d] = besti[d-1];
- bestn[d] = bestn[d-1];
- bestp[d] = bestp[d-1];
- bests[d] = bests[d-1];
+ memmove(best + d, best + d - 1, sizeof(collocator));
}
- besti[c] = syn_nbs[a]->index[b];
- bestd[c] = syn_nbs[a]->dist[b];
- bestn[c] = syn_nbs[a]->norm[b];
- bestp[c] = (1 << 2*window) - 1; // syn_nbs[a]->pos[b];
- bests[c] = target_sums[syn_nbs[a]->index[b]];
+ memcpy(best + c, &syn_nbs[a]->best[b], sizeof(collocator));
+ best[c].position = (1 << 2*window) - 1; // syn_nbs[a]->pos[b];
+ best[c].activation_sum = target_sums[syn_nbs[a]->best[b].wordi];
break;
}
}
@@ -904,8 +878,8 @@
}
}
array = newAV();
- for (a = 0, i=0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
- long long c = besti[a];
+ for (a = 0, i=0; a < MAX_NEIGHBOURS && best[a].wordi >= 0; a++) {
+ long long c = best[a].wordi;
if (dedupe) {
int filtered=0;
for (j=0; j<i; j++)
@@ -919,20 +893,21 @@
}
chosen[i++]=&vocab[c * max_w];
HV* hash = newHV();
- SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
+ SV* word = newSVpvf(&vocab[best[a].wordi * max_w], 0);
if(latin_enc == 0) SvUTF8_on(word);
hv_store(hash, "word", strlen("word"), word , 0);
- hv_store(hash, "rank", strlen("rank"), newSVuv(besti[a]), 0);
- hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
- hv_store(hash, "norm", strlen("norm"), newSVnv(bestn[a]), 0);
- hv_store(hash, "sum", strlen("sum"), newSVnv(target_sums[besti[a]]), 0);
- hv_store(hash, "pos", strlen("pos"), newSVnv(bestp[a]), 0);
+ hv_store(hash, "rank", strlen("rank"), newSVuv(best[a].wordi), 0);
+ hv_store(hash, "dist", strlen("dist"), newSVnv(best[a].activation), 0);
+ hv_store(hash, "norm", strlen("norm"), newSVnv(best[a].probability), 0);
+ hv_store(hash, "sum", strlen("sum"), newSVnv(target_sums[best[a].wordi]), 0);
+ hv_store(hash, "pos", strlen("pos"), newSVnv(best[a].position), 0);
av_push(array, newRV_noinc((SV*)hash));
}
hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
}
end:
words = old_words;
+ free(best);
return newRV_noinc((SV*)result);
}