w2v-server: change merge semantics to cover mono-lingual corpora only
diff --git a/w2v-server.pl b/w2v-server.pl
index 6f8d7f5..8701f90 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -178,10 +178,20 @@
if(defined($word) && $word !~ /^\s*$/) {
$c->inactivity_timeout(300);
$word =~ s/\s+/ /g;
+ if($opt_m && $word !~ /\|/) {
+ $word .= "|$word";
+ }
for my $w (split(' *\| *', $word)) {
- if ($cache{$w.$cutoff.$no_nbs.$sort.$dedupe}) {
+ if($opt_m) {
+ if($searchBaseVocabFirst) {
+ $searchBaseVocabFirst=0;
+ } else {
+ $searchBaseVocabFirst=1;
+ }
+ }
+ if ($cache{$w.$cutoff.$no_nbs.$sort.$dedupe.$searchBaseVocabFirst}) { # '.' not ',': a comma builds a $;-joined key that differs from the one read on the next statement
$c->app->log->info("Getting $w results from cache");
- $res = $cache{$w.$cutoff.$no_nbs.$sort.$dedupe}
+ $res = $cache{$w.$cutoff.$no_nbs.$sort.$dedupe.$searchBaseVocabFirst}
} else {
$c->app->log->info('Looking for neighbours of '.$w);
if($opt_i) {
@@ -732,38 +742,30 @@
while (1) {
st[cn][b] = st1[c];
- if (merge_words > 0)
- st[cn+1][b] = st1[c];
b++;
c++;
st[cn][b] = 0;
if (st1[c] == 0) break;
if (st1[c] == ' ' || st1[c] == '-') {
sep[cn++] = st1[c];
- if (merge_words > 0)
- sep[cn++] = st1[c];
b = 0;
c++;
}
}
cn++;
- if (merge_words > 0)
- cn++;
for (a = 0; a < cn; a++) {
if (search_backw) {
- for (b = words - 1; b >= 0; b--) if (!strcmp(&vocab[b * max_w], st[a])) break;
+ for (b = words - 1; b >= (merge_words? merge_words : 0) && strcmp(&vocab[b * max_w], st[a]) !=0; b--);
} else {
- if (merge_words > 0 && a % 2 == 1)
- for (b = merge_words; b < words && strcmp(&vocab[b * max_w], st[a]) != 0; b++);
- else
- for (b = 0; b < words && strcmp(&vocab[b * max_w], st[a]) != 0; b++);
+ for (b = 0; b < (merge_words? merge_words : words) && strcmp(&vocab[b * max_w], st[a]) != 0; b++);
}
- if (b == words) b = -1;
+ if (b == words || (merge_words && (search_backw? b < merge_words : b >= merge_words))) b = -1; /* lookup ran off the searched half of the merged vocabulary: treat as not found */
wl->wordi[a] = b;
- fprintf(stderr, "Word: \"%s\" Position in vocabulary: %lld\n", &vocab[wl->wordi[a]*max_w], wl->wordi[a]);
if (b == -1) {
fprintf(stderr, "Out of dictionary word!\n");
cn--;
+ } else {
+ fprintf(stderr, "Word: \"%s\" Position in vocabulary: %lld\n", &vocab[wl->wordi[a]*max_w], wl->wordi[a]);
}
}
wl->length=cn;
@@ -917,7 +919,7 @@
goto end;
old_words = cutoff;
- slice = (para_threads? cutoff / para_threads * (merge_words > 0? 2 : 1) : 0);
+ slice = (para_threads? cutoff / para_threads : 0); /* keep the zero-thread guard of the replaced expression; dividing by 0 is undefined */
a = posix_memalign((void **) &target_sums, 128, cutoff * sizeof(float));
for(a = 0; a < cutoff; a++)
@@ -931,12 +933,12 @@
pars[a].wl = wl;
pars[a].N = N;
pars[a].best = &best[N*a];
- if(merge_words == 0 || a < para_threads / 2) {
+ if(merge_words == 0 || search_backw == 0) {
pars[a].from = a*slice;
pars[a].upto = ((a+1)*slice > cutoff? cutoff : (a+1) * slice);
} else {
- pars[a].from = merge_words + (a - para_threads / 2) * slice;
- pars[a].upto = merge_words + ((a - para_threads / 2 + 1)*slice > cutoff? cutoff : (a - para_threads / 2 + 1) *slice);
+ pars[a].from = merge_words + a * slice;
+ pars[a].upto = merge_words + ((a+1)*slice > cutoff? cutoff : (a+1) * slice);
}
printf("From: %ld, Upto: %ld\n", pars[a].from, pars[a].upto);
pthread_create(&pt[a], NULL, _get_neighbours, (void *) &pars[a]);
@@ -985,8 +987,9 @@
if(filtered)
continue;
}
-/*
- if(merge_words > 0) {
+
+
+ if(0 && merge_words > 0) {
if(c >= merge_words) {
if(l1_words > N / 2)
continue;
@@ -999,9 +1002,9 @@
l2_words++;
}
}
-*/
- printf("%s l1:%d l2:%d i:%d a:%ld\n", &vocab[c * max_w], l1_words, l2_words, i, a);
- fflush(stdout);
+
+// printf("%s l1:%d l2:%d i:%d a:%ld\n", &vocab[c * max_w], l1_words, l2_words, i, a);
+// fflush(stdout);
HV* hash = newHV();
SV* word = newSVpvf(&vocab[c * max_w], 0);
chosen[i] = c;