w2v-server: bug-fixes and hacks for merged/comparable vecs
diff --git a/w2v-server.pl b/w2v-server.pl
index df93317..6ed83a8 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -1,7 +1,8 @@
#!/usr/local/bin/perl
use Inline C;
-#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -O4";
+#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
+#use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -Ofast -march k8 -mtune k8 ";
use Mojolicious::Lite;
use Mojo::JSON qw(decode_json encode_json to_json);
use Encode qw(decode encode);
@@ -22,7 +23,7 @@
my $training_args="";
my $mergedEnd=0;
-getopts('d:Gil:p:m:M:');
+getopts('d:Gil:p:m:n:M:');
if($opt_M) {
open my $handle, '<:encoding(UTF-8)', $opt_M
@@ -68,6 +69,7 @@
get '/' => sub {
my $c = shift;
+ $c->app->log->info("get: ".$c->req->url->to_abs);
my $word=$c->param('word');
my $no_nbs=$c->param('n') || 100;
my $no_iterations=$c->param('N') || 2000;
@@ -84,7 +86,7 @@
$c->inactivity_timeout(300);
$word =~ s/\s+/ /g;
for my $w (split(' *\| *', $word)) {
- $c->app->log->debug('Looking for neighbours of '.$w);
+ $c->app->log->info('Looking for neighbours of '.$w);
if($opt_i) {
$res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst);
} else {
@@ -157,6 +159,7 @@
char *garbage = NULL;
long long words, size, merged_end;
+long long merge_words = 0;
int num_threads=20;
int latin_enc=0;
int window;
@@ -281,7 +284,8 @@
float len;
float *merge_vecs;
char *merge_vocab;
- long long merge_words, merge_size;
+ /* long long merge_words, merge_size; */
+ long long merge_size;
char binvecs_fname[256], binwords_fname[256];
strcpy(binwords_fname, file_name);
@@ -568,6 +572,7 @@
SV *get_neighbours(char *st1, int N, int sort_by, int search_backw) {
HV *result = newHV();
float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
+ long long old_words = words; /* saved up front: "goto end" can fire before the later save, and end: copies this back into words */
long besti[MAX_NEIGHBOURS], bestp[MAX_NEIGHBOURS], a, b, c, d, slice;
knn *para_nbs[MAX_THREADS];
knn *syn_nbs[MAX_THREADS];
@@ -579,12 +584,16 @@
if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
- slice = words / para_threads;
wl = getTargetWords(st1, search_backw);
if(wl->length < 1)
goto end;
+ old_words = words;
+ if(merge_words > 0)
+ words = merge_words * 1.25; /* HACK */
+ slice = words / para_threads;
+
a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
for(a = 0; a < words; a++)
target_sums[a] = 0;
@@ -625,7 +634,7 @@
for(a=1; a < para_threads; a++) {
for(b=0; b < para_nbs[a]->length && para_nbs[a]->index[b] >= 0; b++) {
- for(c=0; c < N; c++) {
+ for(c=0; c < N * para_threads; c++) {
if(para_nbs[a]->dist[b] > bestd[c]) {
for(d=N-1; d>c; d--) {
bestd[d] = bestd[d-1];
@@ -640,10 +649,30 @@
}
AV* array = newAV();
- for (a = 0; a < N; a++) {
+ int i;
+ int l1_words=0, l2_words=0;
+ for (a = 0, i = 0; i < N && a < 600; a++) {
+ long long c = besti[a];
+ if(merge_words > 0) {
+ if(c >= merge_words) {
+ if(l1_words > N / 2)
+ continue;
+ else
+ l1_words++;
+ } else {
+ if(l2_words > N / 2)
+ continue;
+ else
+ l2_words++;
+ }
+ }
+ fflush(stdout);
+ printf("%s l1:%d l2:%d i:%d a:%ld\n", &vocab[c * max_w], l1_words, l2_words, i, a);
+
HV* hash = newHV();
- SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
+ SV* word = newSVpvf(&vocab[c * max_w], 0);
if(latin_enc == 0) SvUTF8_on(word);
+ fflush(stdout);
hv_store(hash, "word", strlen("word"), word , 0);
hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
hv_store(hash, "rank", strlen("rank"), newSVuv(besti[a]), 0);
@@ -653,6 +682,7 @@
}
hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV*)vector), 0);
av_push(array, newRV_noinc((SV*)hash));
+ i++;
}
hv_store(result, "paradigmatic", strlen("paradigmatic"), newRV_noinc((SV*)array), 0);
@@ -737,6 +767,7 @@
hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
}
end:
+ words = old_words;
return newRV_noinc((SV*)result);
}