w2v-server: add basic functionality for joint vec spaces
* new option -d dumps the loaded vectors to a txt vector file
* .txt vector inputs are automatically parsed and converted to mmappable structs
* slice size bug for paradigmatic neighbours fixed
* fixes for paradigmatic-only output
diff --git a/w2v-server.pl b/w2v-server.pl
index a44df71..df47adc 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -1,5 +1,7 @@
#!/usr/local/bin/perl
+#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -g";
use Inline C;
+use Inline C => Config => CLEAN_AFTER_BUILD => 0; #, ccflags => $Config{ccflags}." -g";
use Mojolicious::Lite;
use Mojo::JSON qw(decode_json encode_json to_json);
use Encode qw(decode encode);
@@ -10,11 +12,12 @@
our $opt_i = 0; # latin1-input?
our $opt_l = undef;
our $opt_p = 5676;
-our $opt_n = undef;
+our $opt_n = '';
+our $opt_d;
my $training_args="";
-getopt('il:p:n:');
+getopt('d:il:p:n:');
# -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
if(!$ARGV[0]) {
@@ -27,6 +30,11 @@
close(FILE);
}
+if($opt_d) { # -d: dump vecs and exit
+ dump_vecs($opt_d);
+ exit;
+}
+
my $daemon = Mojo::Server::Daemon->new(
app => app,
listen => ['http://'.($opt_l ? $opt_l : '*').":$opt_p"]
@@ -112,7 +120,7 @@
float *target_sums;
} knnpars;
-float *M, *M2, *syn1neg_window, *expTable;
+float *M, *M2=0L, *syn1neg_window, *expTable;
char *vocab;
long long words, size;
@@ -125,6 +133,7 @@
int binwords_fd, binvecs_fd, net_fd, i;
long long a, b, c, d, cn;
float len;
+ double val;
char binvecs_fname[256], binwords_fname[256];
strcpy(binwords_fname, file_name);
@@ -149,20 +158,40 @@
printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
return -1;
}
- for (b = 0; b < words; b++) {
- a = 0;
- while (1) {
- vocab[b * max_w + a] = fgetc(f);
- if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
- if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
- }
- vocab[b * max_w + a] = 0;
- fread(&M[b * size], sizeof(float), size, f);
- len = 0;
- for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
- len = sqrt(len);
- for (a = 0; a < size; a++) M[a + b * size] /= len;
- }
+ if(strstr(file_name, ".txt")) {
+ for (b = 0; b < words; b++) {
+ a = 0;
+ while (1) {
+ vocab[b * max_w + a] = fgetc(f);
+ if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
+ if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
+ }
+ vocab[b * max_w + a] = 0;
+ len = 0;
+ for (a = 0; a < size; a++) {
+ fscanf(f, "%lf", &val);
+ M[a + b * size] = val;
+ len += val * val;
+ }
+ len = sqrt(len);
+ for (a = 0; a < size; a++) M[a + b * size] /= len;
+ }
+ } else {
+ for (b = 0; b < words; b++) {
+ a = 0;
+ while (1) {
+ vocab[b * max_w + a] = fgetc(f);
+ if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
+ if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
+ }
+ vocab[b * max_w + a] = 0;
+ fread(&M[b * size], sizeof(float), size, f);
+ len = 0;
+ for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
+ len = sqrt(len);
+ for (a = 0; a < size; a++) M[a + b * size] /= len;
+ }
+ }
if( (binvecs = fopen(binvecs_fname, "wb")) != NULL && (binwords = fopen(binwords_fname, "wb")) != NULL) {
fwrite(M, sizeof(float), (long long)words * (long long)size, binvecs);
fclose(binvecs);
@@ -185,13 +214,13 @@
}
fclose(f);
- if(net_name) {
+ if(net_name && strlen(net_name) > 0) {
if( (net_fd = open(net_name, O_RDONLY)) >= 0) {
window = (lseek(net_fd, 0, SEEK_END) - sizeof(float) * words * size) / words / size / sizeof(float) / 2;
// lseek(net_fd, sizeof(float) * words * size, SEEK_SET);
// munmap(M, sizeof(float) * words * size);
M2 = mmap(0, sizeof(float) * words * size + sizeof(float) * 2 * window * size * words, PROT_READ, MAP_SHARED, net_fd, 0);
- if (M == MAP_FAILED) {
+ if (M2 == MAP_FAILED) {
close(net_fd);
fprintf(stderr, "Cannot mmap %s\n", net_name);
exit(-1);
@@ -222,7 +251,7 @@
float *target_sums, *bestf, *bestn, worstbest, wpos_sum;
long long *besti, *bestp;
- if(cc == -1)
+ if(M2 == NULL || cc == -1)
return NULL;
a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
@@ -372,7 +401,6 @@
sep = wl->sep;
b = bi[0];
c = 0;
-
if (b == -1) {
N = 0;
goto end;
@@ -429,14 +457,13 @@
knnpars pars[MAX_THREADS];
pthread_t *pt = (pthread_t *)malloc((num_threads+1) * sizeof(pthread_t));
wordlist *wl;
- int para_threads = num_threads - window * 2;
- int syn_threads = window * 2;
- num_threads = para_threads+syn_threads;
+ int syn_threads = (M2? window * 2 : 0);
+ int para_threads = num_threads - syn_threads;
if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
- slice = words / syn_threads;
-
+ slice = words / para_threads;
+
wl = getTargetWords(st1);
if(wl->length < 1)
goto end;
@@ -445,6 +472,8 @@
for(a = 0; a < words; a++)
target_sums[a] = 0;
+ printf("Starting %d threads\n", para_threads);
+ fflush(stdout);
for(a=0; a < para_threads; a++) {
pars[a].token = st1;
pars[a].wl = wl;
@@ -453,13 +482,15 @@
pars[a].upto = ((a+1)*slice > words? words:(a+1)*slice);
pthread_create(&pt[a], NULL, _get_neighbours, (void *) &pars[a]);
}
- for(a=0; a < syn_threads; a++) {
- pars[a + para_threads].target_sums = target_sums;
- pars[a + para_threads].wl = wl;
- pars[a + para_threads].N = N;
- pars[a + para_threads].from = a;
- pars[a + para_threads].upto = a+1;
- pthread_create(&pt[a + para_threads], NULL, getCollocators, (void *) &pars[a + para_threads]);
+ if(M2) {
+ for(a=0; a < syn_threads; a++) {
+ pars[a + para_threads].target_sums = target_sums;
+ pars[a + para_threads].wl = wl;
+ pars[a + para_threads].N = N;
+ pars[a + para_threads].from = a;
+ pars[a + para_threads].upto = a+1;
+ pthread_create(&pt[a + para_threads], NULL, getCollocators, (void *) &pars[a + para_threads]);
+ }
}
printf("Waiting for para threads to join\n");
fflush(stdout);
@@ -467,8 +498,8 @@
printf("Para threads joint\n");
fflush(stdout);
- if(!syn_nbs[0])
- goto end;
+ /* if(!syn_nbs[0]) */
+ /* goto end; */
for(b=0; b < N; b++) {
besti[b] = para_nbs[0]->index[b];
@@ -516,82 +547,102 @@
bests[b] = 0;
}
- printf("Waiting for syn threads to join\n");
- fflush(stdout);
- for (a = 0; a < syn_threads; a++) pthread_join(pt[a+para_threads], &syn_nbs[a]);
- printf("syn threads joint\n");
- fflush(stdout);
-
+ if (M2) {
+ printf("Waiting for syn threads to join\n");
+ fflush(stdout);
+ for (a = 0; a < syn_threads; a++) pthread_join(pt[a+para_threads], &syn_nbs[a]);
+ printf("syn threads joint\n");
+ fflush(stdout);
- for(b=0; b < syn_nbs[0]->length; b++) {
- besti[b] = syn_nbs[0]->index[b];
- bestd[b] = syn_nbs[0]->dist[b];
- bestn[b] = syn_nbs[0]->norm[b];
- bestp[b] = syn_nbs[0]->pos[b];
- bests[b] = target_sums[syn_nbs[0]->index[b]];
- }
-
- if(sort_by != 1) { // sort by responsiveness
- for(a=1; a < syn_threads; a++) {
- for(b=0; b < syn_nbs[a]->length; b++) {
- for(c=0; c < MAX_NEIGHBOURS; c++) {
- if(syn_nbs[a]->dist[b] > bestd[c]) {
- for(d=MAX_NEIGHBOURS-1; d>c; d--) {
- bestd[d] = bestd[d-1];
- besti[d] = besti[d-1];
- bestn[d] = bestn[d-1];
- bestp[d] = bestp[d-1];
+ for(b=0; b < syn_nbs[0]->length; b++) {
+ besti[b] = syn_nbs[0]->index[b];
+ bestd[b] = syn_nbs[0]->dist[b];
+ bestn[b] = syn_nbs[0]->norm[b];
+ bestp[b] = syn_nbs[0]->pos[b];
+ bests[b] = target_sums[syn_nbs[0]->index[b]];
+ }
+
+ if(sort_by != 1) { // sort by responsiveness
+ for(a=1; a < syn_threads; a++) {
+ for(b=0; b < syn_nbs[a]->length; b++) {
+ for(c=0; c < MAX_NEIGHBOURS; c++) {
+ if(syn_nbs[a]->dist[b] > bestd[c]) {
+ for(d=MAX_NEIGHBOURS-1; d>c; d--) {
+ bestd[d] = bestd[d-1];
+ besti[d] = besti[d-1];
+ bestn[d] = bestn[d-1];
+ bestp[d] = bestp[d-1];
+ }
+ besti[c] = syn_nbs[a]->index[b];
+ bestd[c] = syn_nbs[a]->dist[b];
+ bestn[c] = syn_nbs[a]->norm[b];
+ bestp[c] = syn_nbs[a]->pos[b];
+ break;
}
- besti[c] = syn_nbs[a]->index[b];
- bestd[c] = syn_nbs[a]->dist[b];
- bestn[c] = syn_nbs[a]->norm[b];
- bestp[c] = syn_nbs[a]->pos[b];
- break;
+ }
+ }
+ }
+ } else { // sort by mean p
+ for(a=1; a < syn_threads; a++) {
+ for(b=0; b < syn_nbs[a]->length; b++) {
+ for(c=0; c < MAX_NEIGHBOURS; c++) {
+ if(target_sums[syn_nbs[a]->index[b]] > bests[c]) {
+ for(d=MAX_NEIGHBOURS-1; d>c; d--) {
+ bestd[d] = bestd[d-1];
+ besti[d] = besti[d-1];
+ bestn[d] = bestn[d-1];
+ bestp[d] = bestp[d-1];
+ bests[d] = bests[d-1];
+ }
+ besti[c] = syn_nbs[a]->index[b];
+ bestd[c] = syn_nbs[a]->dist[b];
+ bestn[c] = syn_nbs[a]->norm[b];
+ bestp[c] = syn_nbs[a]->pos[b];
+ bests[c] = target_sums[syn_nbs[a]->index[b]];
+ break;
+ }
}
}
}
}
- } else { // sort by mean p
- for(a=1; a < syn_threads; a++) {
- for(b=0; b < syn_nbs[a]->length; b++) {
- for(c=0; c < MAX_NEIGHBOURS; c++) {
- if(target_sums[syn_nbs[a]->index[b]] > bests[c]) {
- for(d=MAX_NEIGHBOURS-1; d>c; d--) {
- bestd[d] = bestd[d-1];
- besti[d] = besti[d-1];
- bestn[d] = bestn[d-1];
- bestp[d] = bestp[d-1];
- bests[d] = bests[d-1];
- }
- besti[c] = syn_nbs[a]->index[b];
- bestd[c] = syn_nbs[a]->dist[b];
- bestn[c] = syn_nbs[a]->norm[b];
- bestp[c] = syn_nbs[a]->pos[b];
- bests[c] = target_sums[syn_nbs[a]->index[b]];
- break;
- }
- }
- }
+ array = newAV();
+ for (a = 0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
+ HV* hash = newHV();
+ SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
+ if(latin_enc == 0) SvUTF8_on(word);
+ hv_store(hash, "word", strlen("word"), word , 0);
+ hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
+ hv_store(hash, "norm", strlen("norm"), newSVnv(bestn[a]), 0);
+ hv_store(hash, "sum", strlen("sum"), newSVnv(target_sums[besti[a]]), 0);
+ hv_store(hash, "pos", strlen("pos"), newSVnv(bestp[a]), 0);
+ av_push(array, newRV_noinc((SV*)hash));
}
+ hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
}
- array = newAV();
- for (a = 0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
- HV* hash = newHV();
- SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
- if(latin_enc == 0) SvUTF8_on(word);
- hv_store(hash, "word", strlen("word"), word , 0);
- hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
- hv_store(hash, "norm", strlen("norm"), newSVnv(bestn[a]), 0);
- hv_store(hash, "sum", strlen("sum"), newSVnv(target_sums[besti[a]]), 0);
- hv_store(hash, "pos", strlen("pos"), newSVnv(bestp[a]), 0);
- av_push(array, newRV_noinc((SV*)hash));
- }
- hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
end:
return newRV_noinc((SV*)result);
}
+int dump_vecs(char *fname) {
+ long i, j;
+ FILE *f;
+ /* if(words>200000) */
+ /* words=200000; */
+ if((f=fopen(fname, "w")) == NULL) {
+ fprintf(stderr, "cannot open %s for writing\n", fname);
+ return(-1);
+ }
+ fprintf(f, "%lld %lld\n", words, size);
+ for (i=0; i < words; i++) {
+ fprintf(f, "%s ", &vocab[i * max_w]);
+ for(j=0; j < size - 1; j++)
+ fprintf(f, "%f ", M[i*size + j]);
+ fprintf(f, "%f\n", M[i*size + j]);
+ }
+ fclose(f);
+ return(0);
+}
__DATA__
@@ index.html.ep
@@ -867,11 +918,13 @@
max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
+ % if($collocators) {
<span> </span>sort collocators by
<select name="sort">
<option value="0" <%= ($sort!=1? "selected":"") %>>responsiveness</option>
<option value="1" <%= ($sort==1? "selected":"") %>>mean p</option>
</select>
+ % }
<span> </span><input type="submit" value="Show">
</form>
<br>
@@ -879,12 +932,15 @@
<div id="wrapper">
<table id="first">
<tr>
- <th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th><th title="Position in winodw around target word. Absolute value can be too low because of sub-sampling frequent words.">@</th><th align="right" title=""Responsivenes" of the collocator at the relative position @. Approximation of the probability that the combination of the target word and the collocator at the relative position @ come from the corpus.">resp.</th><th title="Probability of the collocator at window location @."align="right">p(c<sub><small>@</small></sub>)</th><th align="right">Σp(c<sub><small>@</small></sub>)/|w|</th><th align="left">syntagmatic</th>
+ <th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th>
+ % if($collocators) {
+ <th title="Position in winodw around target word. Absolute value can be too low because of sub-sampling frequent words.">@</th><th align="right" title=""Responsivenes" of the collocator at the relative position @. Approximation of the probability that the combination of the target word and the collocator at the relative position @ come from the corpus.">resp.</th><th title="Probability of the collocator at window location @."align="right">p(c<sub><small>@</small></sub>)</th><th align="right">Σp(c<sub><small>@</small></sub>)/|w|</th><th align="left">syntagmatic</th>
+ % }
</tr>
% my $j=0; my @words; my @vecs; my @ranks; for my $list (@$lists) {
% my $i=0; while($list) {
% my $item = (@$list)[$i];
- % my $c = (@$collocators)[$i];
+ % my $c = ($collocators? (@$collocators)[$i] : 0);
% last if(!$c && !$item);
<tr>
<td align="right">