w2v-server.pl: show collocators (still hacky)
diff --git a/w2v-server.pl b/w2v-server.pl
index b515c6b..4b43c49 100644
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -10,15 +10,15 @@
our $opt_i = 0; # latin1-input?
our $opt_l = undef;
our $opt_p = 5676;
+our $opt_n = undef;
-getopt('il:p:');
+getopt('il:p:n:');
-print STDERR $ARGV[1];
# -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
if(!$ARGV[0]) {
- init_net("vectors15.bin", ($opt_i? 1 : 0));
+ init_net("vectors15.bin", $opt_n, ($opt_i? 1 : 0));
} else {
- init_net($ARGV[0], ($opt_i? 1 : 0));
+ init_net($ARGV[0], $opt_n, ($opt_i? 1 : 0));
}
my $daemon = Mojo::Server::Daemon->new(
@@ -34,22 +34,24 @@
my $perplexity=$c->param('perplexity') || 20;
my $epsilon=$c->param('epsilon') || 5;
my $som=$c->param('som') || 0;
-
+ my $res;
my @lists;
+ my @collocations;
if(defined($word) && $word !~ /^\s*$/) {
$c->inactivity_timeout(300);
$word =~ s/\s+/ /g;
for my $w (split(' *\| *', $word)) {
$c->app->log->debug('Looking for neighbours of '.$w);
if($opt_i) {
- push(@lists, get_neighbours(encode("iso-8859-1", $w), $no_nbs));
+ $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs);
} else {
- push(@lists, get_neighbours($w, $no_nbs));
+ $res = get_neighbours($w, $no_nbs);
}
+ push(@lists, $res->{paradigmatic});
}
}
$word =~ s/ *\| */ | /g;
- $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, lists=> \@lists);
+ $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, lists=> \@lists, collocators=> $res->{syntagmatic});
};
$daemon->run; # app->start;
@@ -72,6 +74,9 @@
#define MAX_NEIGHBOURS 1000
#define MAX_WORDS -1
#define MAX_THREADS 100
+#define MAX_CC 50
+#define EXP_TABLE_SIZE 1000
+#define MAX_EXP 6
//the thread function
void *connection_handler(void *);
@@ -79,6 +84,7 @@
typedef struct {
long long *index;
float *dist;
+ long long *pos; // per-entry window offset of a collocator; getCollocators stores window - a here
unsigned int length;
} knn;
@@ -86,19 +92,21 @@
typedef struct {
char *token;
int N;
- unsigned long from;
+ long from; // signed: get_neighbours sets -1 to request collocators, which _get_neighbours tests as from < 0
unsigned long upto;
} knnpars;
-float *M;
+float *M, *syn1neg_window, *expTable;
char *vocab;
+
long long words, size;
int num_threads=20;
int latin_enc=0;
+int window;
-int init_net(char *file_name, int latin) {
+int init_net(char *file_name, char *net_name, int latin) {
FILE *f, *binvecs, *binwords;
- int binwords_fd, binvecs_fd;
+ int binwords_fd, binvecs_fd, net_fd, i;
long long a, b, c, d, cn;
float len;
@@ -160,14 +168,134 @@
exit(-1);
}
fclose(f);
+
+ if(net_name) {
+ if( (net_fd = open(net_name, O_RDONLY)) >= 0) {
+ window = (lseek(net_fd, 0, SEEK_END) - sizeof(float) * words * size) / words / size / sizeof(float) / 2;
+ // lseek(net_fd, sizeof(float) * words * size, SEEK_SET);
+ munmap(M, sizeof(float) * words * size);
+ M = mmap(0, sizeof(float) * words * size + sizeof(float) * 2 * window * size * words, PROT_READ, MAP_SHARED, net_fd, 0);
+ if (M == MAP_FAILED) {
+ close(net_fd);
+ fprintf(stderr, "Cannot mmap %s\n", net_name);
+ exit(-1);
+ }
+ syn1neg_window = M + words * size;
+ } else {
+ fprintf(stderr, "Cannot open %s\n", net_name);
+ exit(-1);
+ }
+ fprintf(stderr, "Successfully memmaped %s. Determined window size: %d\n", net_name, window);
+ }
+
+ expTable = (float *) malloc((EXP_TABLE_SIZE + 1) * sizeof(float));
+ for (i = 0; i < EXP_TABLE_SIZE; i++) {
+ expTable[i] = exp((i / (float) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
+ expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
+ }
return 0;
}
+/*
+ * Collect the N = 10 strongest collocates of vocabulary entry cc.
+ * For each window slot a (the centre slot is skipped) and every other word,
+ * the dot product of cc's row in M with that word's slot weights in
+ * syn1neg_window is mapped through expTable; the best (word, slot) hits are
+ * kept sorted by descending activation. Diagnostics are printed to stdout.
+ * Returns a malloc'ed knn owned by the caller: index/dist/pos each hold N
+ * entries (unused ones are index 0, dist -1, pos 0) and pos is window - a.
+ * Returns NULL only if the result arrays cannot be allocated.
+ */
+knn *getCollocators(int cc) {
+  const int N = 10;
+  long window_layer_size = size * window * 2;
+  long a, b, c, d = cc, e, window_offset, target, max_target = 0, maxmax_target = 0;
+  float f, max_f, maxmax_f = -1, worstbest = -1, *target_sums = NULL;
+  knn *nbs = malloc(sizeof(knn));
+  long long *besti = malloc(N * sizeof(long long));
+  long long *bestp = malloc(N * sizeof(long long));
+  float *bestf = malloc(N * sizeof(float));
+  if (!nbs || !besti || !bestp || !bestf) {
+    free(nbs); free(besti); free(bestp); free(bestf);
+    return NULL;
+  }
+  /* Initialise every slot: get_neighbours reads 10 entries unconditionally. */
+  for (b = 0; b < N; b++) {
+    besti[b] = bestp[b] = 0;
+    bestf[b] = -1;
+  }
+  nbs->index = besti;
+  nbs->dist = bestf;
+  nbs->pos = bestp;
+  nbs->length = N;
+  /* Unknown word (cc is -1) or no window layer loaded: nothing to score. */
+  if (cc < 0 || cc >= words || !syn1neg_window) return nbs;
+  if (posix_memalign((void **) &target_sums, 128, words * sizeof(float)) != 0) return nbs;
+  for (b = 0; b < words; b++) target_sums[b] = 0;
+  /* Slots run 0 .. 2*window; starting at 2*window + 1 would read past the layer. */
+  for (a = window * 2; a >= 0; a--) {
+    printf("window pos: %ld\n", a);
+    if (a == window) {
+      printf("\x1b[1m%s\x1b[0m ", &vocab[d * max_w]);
+      continue;
+    }
+    max_f = -1;
+    /* Only 2*window slots are stored, so slots after the centre shift down by one. */
+    window_offset = (a > window ? a - 1 : a) * size;
+    for (target = 0; target < words; target++) {
+      if (target == d) continue;
+      f = 0;
+      for (c = 0; c < size; c++)
+        f += M[d * size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
+      if (f < -MAX_EXP || f > MAX_EXP) continue;   /* outside the expTable range */
+      f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
+      if (f > max_f) {
+        max_f = f;
+        max_target = target;
+      }
+      target_sums[target] += (1 - target_sums[target]) * f;
+      if (f <= worstbest) continue;
+      /* Insert into the descending top-N list; the last entry drops out. */
+      for (b = 0; b < N - 1 && f <= bestf[b]; b++) ;
+      for (e = N - 1; e > b; e--) {
+        bestf[e] = bestf[e - 1];
+        besti[e] = besti[e - 1];
+        bestp[e] = bestp[e - 1];
+      }
+      bestf[b] = f;
+      besti[b] = target;
+      bestp[b] = window - a;
+      worstbest = bestf[N - 1];
+    }
+    printf("%ld %.2f\n", max_target, max_f);
+    printf("%s (%.2f) ", &vocab[max_target * max_w], max_f);
+    if (max_f > maxmax_f) {
+      maxmax_f = max_f;
+      maxmax_target = max_target;
+    }
+  }
+  max_f = -1;
+  for (b = 0; b < words; b++) {
+    if (target_sums[b] > max_f) {
+      max_f = target_sums[b];
+      max_target = b;
+    }
+  }
+  printf(" -- max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
+         &vocab[max_target * max_w], max_f,
+         &vocab[maxmax_target * max_w], maxmax_f);
+  for (b = 0; b < N && bestf[b] > -1; b++)
+    printf("%-32s %.2f %lld\n", &vocab[besti[b] * max_w], bestf[b], bestp[b]);
+  printf("\n");
+  free(target_sums);
+  return nbs;
+}
+
void *_get_neighbours(knnpars *pars) {
char *st1 = pars->token;
int N = pars->N;
- unsigned long from = pars -> from;
+ long from = pars -> from;
unsigned long upto = pars -> upto;
char file_name[max_size], st[100][max_size], sep[100];
float dist, len, *bestd, vec[max_size];
@@ -203,6 +331,10 @@
if (b == words) b = -1;
bi[a] = b;
fprintf(stderr, "Word: \"%s\" Position in vocabulary: %lld\n", st[a], bi[a]);
+ if(from < 0) {
+ nbs = getCollocators(b);
+ pthread_exit(nbs);
+ }
if (b == -1) {
fprintf(stderr, "Out of dictionary word!\n");
cn--;
@@ -257,19 +389,26 @@
pthread_exit(nbs);
}
+
SV *get_neighbours(char *st1, int N) {
+ HV *result = newHV();
float bestd[MAX_NEIGHBOURS], vec[max_size];
- long long besti[MAX_NEIGHBOURS], a, b, c, d, slice;
+ long long besti[MAX_NEIGHBOURS], bestp[MAX_NEIGHBOURS], a, b, c, d, slice;
char *bestw[MAX_NEIGHBOURS];
knn *nbs[MAX_THREADS];
knnpars pars[MAX_THREADS];
- pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
- AV* array = newAV();
+ pthread_t *pt = (pthread_t *)malloc((num_threads+1) * sizeof(pthread_t));
if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
slice = words / num_threads;
+ a = num_threads;
+ pars[a].token = st1;
+ pars[a].N = N;
+ pars[a].from = -1;
+ pthread_create(&pt[a], NULL, _get_neighbours, (void *) &pars[a]);
+
for(a=0; a < num_threads; a++) {
pars[a].token = st1;
pars[a].N = N;
@@ -279,6 +418,8 @@
}
for (a = 0; a < num_threads; a++) pthread_join(pt[a], &nbs[a]);
+ pthread_join(pt[a], &nbs[a]);
+
if(!nbs[0])
goto end;
@@ -303,7 +444,9 @@
}
}
+
if(nbs) {
+ AV* array = newAV();
for (a = 0; a < N; a++) {
bestw[a] = (char *)malloc(max_size * sizeof(char));
}
@@ -322,11 +465,31 @@
hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV*)vector), 0);
av_push(array, newRV_noinc((SV*)hash));
}
+ hv_store(result, "paradigmatic", strlen("paradigmatic"), newRV_noinc((SV*)array), 0);
+
+ for(b=0; b < 10; b++) {
+ besti[b] = nbs[num_threads]->index[b];
+ bestd[b] = nbs[num_threads]->dist[b];
+ bestp[b] = nbs[num_threads]->pos[b];
+ }
+ array = newAV();
+ for (a = 0; a < 10; a++) {
+ strcpy(bestw[a], &vocab[besti[a] * max_w]);
+ HV* hash = newHV();
+ SV* word = newSVpvf(bestw[a], 0);
+ if(latin_enc == 0) SvUTF8_on(word);
+ hv_store(hash, "word", strlen("word"), word , 0);
+ hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
+ hv_store(hash, "pos", strlen("pos"), newSVnv(bestp[a]), 0);
+ av_push(array, newRV_noinc((SV*)hash));
+ }
+ hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
}
end:
- return newRV_noinc((SV*)array);
+ return newRV_noinc((SV*)result);
}
+
__DATA__
@@ index.html.ep
@@ -366,6 +529,10 @@
colour: #222222;
}
+#collocators {
+ margin-bottom: 15px;
+}
+
#wrapper {
width: 100%;
// border: 1px solid red;
@@ -601,6 +768,13 @@
<span> </span><input type="submit" value="Show">
</form>
<br>
+ % if($collocators) {
+ <div id="collocators">
+ % for my $item (@$collocators) {
+ <i><%= $item->{word} %></i> (<%= $item->{pos} %>: <%= sprintf("%.2f", $item->{dist}) %>)
+ % }
+ </div>
+ % }
% if($lists) {
<div id="wrapper">
<table id="first">