w2v-server: add some parameters and comments
diff --git a/w2v-server.pl b/w2v-server.pl
index 82e7b17..897e7b1 100644
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -2,17 +2,22 @@
use Inline C;
use Mojolicious::Lite;
use Encode qw(decode encode);
+use Mojo::Server::Daemon;
-init_net("vectors14.bin");
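+# word2vec training parameters (TODO confirm these describe vectors15.bin; the index template lists different values): -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15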
+init_net("vectors15.bin");
get '/' => sub {
my $c = shift;
my $word=$c->param('word');
+ my $no_nbs=$c->param('n') || 50; # neighbours to show; 50 when 'n' is missing, empty or 0
my $list;
- if($word !~ /^\s*$/) {
- $list = print_neighbours(encode("iso-8859-1", $word));
+ if(defined($word) && $word !~ /^\s*$/) { # skip the lookup when 'word' is absent or whitespace-only
+ $c->inactivity_timeout(300); # raise this connection's inactivity timeout before the lookup (300 is presumably seconds - confirm)
+ $c->app->log->debug('Looking for neighbours of '.$word);
+ $list = get_neighbours(encode("iso-8859-1", $word), $no_nbs); # word is handed to the C routine as ISO-8859-1 bytes
}
- $c->render(template=>"index", word=>$word, list=> $list);
+ $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, list=> $list); # no_nbs is passed so the template can pre-fill the "n" field
};
app->start;
@@ -30,14 +35,14 @@
#define max_size 2000
#define max_w 50
-#define N 75
+#define MAX_NEIGHBOURS 1000 // hard cap on neighbours per query: sizes bestw[] and bestd[], and get_neighbours() clamps its N argument to it
//the thread function
void *connection_handler(void *);
-char *bestw[N];
+char *bestw[MAX_NEIGHBOURS]; // one string buffer per neighbour slot; each is malloc'ed with max_size chars further down
char file_name[max_size], st[100][max_size];
-float dist, len, bestd[N], vec[max_size];
+float dist, len, bestd[MAX_NEIGHBOURS], vec[max_size]; // bestd[] is sized like bestw[]; presumably the matching distances (confirm)
long long words, size, a, b, c, d, cn, bi[100];
char ch;
float *M;
@@ -56,7 +61,7 @@
fscanf(f, "%lld", &words);
fscanf(f, "%lld", &size);
vocab = (char *)malloc((long long)words * max_w * sizeof(char));
- for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
+ for (a = 0; a < MAX_NEIGHBOURS; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
M = (float *)malloc((long long)words * (long long)size * sizeof(float));
if (M == NULL) {
printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
@@ -80,7 +85,9 @@
return 0;
}
-SV *print_neighbours(char *st1) {
+SV *get_neighbours(char *st1, int N) {
+ if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
+
FILE *out=stdout;
*stringBuffer=0;
@@ -154,6 +161,7 @@
return newRV_noinc((SV*)array);
}
+
__DATA__
@@ index.html.ep
@@ -161,19 +169,19 @@
<html>
<head><title>DeReKo-Word-Vector-Distances</title></head>
<body>
- <p>Word vector model based on a 1.9 billion word sample of DeReKo-2015-II (mainly wikipedia including discussions, current newspapapers and fiction). Trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters:</p>
+ <p>Word vector model based on DeReKo-2015-II. Trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters:</p>
<pre>
--cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
+-cbow 1 -size 300 -window 7 -negative 5 -hs 0 -sample 1e-5 -threads 44 -binary 1 -iter 5
</pre>
</p>
<form action="<%=url_for('/')->to_abs%>" method="GET">
- Word: <input type="text" name="word">
+ Word: <input type="text" name="word" value="<%= $word %>">
+ Neighbours: <input type="text" name="n" value="<%= $no_nbs %>">
<input type="submit" value="Show neighbours">
</form>
<br>
% if($list) {
- <p>Target word: <b><%= $word%></b></p>
- <h3>Nearest neighbours</h3>
+ <h3>Nearest neighbours of "<%= $word %>"</h3>
<table>
<tr>
<th align="right">Pos.</th><th align="left">Word</th><th align="right">Cosine dist.</th>