w2v-server.pl: allow for average vectors and iterations parameter
diff --git a/w2v-server.pl b/w2v-server.pl
index f69b667..e62c63c 100644
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -6,21 +6,23 @@
use Mojo::Server::Daemon;
# -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
-init_net("vectors14.bin");
+init_net("vectors15.bin");
get '/' => sub {
my $c = shift;
my $word=$c->param('word');
- my $no_nbs=$c->param('n') || 100;
+ my $no_nbs=$c->param('n') || 100;
+ my $no_iterations=(($c->param('N') // '') =~ /\A\s*(\d+)\s*\z/ && $1 > 0) ? $1 + 0 : 2000; # positive integer only: the value is emitted into inline JavaScript by the template
my @lists;
if(defined($word) && $word !~ /^\s*$/) {
$c->inactivity_timeout(300);
- for my $w (split('\s+', $word)) {
+ $word =~ s/\s+/ /g;
+ for my $w (split(' *\| *', $word)) {
$c->app->log->debug('Looking for neighbours of '.$w);
push(@lists, get_neighbours(encode("iso-8859-1", $w), $no_nbs));
}
}
- $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, lists=> \@lists);
+ $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, lists=> \@lists);
};
app->start;
@@ -39,6 +41,7 @@
#define max_size 2000
#define max_w 50
#define MAX_NEIGHBOURS 1000
+#define MAX_WORDS -1
//the thread function
void *connection_handler(void *);
@@ -62,6 +65,7 @@
return -1;
}
fscanf(f, "%lld", &words);
+ if(MAX_WORDS > 0 && words > MAX_WORDS) words = MAX_WORDS;
fscanf(f, "%lld", &size);
vocab = (char *)malloc((long long)words * max_w * sizeof(char));
for (a = 0; a < MAX_NEIGHBOURS; a++) bestw[a] = (char *)malloc(max_size * sizeof(char));
@@ -120,10 +124,14 @@
fprintf(stderr, "Word: \"%s\" Position in vocabulary: %lld\n", st[a], bi[a]);
if (b == -1) {
fprintf(stderr, "Out of dictionary word!\n");
+ cn--;
break;
}
}
- if (b == -1) goto end;
+ if (b == -1 && cn <= 0) {
+ N = 0;
+ goto end;
+ }
for (a = 0; a < size; a++) vec[a] = 0;
for (b = 0; b < cn; b++) {
if (bi[b] == -1) continue;
@@ -155,6 +163,8 @@
}
}
+end:
+ a=0;
AV* array = newAV();
for (a = 0; a < N; a++) {
strcpy(bestw[a], &vocab[besti[a] * max_w]);
@@ -168,7 +178,6 @@
hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV*)vector), 0);
av_push(array, newRV_noinc((SV*)hash));
}
- end:
return newRV_noinc((SV*)array);
}
@@ -180,12 +189,25 @@
<html>
<head>
<title>DeReKo-Word-Vector-Distances</title>
+ <link rel="stylesheet" href="//code.jquery.com/ui/1.11.4/themes/smoothness/jquery-ui.css">
<script src="http://code.jquery.com/jquery-latest.min.js"></script>
+ <script src="//code.jquery.com/ui/1.11.4/jquery-ui.js"></script>
+ <script>
+ $(function() {
+ $( document ).tooltip();
+ });
+ </script>
<script src="//d3js.org/d3.v3.min.js" charset="utf-8"></script>
<script src="http://klinux10/word2vec/tsne.js"></script>
<style>
-body {
+body, input {
font-family: Arial, sans-serif;
+ font-size: 11pt;
+}
+
+.ui-tooltip-content {
+ font-size: 9pt;
+ color: #222222; /* "colour" is not a CSS property and was ignored */
}
svg {
// border: 1px solid #333;
@@ -198,7 +220,6 @@
overflow: hidden; /* will contain if #first is longer than #second */
}
#first {
- width: 300px;
margin-right: 20px;
float:left; /* add this */
// border: 1px solid green;
@@ -291,11 +312,11 @@
function step() {
var i = T.iter;
- if(i > 2000) {
+ if(i > <%= $no_iterations %>) {
stopStep();
} else {
var cost = Math.round(T.step() *1000) / 1000; // do a few steps
- $("#cost").html("iteration " + i + ", cost: " + cost.toFixed(3));
+ $("#cost").html("tsne iteration " + i + ", cost: " + cost.toFixed(3));
updateEmbedding();
}
}
@@ -318,8 +339,10 @@
</head>
<body>
<form action="<%=url_for('/')->to_abs%>" method="GET">
- word(s) (space-separated): <input type="text" name="word" value="<%= $word %>">
- max. neighbours: <input type="text" name="n" value="<%= $no_nbs %>">
+ word(s):
+ <input type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
+ max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
+ iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
<input type="submit" value="Show">
</form>
<br>