w2v-server.pl: add t-SNE visualization
diff --git a/w2v-server.pl b/w2v-server.pl
index 897e7b1..f51f9da 100644
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -1,11 +1,12 @@
#!/usr/local/bin/perl
use Inline C;
use Mojolicious::Lite;
+use Mojo::JSON qw(decode_json encode_json to_json);
use Encode qw(decode encode);
use Mojo::Server::Daemon;
# -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
-init_net("vectors15.bin");
+init_net("vectors14.bin");
get '/' => sub {
my $c = shift;
@@ -43,7 +44,7 @@
char *bestw[MAX_NEIGHBOURS];
char file_name[max_size], st[100][max_size];
float dist, len, bestd[MAX_NEIGHBOURS], vec[max_size];
-long long words, size, a, b, c, d, cn, bi[100];
+long long words, size, a, b, c, d, cn, bi[100], besti[MAX_NEIGHBOURS];
char ch;
float *M;
char *vocab;
@@ -145,6 +146,7 @@
strcpy(bestw[d], bestw[d - 1]);
}
bestd[a] = dist;
+ besti[a] = c;
strcpy(bestw[a], &vocab[c * max_w]);
break;
}
@@ -155,6 +157,11 @@
HV* hash = newHV();
hv_store(hash, "word", strlen("word"), newSVpvf(bestw[a], 0), 0);
hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
+ AV *vector = newAV();
+ for (b = 0; b < size; b++) {
+ av_push(vector, newSVnv(M[b + besti[a] * size]));
+ }
+ hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV*)vector), 0);
av_push(array, newRV_noinc((SV*)hash));
}
end:
@@ -167,7 +174,138 @@
@@ index.html.ep
<!DOCTYPE html>
<html>
-<head><title>DeReKo-Word-Vector-Distances</title></head>
+<head>
+ <title>DeReKo-Word-Vector-Distances</title>
+ <script src="http://code.jquery.com/jquery-latest.min.js"></script>
+ <script src="//d3js.org/d3.v3.min.js" charset="utf-8"></script>
+ <script src="http://klinux10/word2vec/tsne.js"></script>
+<style>
+svg { /* CSS has no // comments: switched-off rules use block comments */
+/* border: 1px solid #333; */
+/* margin-right: 5px; */
+/* margin-bottom: 5px; */
+}
+#wrapper {
+  width: 100%;
+/* border: 1px solid red; */
+  overflow: hidden; /* will contain if #first is longer than #second */
+}
+#first {
+  width: 300px;
+  margin-right: 20px;
+  float:left; /* add this */
+/* border: 1px solid green; */
+}
+#second {
+  border: 1px solid #333;
+  overflow: hidden; /* if you don't want #second to wrap below #first */
+}
+</style>
+<script>
+
+var opt = {epsilon: 1, perplexity: 8}; // options passed to the tSNE constructor below
+var T = new tsnejs.tSNE(opt); // create a tSNE instance
+
+var Y; // NOTE(review): never assigned at this level in this script block
+
+var data; // payload being visualised; its .words and .vecs are read by the functions below
+
+function updateEmbedding() {
+ var Y = T.getSolution();
+ svg.selectAll('.u')
+ .data(data.words)
+ .attr("transform", function(d, i) { return "translate(" +
+ ((Y[i][0]*20*ss + tx) + 400) + "," +
+ ((Y[i][1]*20*ss + ty) + 400) + ")"; });
+}
+
+var svg;
+function drawEmbedding() {
+ $("#embed").empty();
+ var div = d3.select("#embed");
+
+ // get min and max in each column of Y
+ var Y = T.Y;
+
+ svg = div.append("svg") // svg is global
+ .attr("width", 800)
+ .attr("height", 800);
+
+ var g = svg.selectAll(".b")
+ .data(data.words)
+ .enter().append("g")
+ .attr("class", "u");
+
+ g.append("text")
+ .attr("text-anchor", "top")
+ .attr("font-size", 12)
+ .attr("fill", "#333")
+ .text(function(d) { return d; });
+
+ var zoomListener = d3.behavior.zoom()
+ .scaleExtent([0.1, 10])
+ .center([0,0])
+ .on("zoom", zoomHandler);
+ zoomListener(svg);
+}
+
+var tx=0, ty=0;
+var ss=1;
+var iter_id=-1;
+
+function zoomHandler() {
+ tx = d3.event.translate[0];
+ ty = d3.event.translate[1];
+ ss = d3.event.scale;
+ updateEmbedding();
+}
+
+var stepnum = 0;
+
+function stopStep() {
+ clearInterval(iter_id);
+}
+
+function step() {
+ var i = T.iter;
+ if(i >= 1000) {
+ stopStep();
+ } else {
+ var cost = T.step(); // do a few steps
+ $("#cost").html("iteration " + i + ", cost: " + cost);
+ updateEmbedding();
+ }
+}
+
+ function showMap(j) {
+ data=j;
+ T.iter=0;
+ T.initDataRaw(data.vecs); // init embedding
+ drawEmbedding(); // draw initial embedding
+
+ if(iter_id >= 0) {
+ clearInterval(iter_id);
+ }
+ //T.debugGrad();
+ iter_id = setInterval(step, 1);
+ //step();
+ }
+
+$(window).xxload(function() {
+ $.getJSON( "http://klinux10/word2vec/dings.json", function( j ) {
+ data = j;
+ T.initDataRaw(data.vecs); // init embedding
+ drawEmbedding(); // draw initial embedding
+
+ // T.debugGrad();
+ iter_id = setInterval(step, 1);
+ // step();
+
+ });
+});
+
+</script>
+</head>
<body>
<p>Word vector model based on DeReKo-2015-II. Trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters:</p>
<pre>
@@ -182,11 +320,14 @@
<br>
% if($list) {
<h3>Nearest neighbours of "<%= $word %>"</h3>
- <table>
+<div id="wrapper">
+ <table id="first">
<tr>
<th align="right">Pos.</th><th align="left">Word</th><th align="right">Cosine dist.</th>
</tr>
- % my $i=1; for my $item (@$list) {
+ % my $i=1; my @words; my @vecs; for my $item (@$list) {
+ % push @vecs, $item->{vector};
+ % push @words, $item->{word};
<tr>
<td align="right">
<%= $i++ %>.
@@ -202,7 +343,18 @@
</tr>
% }
</table>
+ <script>
+ % use Mojo::ByteStream 'b';
+$(window).load(function() {
+ showMap(<%= b(Mojo::JSON::to_json({words => \@words, vecs => \@vecs})); %>);
+});
+ </script>
% }
-</body>
+<div id="second" style="width:800px; height:800px; font-family: arial;">
+<div id="embed"></div>
+<div id="cost" style="text-align:left; font-family: Impact;"></div>
+ </div>
+ </div>
+ </body>
</html>