blob: 9183278f245ae2499cad3694013abbe9c6d11cc2 [file] [log] [blame]
Marc Kupietz83305222016-04-28 09:57:22 +02001<!DOCTYPE html>
2<html>
3 <head>
4 <title>DeReKo-Word-Vector-Distances: <%= $word %></title>
5 <link rel="stylesheet" href="//code.jquery.com/ui/1.11.4/themes/smoothness/jquery-ui.css">
<!-- Pinned to 1.11.1 (the release the frozen "jquery-latest" alias serves) and loaded via https to avoid mixed-content blocking. -->
<script src="https://code.jquery.com/jquery-1.11.1.min.js"></script>
7 <script src="//code.jquery.com/ui/1.11.4/jquery-ui.js"></script>
8 <script>
// Once the DOM is ready, enable jQuery UI tooltips for the whole document.
// The tooltip content is taken from the hovered element's title attribute.
$(document).ready(function () {
  var tooltipOptions = {
    content: function () {
      return $(this).attr('title');
    }
  };
  $(document).tooltip(tooltipOptions);
});
16 </script>
17 <script src="//d3js.org/d3.v3.min.js" charset="utf-8"></script>
18 <script src="http://klinux10/word2vec/tsne.js"></script>
19 <script src="http://klinux10/word2vec/som.js"></script>
20 <script src="http://klinux10/word2vec/labeler.js"></script>
21 <style>
22 body, input {
23 font-family: Arial, sans-serif;
24 font-size: 11pt;
25 }
26
27 .ui-tooltip-content {
28 font-size: 9pt;
29 color: #222222;
30 }
31
32 svg > .ui-tooltip-content {
33 font-size: 8pt;
34 color: #222222;
35 }
36
37 a.merged {
38 color: green;
39 fill: green;
40 }
41
42 #first a {
43 text-decoration: none;
44 }
45
46 a.marked, #first a.marked {
47 text-decoration: underline;
48 }
49
50 a.target {
51 color: red;
52 fill: red;
53 }
54
55 #collocators {
56 margin-bottom: 15px;
57 }
58
#wrapper {
  width: 100%;
  /* debug outline: border: 1px solid red; */
  overflow: hidden; /* will contain if #first is longer than #second */
}
#first {
  margin-right: 20px;
  float: left;
  /* debug outline: border: 1px solid green; */
}
69 #second {
70 border: 1px solid #333;
71 overflow: hidden; /* if you don't want #second to wrap below #first */
72 }
73 #som2 svg {
74 border: 1px solid #333;
75 }
76
77 #cost {
78 font-size: 8pt;
79 color: #222222;
80 margin-top: 4px;
81 margin-bottom: 12px;
82 }
83
84 #sominfo1, #sominfo {
85 font-size: 8pt;
86 color: #222222;
87 margin-top: 0px;
88 }
89
90 #somcolor1, #somcolor2, #somcolor3 {
91 display: inline-block;
92 height: 10px;
93 width: 10px;
94 }
95
96 #third {
97 border: 1px solid #333;
98 }
99
100 </style>
101 <script>
102
103 var opt = {epsilon: <%= $epsilon %>, perplexity: <%= $perplexity %>},
104 mapWidth = 800, // width map
105 mapHeight = 800,
106 jitterRadius = 7;
107
108 var T = new tsnejs.tSNE(opt); // create a tSNE instance
109
110 var Y;
111
112 var data;
113 var labeler;
114
/**
 * Moves every label group (.tsnet) to the position held by the matching
 * entry of the global `labels` array, using a short (50 ms) transition.
 *
 * As a side effect the position is written back into T.Y, using the inverse
 * of the embedding-to-screen mapping applied in updateEmbedding().
 */
function applyJitter() {
  var groups = svg.selectAll('.tsnet').data(labels);

  groups
    .transition()
    .duration(50)
    .attr("transform", function(label, idx) {
      // screen coordinates -> embedding coordinates
      var point = T.Y[idx];
      point[0] = (label.x - mapWidth/2 - tx)/ss/20;
      point[1] = (label.y - mapHeight/2 - ty)/ss/20;
      return "translate(" + label.x + "," + label.y + ")";
    });
}
128
/**
 * Repositions every label group (.tsnet) according to the current t-SNE
 * solution, applying the current zoom factor (ss) and pan offset (tx, ty)
 * and centring the result on the map.
 */
function updateEmbedding() {
  var solution = T.getSolution();
  var groups = svg.selectAll('.tsnet').data(data.words);

  groups.attr("transform", function(word, idx) {
    // embedding coordinates -> screen coordinates
    var px = (solution[idx][0]*20*ss + tx) + mapWidth/2;
    var py = (solution[idx][1]*20*ss + ty) + mapHeight/2;
    return "translate(" + px + "," + py + ")";
  });
}
138
var svg,                // root <svg> selection, created in drawEmbedding()
    labels = [],        // label records read by applyJitter()
    anchor_array = [],  // label anchors, filled in stopStep()
    text;               // selection of <text> nodes, assigned in stopStep()
143
/**
 * Builds the initial SVG for the word map inside #embed.
 *
 * For every entry of data.words one <g class="tsnet"> is created that holds
 * a link to the query page for that word. The link carries CSS classes
 * (marked / target / merged), a tooltip with rank information and the word
 * itself as SVG text. Finally a d3 zoom behaviour is attached to the SVG.
 *
 * Reads the globals data, mapWidth and mapHeight; assigns the global svg.
 */
function drawEmbedding() {
  // Formats a number with thousands separators, e.g. 1234567 -> "1,234,567".
  function formatRank(rank) {
    return rank.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
  }

  $("#embed").empty();
  var div = d3.select("#embed");

  svg = div.append("svg") // svg is global
    .attr("width", mapWidth)
    .attr("height", mapHeight);

  var g = svg.selectAll(".b")
    .data(data.words)
    .enter().append("g")
    .attr("class", "tsnet");

  g.append("a")
    .attr("xlink:href", function(word) {
      // Encode the word so that characters such as &, #, + or % cannot
      // corrupt the query string.
      return "/?word=" + encodeURIComponent(word);
    })
    .attr("class", function(d, i) {
      var res = data.marked[i] ? "marked " : "";
      if (data.target.indexOf(" "+d+" ") >= 0) {
        return res+"target";
      } else if (data.ranks[i] < data.mergedEnd) {
        return res+"merged";
      }
      return res;
    })
    .attr("title", function(d, i) {
      // NOTE(review): the HTML table subtracts mergedEnd from the displayed
      // rank of non-merged words; this tooltip shows the raw rank - confirm
      // which of the two is intended.
      var title = "rank: "+i +" "+"freq. rank: "+formatRank(data.ranks[i]);
      if (data.mergedEnd > 0 && data.ranks[i] < data.mergedEnd) {
        title += " (merged vocab)";
      }
      return title;
    })
    .append("text")
    // NOTE(review): "top" is not a defined text-anchor value
    // (start | middle | end); kept unchanged to preserve the rendering.
    .attr("text-anchor", "top")
    .attr("font-size", 12)
    .text(function(d) { return d; });

  var zoomListener = d3.behavior.zoom()
    .scaleExtent([0.1, 10])
    .center([0,0])
    .on("zoom", zoomHandler);
  zoomListener(svg);
}
197
// Current pan offset (tx, ty) and zoom factor (ss) of the map.
var tx = 0,
    ty = 0,
    ss = 1;
// Handle of the currently running setInterval timer; -1 until one is started.
var iter_id = -1;

/**
 * d3 zoom callback: stores the new translation and scale from the zoom
 * event and redraws the embedding with them.
 */
function zoomHandler() {
  var evt = d3.event;
  tx = evt.translate[0];
  ty = evt.translate[1];
  ss = evt.scale;
  updateEmbedding();
}
208
var stepnum = 0;

/**
 * Ends the t-SNE iteration and starts the label de-overlapping ("jitter")
 * phase.
 *
 * Stops the running timer, converts the current embedding into the
 * screen-coordinate label/anchor records expected by d3.labeler(), measures
 * the rendered label sizes and then schedules jitterStep() on a timer.
 */
function stopStep() {
  clearInterval(iter_id);
  text = svg.selectAll("text");

  // Start from a clean state so that a repeated run neither accumulates
  // stale anchors nor skips the jitter phase because the counter is
  // already exhausted.
  anchor_array = [];
  jitter_i = 0;

  // jitter function needs different data and co-ordinate representation
  labels = d3.range(data.words.length).map(function(i) {
    var x = (T.Y[i][0]*20*ss + tx) + mapWidth/2;
    var y = (T.Y[i][1]*20*ss + ty) + mapHeight/2;
    anchor_array.push({x: x, y: y, r: jitterRadius});
    return {
      x: x,
      y: y,
      name: data.words[i]
    };
  });

  // get the actual label bounding boxes for the jitter function
  // (getBBox() is queried once per label instead of twice)
  text.each(function(d, i) {
    var box = this.getBBox();
    labels[i].width = box.width;
    labels[i].height = box.height;
  });

  labeler = d3.labeler()
    .label(labels)
    .anchor(anchor_array)
    .width(mapWidth)
    .height(mapHeight)
    .update(applyJitter);

  iter_id = setInterval(jitterStep, 1);
}
248
// Number of jitter ticks that have been started so far.
var jitter_i = 0;

/**
 * One timer tick of the label de-overlapping phase: runs the labeler and
 * applies the resulting positions. Stops the timer once the tick counter
 * has exceeded 100.
 */
function jitterStep() {
  var ticksSoFar = jitter_i;
  jitter_i += 1;

  if (ticksSoFar > 100) {
    clearInterval(iter_id);
    return;
  }

  labeler.start2(10);
  applyJitter();
}
259
260 var last_cost=1000;
261
262 function step() {
263 var i = T.iter;
264
265 if(i > <%= $no_iterations %>) {
266 stopStep();
267 } else {
268 var cost = Math.round(T.step() * 100000) / 100000; // do a few steps
269 $("#cost").html("tsne iteration " + i + ", cost: " + cost.toFixed(5));
270 if(i % 250 == 0 && cost >= last_cost) {
271 stopStep();
272 } else {
273 last_cost = cost;
274 updateEmbedding();
275 }
276 }
277 }
278
279 function showMap(j) {
280 data=j;
281 T.iter=0;
282 T.initDataRaw(data.vecs); // init embedding
283 drawEmbedding(); // draw initial embedding
284
285 if(iter_id >= 0) {
286 clearInterval(iter_id);
287 }
288 //T.debugGrad();
289 iter_id = setInterval(step, 1);
290 if(<%= $show_som %>) {
291 makeSOM(j, <%= $no_iterations %>);
292 }
293 }
294
295 </script>
296 </head>
297 <body>
298 <form action="<%=url_for('/')->to_abs%>" method="GET">
299 word(s):
300 <input type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
301 % if($mergedEnd > 0) {
backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked, base vocabulary will be searched first. Otherwise merged vocabulary will be searched first.">
303 % }
304 max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
305 max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
306 SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
307 % if($collocators) {
308 <span> </span>sort collocators by
309 <select name="sort">
310 <option value="0" <%= ($sort!=1? "selected":"") %>>responsiveness</option>
311 <option value="1" <%= ($sort==1? "selected":"") %>>mean p</option>
312 </select>
313 % }
314 <span> </span><input type="submit" value="Show">
315 </form>
316 <br>
317 % if($lists) {
318 <div id="wrapper">
319 <table id="first">
320 <tr>
321 <th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th>
322 % if($collocators) {
<th title="Position in window around target word. Absolute value can be too low because of sub-sampling frequent words.">@</th><th align="right" title="&#34;Responsiveness&#34; of the collocator at the relative position @. Approximation of the probability that the combination of the target word and the collocator at the relative position @ comes from the corpus.">resp.</th><th title="Probability of the collocator at window location @." align="right">p(c<sub><small>@</small></sub>)</th><th align="right">Σp(c<sub><small>@</small></sub>)/|w|</th><th align="left">syntagmatic</th>
324 % }
325 </tr>
326 % my $j=0; my @words; my @vecs; my @ranks; my @marked;
327 % for my $list (@$lists) {
328 % my $i=0; while($list) {
329 % my $item = (@$list)[$i];
330 % my $c = ($collocators? (@$collocators)[$i] : 0);
331 % last if(!$c && !$item);
332 <tr>
333 <td align="right">
334 <%= ++$i %>.
335 </td>
336 % if($item) {
337 % if(!grep{$_ eq $item->{word}} @words) {
338 % push @vecs, $item->{vector};
339 % push @words, $item->{word};
340 % push @ranks, $item->{rank};
341 % push @marked, ($marked->{$item->{word}}? 1 : 0);
342 % }
343 <td align="right">
344 <%= sprintf("%.3f", $item->{dist}) %>
345 </td>
346 <td>
347 % my $class = ($marked->{$item->{word}}? "marked " : "");
348 % my $r = $item->{rank};
349 % if($r < $mergedEnd) {
350 % $class .= "merged";
351 % $r .= " (merged vocab)";
352 % } elsif($mergedEnd!=0 && $r > $mergedEnd) {
353 % $r -= $mergedEnd;
354 % }
355 <a class="<%= $class %>" title="freq. rank: <%= $r %>" href="/?word=<%= $item->{word} %>"><%= $item->{word} %></a>
356 </td>
357 % } else {
<td colspan="2"></td>
359 % }
360 % if($c) {
361 <td align="right">
362 <%= $c->{pos} %>:
363 </td>
364 <td align="right">
365 <%= sprintf("%.3f", $c->{dist}) %>
366 </td>
367 <td align="right">
368 <%= sprintf("%.3e", $c->{norm}) %>
369 </td>
370 <td align="right">
371 <%= sprintf("%.3e", $c->{sum}) %>
372 </td>
373 <td align="left">
<a href="/?word=<%= $c->{word} %>"><%= $c->{word} %></a>
376 </td>
377 % } else {
<td colspan="5"></td>
379 % }
380 </tr>
381 % }
382 % }
383 </table>
384 <script>
385 % use Mojo::ByteStream 'b';
386 $(window).load(function() {
387 showMap(<%= b(Mojo::JSON::to_json({target => " $word ", mergedEnd=> $mergedEnd, words => \@words, vecs => \@vecs, ranks => \@ranks, marked => \@marked})); %>);
388 });
389 </script>
390 % }
391 <div id="second" style="width:800px; height:800px; font-family: arial;">
392 <div id="embed">
393 </div>
394 </div>
395 <div id="cost"></div>
396 % if($show_som) {
397 <div id="som2">
398 </div>
399 <div id="sominfo1"><span id="somcolor1"> </span> <span id="somword1"> </span> <span id="somcolor2"> </span> <span id="somword2"> </span> <span id="somcolor3"> </span></div>
400 <div id="sominfo">SOM iteration <span id="iterations">0</span></div>
401 % }
402 </div>
403 % if($training_args) {
<p>
Word vector model trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters:
</p>
<pre><%= $training_args %></pre>
407 % }
408 </body>
409</html>