blob: 57ee2ab2e1facf5bd11b9b7c99dc70accb4929a8 [file] [log] [blame]
Marc Kupietz83305222016-04-28 09:57:22 +02001<!DOCTYPE html>
2<html>
3 <head>
<title>DeReKo-Word-Vector-Distances: <%= $word %></title>
<!--
  jQuery and jQuery UI are loaded over HTTPS so the page also works when it is
  itself served over HTTPS (a plain http:// script would be blocked as mixed
  content). jQuery is pinned to 1.11.1, the release the deprecated
  "jquery-latest" alias is frozen at, so the loaded code is unchanged.
-->
<link rel="stylesheet" href="https://code.jquery.com/ui/1.11.4/themes/smoothness/jquery-ui.css">
<script src="https://code.jquery.com/jquery-1.11.1.min.js"></script>
<script src="https://code.jquery.com/ui/1.11.4/jquery-ui.js"></script>
<script>
  // Once the DOM is ready, enable jQuery UI tooltips for the whole document.
  // The content callback returns the hovered element's title attribute.
  $(document).ready(function () {
    var tooltipOptions = {
      content: function () {
        var hovered = $(this);
        return hovered.attr('title');
      }
    };
    $(document).tooltip(tooltipOptions);
  });
</script>
<!-- d3 v3 (loaded over HTTPS instead of a protocol-relative URL), followed by
     the site-local t-SNE, SOM and label-placement scripts. -->
<script src="https://d3js.org/d3.v3.min.js" charset="utf-8"></script>
<script src="/word2vec/js/tsne.js"></script>
<script src="/word2vec/js/som.js"></script>
<script src="/word2vec/js/labeler.js"></script>
<style>
  /* Base typography for the page and its form controls. */
  body, input {
    font-family: Arial, sans-serif;
    font-size: 11pt;
  }

  /* jQuery UI tooltip text. */
  .ui-tooltip-content {
    font-size: 9pt;
    color: #222222;
  }

  svg > .ui-tooltip-content {
    font-size: 8pt;
    color: #222222;
  }

  /* Word classes; "fill" is set as well so the colour applies to SVG text. */
  a.merged {
    color: green;
    fill: green;
  }

  #first a {
    text-decoration: none;
  }

  a.marked, #first a.marked {
    text-decoration: underline;
  }

  a.target {
    color: red;
    fill: red;
  }

  #collocators {
    margin-bottom: 15px;
  }

  /* Two-column layout: #first floats left, #second takes the remaining width. */
  #wrapper {
    width: 100%;
    /* debug: border: 1px solid red; */
    overflow: hidden; /* will contain if #first is longer than #second */
  }

  #first {
    margin-right: 20px;
    float: left;
    /* debug: border: 1px solid green; */
  }

  #second {
    border: 1px solid #333;
    overflow: hidden; /* if you don't want #second to wrap below #first */
  }

  #som2 svg {
    border: 1px solid #333;
  }

  /* Status lines below the maps. */
  #cost {
    font-size: 8pt;
    color: #222222;
    margin-top: 4px;
    margin-bottom: 12px;
  }

  #sominfo1, #sominfo {
    font-size: 8pt;
    color: #222222;
    margin-top: 0px;
  }

  /* Small colour swatches in the SOM info line. */
  #somcolor1, #somcolor2, #somcolor3 {
    display: inline-block;
    height: 10px;
    width: 10px;
  }

  #third {
    border: 1px solid #333;
  }
</style>
101 <script>
102
103 var opt = {epsilon: <%= $epsilon %>, perplexity: <%= $perplexity %>},
104 mapWidth = 800, // width map
105 mapHeight = 800,
106 jitterRadius = 7;
107
108 var T = new tsnejs.tSNE(opt); // create a tSNE instance
109
110 var Y;
111
112 var data;
113 var labeler;
114
/**
 * Moves every ".tsnet" group to the position stored in the matching entry of
 * `labels` (50 ms transition) and writes that position back into T.Y,
 * converting from screen coordinates to embedding coordinates by undoing the
 * current pan (tx, ty), zoom (ss) and the fixed factor 20.
 */
function applyJitter() {
  function placeLabel(label, idx) {
    var point = T.Y[idx];
    point[0] = (label.x - mapWidth/2 - tx)/ss/20;
    point[1] = (label.y - mapHeight/2 - ty)/ss/20;
    return "translate(" + (label.x) + "," + (label.y) + ")";
  }

  var groups = svg.selectAll('.tsnet').data(labels);
  groups.transition()
    .duration(50)
    .attr("transform", placeLabel);
}
128
/**
 * Repositions every ".tsnet" group according to the current t-SNE solution.
 * Embedding coordinates are scaled by 20 and the zoom factor `ss`, shifted by
 * the pan offset (tx, ty) and centred in the map.
 */
function updateEmbedding() {
  var solution = T.getSolution();

  function toTransform(word, idx) {
    var px = (solution[idx][0]*20*ss + tx) + mapWidth/2;
    var py = (solution[idx][1]*20*ss + ty) + mapHeight/2;
    return "translate(" + px + "," + py + ")";
  }

  svg.selectAll('.tsnet')
    .data(data.words)
    .attr("transform", toTransform);
}
138
var svg;                // root <svg> selection, (re)created by drawEmbedding()
var labels = [];        // label boxes built in stopStep(), read by applyJitter()
var anchor_array = [];  // label anchors filled in stopStep()
var text;               // selection of all <text> nodes, set in stopStep()

/**
 * (Re)builds the map: empties #embed, appends a fresh SVG and creates one
 * <g class="tsnet"> per word containing a link with the word as text.
 *
 * Each link gets
 *   - an href of data.urlprefix + the URL-encoded word,
 *   - the classes "marked" (if data.marked[i]) plus "target" (word occurs in
 *     data.target) or "merged" (data.ranks[i] < data.mergedEnd),
 *   - a title with the list rank and the frequency rank.
 * Finally a d3 zoom behaviour is attached that calls zoomHandler().
 */
function drawEmbedding() {
  // Formats a number with "," as thousands separator, e.g. 1234567 -> "1,234,567".
  function withThousandsSeparator(n) {
    return n.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
  }

  // True if word i belongs to the merged vocabulary.
  function isMerged(i) {
    return data.ranks[i] < data.mergedEnd;
  }

  $("#embed").empty();
  var div = d3.select("#embed");

  svg = div.append("svg") // svg is global
    .attr("width", mapWidth)
    .attr("height", mapHeight);

  // The SVG was just created, so the selection is empty and enter() yields
  // one new group per word.
  var g = svg.selectAll(".tsnet")
    .data(data.words)
    .enter().append("g")
    .attr("class", "tsnet");

  g.append("a")
    .attr("xlink:href", function(word) {
      // Encode the word so characters such as "&", "#", "+" or "%" cannot
      // corrupt the query string.
      return data.urlprefix + encodeURIComponent(word);
    })
    .attr("class", function(d, i) {
      var res = data.marked[i] ? "marked " : "";
      if (data.target.indexOf(" " + d + " ") >= 0) {
        return res + "target";
      }
      if (isMerged(i)) {
        return res + "merged";
      }
      return res;
    })
    .attr("title", function(d, i) {
      var title = "rank: " + i + " " + "freq. rank: " + withThousandsSeparator(data.ranks[i]);
      if (data.mergedEnd > 0 && isMerged(i)) {
        title += " (merged vocab)";
      }
      return title;
    })
    .append("text")
    // "top" is not a valid text-anchor value and was ignored; "start" is the
    // value that was effectively in use.
    .attr("text-anchor", "start")
    .attr("font-size", 12)
    .text(function(d) { return d; });

  var zoomListener = d3.behavior.zoom()
    .scaleExtent([0.1, 10])
    .center([0,0])
    .on("zoom", zoomHandler);
  zoomListener(svg);
}
198
// Current pan offset and zoom factor of the map; updated by zoomHandler().
var tx = 0;
var ty = 0;
var ss = 1;

// Id of the currently running setInterval timer, -1 while none was started.
var iter_id = -1;

/**
 * d3 zoom callback: copies translation and scale from the current d3 event
 * into tx/ty/ss and redraws the embedding.
 */
function zoomHandler() {
  var zoomEvent = d3.event;
  var offset = zoomEvent.translate;
  tx = offset[0];
  ty = offset[1];
  ss = zoomEvent.scale;
  updateEmbedding();
}
209
var stepnum = 0;

/**
 * Ends the t-SNE phase and starts the label de-overlapping ("jitter") phase.
 *
 * Stops the running interval, converts the current embedding into the
 * screen-coordinate label/anchor representation expected by d3.labeler(),
 * measures every label's bounding box and then schedules jitterStep().
 *
 * Anchors and the jitter counter are reset on every call so that a second run
 * does not inherit anchors or an exhausted counter from the previous one.
 */
function stopStep() {
  clearInterval(iter_id);
  text = svg.selectAll("text");

  // Start from a clean state; previously the anchors accumulated across calls.
  anchor_array = [];
  jitter_i = 0;

  // jitter function needs different data and co-ordinate representation
  labels = d3.range(data.words.length).map(function(i) {
    var x = (T.Y[i][0]*20*ss + tx) + mapWidth/2;
    var y = (T.Y[i][1]*20*ss + ty) + mapHeight/2;
    anchor_array.push({x: x, y: y, r: jitterRadius});
    return {
      x: x,
      y: y,
      name: data.words[i]
    };
  });

  // get the actual label bounding boxes for the jitter function
  // (getBBox() is evaluated once per label instead of twice)
  var index = 0;
  text.each(function() {
    var box = this.getBBox();
    labels[index].width = box.width;
    labels[index].height = box.height;
    index += 1;
  });

  labeler = d3.labeler()
    .label(labels)
    .anchor(anchor_array)
    .width(mapWidth)
    .height(mapHeight)
    .update(applyJitter);

  iter_id = setInterval(jitterStep, 1);
}
249
// Number of jitter rounds executed so far.
var jitter_i = 0;

/**
 * One tick of the label jitter phase: runs labeler.start2(10) and applies the
 * result, or stops the interval once the counter has passed 100.
 */
function jitterStep() {
  var round = jitter_i++;
  if (round > 100) {
    clearInterval(iter_id);
    return;
  }
  labeler.start2(10);
  applyJitter();
}
260
261 var last_cost=1000;
262
263 function step() {
264 var i = T.iter;
265
266 if(i > <%= $no_iterations %>) {
267 stopStep();
268 } else {
269 var cost = Math.round(T.step() * 100000) / 100000; // do a few steps
270 $("#cost").html("tsne iteration " + i + ", cost: " + cost.toFixed(5));
271 if(i % 250 == 0 && cost >= last_cost) {
272 stopStep();
273 } else {
274 last_cost = cost;
275 updateEmbedding();
276 }
277 }
278 }
279
280 function showMap(j) {
281 data=j;
282 T.iter=0;
283 T.initDataRaw(data.vecs); // init embedding
284 drawEmbedding(); // draw initial embedding
285
286 if(iter_id >= 0) {
287 clearInterval(iter_id);
288 }
289 //T.debugGrad();
290 iter_id = setInterval(step, 1);
291 if(<%= $show_som %>) {
292 makeSOM(j, <%= $no_iterations %>);
293 }
294 }
295
296 </script>
297 </head>
298 <body>
<form method="GET">
  <label>word(s):
    <input type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
  </label>
  % if($mergedEnd > 0) {
  <label>backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked, the base vocabulary will be searched first. Otherwise the merged vocabulary will be searched first."></label>
  % }
  <label>max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>"></label>
  <label>max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>"></label>
  <label>SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>></label>
  % if($collocators) {
  <span> </span><label>sort collocators by
  <select name="sort">
    <option value="0" <%= ($sort!=1? "selected":"") %>>responsiveness</option>
    <option value="1" <%= ($sort==1? "selected":"") %>>mean p</option>
  </select></label>
  % }
  <span> </span><input type="submit" value="Show">
</form>
<br>
318 % if($lists) {
319 <div id="wrapper">
320 <table id="first">
<tr>
  <th align="right">#</th>
  <th align="right">cos</th>
  <th align="left">paradigmatic</th>
  % if($collocators) {
  <th title="Position in window around target word. Absolute value can be too low because of sub-sampling frequent words.">@</th>
  <th align="right" title="&#34;Responsiveness&#34; of the collocator at the relative position @. Approximation of the probability that the combination of the target word and the collocator at the relative position @ come from the corpus.">resp.</th>
  <th title="Probability of the collocator at window location @." align="right">p(c<sub><small>@</small></sub>)</th>
  <th align="right">Σp(c<sub><small>@</small></sub>)/|w|</th>
  <th align="left">syntagmatic</th>
  % }
</tr>
%# One table row per index: paradigmatic neighbour ($item) on the left and,
%# if available, the collocator ($c) with the same index on the right.
%# While iterating, the distinct words and their vectors, ranks and marked
%# flags are collected for the showMap() call further down.
% my (@words, @vecs, @ranks, @marked);
% my %seen_word;    # words already collected (replaces a linear grep per row)
% for my $list (@$lists) {
%   my $i = 0;
%   while($list) {
%     my $item = (@$list)[$i];
%     my $c = ($collocators ? (@$collocators)[$i] : 0);
%     # Stop as soon as neither column has an entry for this index.
%     last if(!$c && !$item);
<tr>
  <td align="right">
    <%= ++$i %>.
  </td>
%     if($item) {
%       if(!$seen_word{$item->{word}}++) {
%         push @vecs, $item->{vector};
%         push @words, $item->{word};
%         push @ranks, $item->{rank};
%         push @marked, ($marked->{$item->{word}} ? 1 : 0);
%       }
  <td align="right">
    <%= sprintf("%.3f", $item->{dist}) %>
  </td>
  <td>
%       my $class = ($marked->{$item->{word}} ? "marked " : "");
%       my $r = $item->{rank};
%       if($r < $mergedEnd) {
%         $class .= "merged";
%         $r .= " (merged vocab)";
%       } elsif($mergedEnd!=0 && $r > $mergedEnd) {
%         $r -= $mergedEnd;
%       }
    <a class="<%= $class =%>"
       title="freq. rank: <%= $r =%>"
       href="<%= url_with->query([word => $item->{word}]) =%>">
      <%= $item->{word} =%>
    </a>
  </td>
%     } else {
  <td colspan="2"></td>
%     }
%     if($c) {
  <td align="right">
    <%= $c->{pos} %>:
  </td>
  <td align="right">
    <%= sprintf("%.3f", $c->{dist}) %>
  </td>
  <td align="right">
    <%= sprintf("%.3e", $c->{norm}) %>
  </td>
  <td align="right">
    <%= sprintf("%.3e", $c->{sum}) %>
  </td>
  <td align="left">
    <a href="<%= url_with->query([word => $c->{word}]) =%>">
      <%= $c->{word} =%>
    </a>
  </td>
%     } else {
  <td colspan="5"></td>
%     }
</tr>
%   }
% }
388 </table>
<script>
  % use Mojo::ByteStream 'b';
  %# URL of the current request with an empty "word" parameter; the client
  %# appends the clicked word to it.
  % my $urlprefix = url_with->query([word=>'']);
  // Start the visualisation once the window has finished loading.
  // .on("load", ...) replaces the deprecated .load(handler) shorthand, which
  // no longer exists in jQuery 3.
  $(window).on("load", function() {
    showMap(<%= b(Mojo::JSON::to_json({target => " $word ", mergedEnd=> $mergedEnd, words => \@words, vecs => \@vecs, ranks => \@ranks, marked => \@marked, urlprefix => $urlprefix})); %>);
  });
</script>
396 % }
<!-- t-SNE map: drawEmbedding() renders its SVG into #embed. -->
<div id="second" style="width:800px; height:800px; font-family: arial;">
  <div id="embed">
  </div>
</div>
<!-- step() writes the current iteration and cost here. -->
<div id="cost"></div>
% if($show_som) {
<div id="som2">
</div>
<div id="sominfo1"><span id="somcolor1"> </span> <span id="somword1"> </span> <span id="somcolor2"> </span> <span id="somword2"> </span> <span id="somcolor3"> </span></div>
<div id="sominfo">SOM iteration <span id="iterations">0</span></div>
% }
%# #wrapper is only opened when $lists is set, so only close it in that case.
% if($lists) {
</div>
% }
%# <pre> must not be nested inside <p>; the paragraph is closed before it.
% if($training_args) {
<p>
  Word vector model trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters:
</p>
<pre><%= $training_args %></pre>
% }
414 </body>
415</html>