blob: 3c2ca3de5d51609680d57046859d75939a9cfc29 [file] [log] [blame]
Marc Kupietz83305222016-04-28 09:57:22 +02001<!DOCTYPE html>
2<html>
3 <head>
4 <title>DeReKo-Word-Vector-Distances: <%= $word %></title>
%# jQuery and jQuery UI are loaded over HTTPS so that browsers do not block
%# them as mixed content when this page itself is served over HTTPS.
<link rel="stylesheet" href="https://code.jquery.com/ui/1.12.1/themes/base/jquery-ui.css">
<script src="https://code.jquery.com/jquery-latest.min.js"></script>
<script
  src="https://code.jquery.com/ui/1.12.1/jquery-ui.min.js"
  integrity="sha256-VazP97ZCwtekAsvgPBSUwPFKdrwD3unUfSGVYrahUqU="
  crossorigin="anonymous"></script>
11 <script>
// Once the DOM is ready, enable jQuery UI tooltips for the whole document.
// Each tooltip shows the value of the hovered element's title attribute.
$(function() {
  var tooltipOptions = {
    content: function() {
      var hovered = $(this);
      return hovered.attr('title');
    }
  };
  $(document).tooltip(tooltipOptions);
})
19 </script>
20 <script src="//d3js.org/d3.v3.min.js" charset="utf-8"></script>
Marc Kupietzadaa1632017-07-04 14:10:29 +020021 <script src="/word2vec/js/tsne.js"></script>
22 <script src="/word2vec/js/som.js"></script>
23 <script src="/word2vec/js/labeler.js"></script>
Marc Kupietz83305222016-04-28 09:57:22 +020024 <style>
25 body, input {
26 font-family: Arial, sans-serif;
27 font-size: 11pt;
28 }
29
30 .ui-tooltip-content {
31 font-size: 9pt;
32 color: #222222;
33 }
34
35 svg > .ui-tooltip-content {
36 font-size: 8pt;
37 color: #222222;
38 }
39
40 a.merged {
41 color: green;
42 fill: green;
43 }
44
45 #first a {
46 text-decoration: none;
47 }
48
49 a.marked, #first a.marked {
50 text-decoration: underline;
51 }
Marc Kupietzf4b49392016-04-28 10:49:56 +020052
Marc Kupietz83305222016-04-28 09:57:22 +020053 a.target {
54 color: red;
55 fill: red;
56 }
57
58 #collocators {
59 margin-bottom: 15px;
60 }
61
#wrapper {
  width: 100%;
  /* debug outline: border: 1px solid red; */
  overflow: hidden; /* will contain if #first is longer than #second */
}
#first {
  margin-right: 20px;
  float: left;
  /* debug outline: border: 1px solid green; */
}
72 #second {
73 border: 1px solid #333;
74 overflow: hidden; /* if you don't want #second to wrap below #first */
75 }
76 #som2 svg {
77 border: 1px solid #333;
78 }
79
80 #cost {
81 font-size: 8pt;
82 color: #222222;
83 margin-top: 4px;
84 margin-bottom: 12px;
85 }
86
87 #sominfo1, #sominfo {
88 font-size: 8pt;
89 color: #222222;
90 margin-top: 0px;
91 }
92
93 #somcolor1, #somcolor2, #somcolor3 {
94 display: inline-block;
95 height: 10px;
96 width: 10px;
97 }
98
99 #third {
100 border: 1px solid #333;
101 }
102
103 </style>
104 <script>
105
106 var opt = {epsilon: <%= $epsilon %>, perplexity: <%= $perplexity %>},
107 mapWidth = 800, // width map
108 mapHeight = 800,
109 jitterRadius = 7;
110
111 var T = new tsnejs.tSNE(opt); // create a tSNE instance
112
113 var Y;
114
115 var data;
116 var labeler;
117
// Moves every word group (.tsnet) to the position held in its `labels`
// entry and writes that position back into the t-SNE solution T.Y by
// inverting the pan/zoom mapping (offset by half the map size and tx/ty,
// divided by ss and by 20).
function applyJitter() {
  var halfWidth = mapWidth/2,
      halfHeight = mapHeight/2;

  function placeLabel(label, idx) {
    var point = T.Y[idx];
    point[0] = (label.x - halfWidth - tx)/ss/20;
    point[1] = (label.y - halfHeight - ty)/ss/20;
    return "translate(" + label.x + "," + label.y + ")";
  }

  var groups = svg.selectAll('.tsnet').data(labels);
  groups.transition()
    .duration(50)
    .attr("transform", placeLabel);
}
131
// Repositions every word group (.tsnet) according to the current t-SNE
// solution, scaled by 20 and the zoom factor ss, shifted by the pan offset
// (tx, ty) and centred on the map.
function updateEmbedding() {
  var solution = T.getSolution();
  var centreX = mapWidth/2,
      centreY = mapHeight/2;

  function toTransform(word, idx) {
    var px = (solution[idx][0]*20*ss + tx) + centreX;
    var py = (solution[idx][1]*20*ss + ty) + centreY;
    return "translate(" + px + "," + py + ")";
  }

  svg.selectAll('.tsnet')
    .data(data.words)
    .attr("transform", toTransform);
}
141
142 var svg;
143 var labels = [];
144 var anchor_array = [];
145 var text;
146
// Rebuilds the SVG word map inside #embed from the global `data` object:
// one <g class="tsnet"> per entry of data.words, each holding a link with
// the word as its text. The link's class marks it as marked / target /
// merged, and its title carries the rank information shown as a tooltip.
// Finally a d3 zoom behaviour is attached to the new svg.
// Positions are not set here; updateEmbedding() and applyJitter() do that.
function drawEmbedding() {
  $("#embed").empty();
  var div = d3.select("#embed");

  svg = div.append("svg") // svg is global
    .attr("width", mapWidth)
    .attr("height", mapHeight);

  // The svg was just created and is empty, so the selection matches
  // nothing and every word ends up in the enter() selection.
  var g = svg.selectAll(".b")
    .data(data.words)
    .enter().append("g")
    .attr("class", "tsnet");

  // Formats a frequency rank with "," as thousands separator.
  function formatRank(rank) {
    return rank.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
  }

  g.append("a")
    .attr("xlink:href", function(word) {
      // Percent-encode the word so that characters such as & # % + cannot
      // corrupt the query string that data.urlprefix ends with.
      return data.urlprefix + encodeURIComponent(word);
    })
    .attr("class", function(d, i) {
      var res="";
      if(data.marked[i]) {
        res="marked ";
      }
      // data.target is the query padded with spaces, so this matches
      // whole words only.
      if(data.target.indexOf(" "+d+" ") >= 0) {
        return res+"target";
      } else if(data.ranks[i] < data.mergedEnd) {
        return res+"merged";
      } else {
        return res;
      }
    })
    .attr("title", function(d, i) {
      var title = "rank: "+i +" "+"freq. rank: "+formatRank(data.ranks[i]);
      // Only ranks below mergedEnd (when it is set) are flagged as merged.
      if(data.mergedEnd > 0 && !(data.ranks[i] >= data.mergedEnd)) {
        title += " (merged vocab)";
      }
      return title;
    })
    .append("text")
    // NOTE(review): "top" is not a valid text-anchor value (start, middle,
    // end); kept unchanged so rendering stays as it was.
    .attr("text-anchor", "top")
    .attr("font-size", 12)
    .text(function(d) { return d; });

  var zoomListener = d3.behavior.zoom()
    .scaleExtent([0.1, 10])
    .center([0,0])
    .on("zoom", zoomHandler);
  zoomListener(svg);
}
201
202 var tx=0, ty=0;
203 var ss=1;
204 var iter_id=-1;
205
// d3 zoom callback: stores the current pan offset (tx, ty) and zoom factor
// (ss) from the active d3 event, then redraws the embedding with them.
function zoomHandler() {
  var zoomEvent = d3.event;
  var pan = zoomEvent.translate;
  tx = pan[0];
  ty = pan[1];
  ss = zoomEvent.scale;
  updateEmbedding();
}
212
213 var stepnum = 0;
214
// Stops the running interval timer and switches to the label de-overlap
// ("jitter") phase: builds the label and anchor arrays from the current
// t-SNE positions, measures each rendered label, configures the d3 labeler
// and starts a new interval that repeatedly calls jitterStep().
function stopStep() {
  clearInterval(iter_id);
  text = svg.selectAll("text");

  // Start from a clean state so that a repeated run neither accumulates
  // anchors from a previous map nor skips the jitter pass because the
  // counter is already exhausted.
  anchor_array = [];
  jitter_i = 0;

  // jitter function needs different data and co-ordinate representation
  labels = d3.range(data.words.length).map(function(i) {
    var x = (T.Y[i][0]*20*ss + tx) + mapWidth/2;
    var y = (T.Y[i][1]*20*ss + ty) + mapHeight/2;
    anchor_array.push({x: x, y: y, r: jitterRadius});
    return {
      x: x,
      y: y,
      name: data.words[i]
    };
  });

  // get the actual label bounding boxes for the jitter function
  text.each(function(d, index) {
    var box = this.getBBox(); // measure once per label
    labels[index].width = box.width;
    labels[index].height = box.height;
  });

  labeler = d3.labeler()
    .label(labels)
    .anchor(anchor_array)
    .width(mapWidth)
    .height(mapHeight)
    .update(applyJitter);

  iter_id = setInterval(jitterStep, 1);
}
252
253 var jitter_i=0;
254
// One tick of the label de-overlap phase. While the tick counter has not
// exceeded 100, runs the labeler (start2(10)) and applies the resulting
// positions; after that it cancels the interval timer.
function jitterStep() {
  var tick = jitter_i;
  jitter_i += 1;

  if(tick <= 100) {
    labeler.start2(10);
    applyJitter();
    return;
  }
  clearInterval(iter_id);
}
263
264 var last_cost=1000;
265
266 function step() {
267 var i = T.iter;
268
269 if(i > <%= $no_iterations %>) {
270 stopStep();
271 } else {
272 var cost = Math.round(T.step() * 100000) / 100000; // do a few steps
273 $("#cost").html("tsne iteration " + i + ", cost: " + cost.toFixed(5));
274 if(i % 250 == 0 && cost >= last_cost) {
275 stopStep();
276 } else {
277 last_cost = cost;
278 updateEmbedding();
279 }
280 }
281 }
282
283 function showMap(j) {
284 data=j;
285 T.iter=0;
286 T.initDataRaw(data.vecs); // init embedding
287 drawEmbedding(); // draw initial embedding
288
289 if(iter_id >= 0) {
290 clearInterval(iter_id);
291 }
292 //T.debugGrad();
293 iter_id = setInterval(step, 1);
294 if(<%= $show_som %>) {
295 makeSOM(j, <%= $no_iterations %>);
296 }
297 }
298
299 </script>
300 </head>
301 <body>
Marc Kupietzb3422c12017-07-04 14:12:11 +0200302 <form method="GET">
Marc Kupietz83305222016-04-28 09:57:22 +0200303 word(s):
304 <input type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
305 % if($mergedEnd > 0) {
backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked, the base vocabulary will be searched first. Otherwise the merged vocabulary will be searched first.">
307 % }
308 max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
309 max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
310 SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
311 % if($collocators) {
312 <span> </span>sort collocators by
313 <select name="sort">
314 <option value="0" <%= ($sort!=1? "selected":"") %>>responsiveness</option>
315 <option value="1" <%= ($sort==1? "selected":"") %>>mean p</option>
316 </select>
317 % }
318 <span> </span><input type="submit" value="Show">
319 </form>
320 <br>
321 % if($lists) {
322 <div id="wrapper">
323 <table id="first">
324 <tr>
325 <th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th>
326 % if($collocators) {
<th title="Position in window around target word. Absolute value can be too low because of sub-sampling frequent words.">@</th><th align="right" title="&#34;Responsiveness&#34; of the collocator at the relative position @. Approximation of the probability that the combination of the target word and the collocator at the relative position @ comes from the corpus.">resp.</th><th title="Probability of the collocator at window location @." align="right">p(c<sub><small>@</small></sub>)</th><th align="right">Σp(c<sub><small>@</small></sub>)/|w|</th><th align="left">syntagmatic</th>
328 % }
329 </tr>
330 % my $j=0; my @words; my @vecs; my @ranks; my @marked;
331 % for my $list (@$lists) {
332 % my $i=0; while($list) {
333 % my $item = (@$list)[$i];
334 % my $c = ($collocators? (@$collocators)[$i] : 0);
335 % last if(!$c && !$item);
336 <tr>
337 <td align="right">
338 <%= ++$i %>.
339 </td>
340 % if($item) {
341 % if(!grep{$_ eq $item->{word}} @words) {
342 % push @vecs, $item->{vector};
343 % push @words, $item->{word};
344 % push @ranks, $item->{rank};
345 % push @marked, ($marked->{$item->{word}}? 1 : 0);
346 % }
347 <td align="right">
348 <%= sprintf("%.3f", $item->{dist}) %>
349 </td>
350 <td>
351 % my $class = ($marked->{$item->{word}}? "marked " : "");
352 % my $r = $item->{rank};
353 % if($r < $mergedEnd) {
354 % $class .= "merged";
355 % $r .= " (merged vocab)";
356 % } elsif($mergedEnd!=0 && $r > $mergedEnd) {
357 % $r -= $mergedEnd;
358 % }
Marc Kupietzf4b49392016-04-28 10:49:56 +0200359 <a class="<%= $class =%>"
360 title="freq. rank: <%= $r =%>"
361 href="<%= url_with->query([word => $item->{word}]) =%>">
362 <%= $item->{word} =%>
363 </a>
Marc Kupietz83305222016-04-28 09:57:22 +0200364 </td>
365 % } else {
366 <td colspan="2"/>
367 % }
%# Collocator columns of the current row: position, the three scores and a
%# link that searches for the collocator word. Rows without a collocator
%# get one empty cell spanning the same five columns.
% if($c) {
<td align="right">
<%= $c->{pos} %>:
</td>
<td align="right">
<%= sprintf("%.3f", $c->{dist}) %>
</td>
<td align="right">
<%= sprintf("%.3e", $c->{norm}) %>
</td>
<td align="right">
<%= sprintf("%.3e", $c->{sum}) %>
</td>
<td align="left">
<a href="<%= url_with->query([word => $c->{word}]) =%>">
<%= $c->{word} %>
</a>
</td>
% } else {
<td colspan="5"></td>
% }
388 </tr>
389 % }
390 % }
391 </table>
392 <script>
393 % use Mojo::ByteStream 'b';
Marc Kupietzf4b49392016-04-28 10:49:56 +0200394 % my $urlprefix = url_with->query([word=>'']);
Marc Kupietz83305222016-04-28 09:57:22 +0200395 $(window).load(function() {
Marc Kupietzf4b49392016-04-28 10:49:56 +0200396 showMap(<%= b(Mojo::JSON::to_json({target => " $word ", mergedEnd=> $mergedEnd, words => \@words, vecs => \@vecs, ranks => \@ranks, marked => \@marked, urlprefix => $urlprefix})); %>);
Marc Kupietz83305222016-04-28 09:57:22 +0200397 });
398 </script>
399 % }
400 <div id="second" style="width:800px; height:800px; font-family: arial;">
401 <div id="embed">
402 </div>
403 </div>
404 <div id="cost"></div>
405 % if($show_som) {
406 <div id="som2">
407 </div>
408 <div id="sominfo1"><span id="somcolor1"> </span> <span id="somword1"> </span> <span id="somcolor2"> </span> <span id="somword2"> </span> <span id="somcolor3"> </span></div>
409 <div id="sominfo">SOM iteration <span id="iterations">0</span></div>
410 % }
411 </div>
412 % if($training_args) {
413 <p>
414 Word vector model trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters: <pre><%= $training_args %></pre>
415 </p>
416 % }
417 </body>
418</html>