blob: 1a2aa10f7d278767b9d2622499b521b3540e899b [file] [log] [blame]
Marc Kupietz83305222016-04-28 09:57:22 +02001<!DOCTYPE html>
2<html>
3 <head>
4 <title>DeReKo-Word-Vector-Distances: <%= $word %></title>
%# Third-party assets. Everything is fetched over https so the page also works
%# when it is itself served over https (plain-http scripts are blocked there as
%# mixed content). Load order is unchanged: jQuery, DataTables, FixedColumns,
%# jQuery UI; the jQuery UI theme precedes the DataTables stylesheet.
<link rel="stylesheet" href="https://code.jquery.com/ui/1.12.1/themes/base/jquery-ui.css">
<script src="https://code.jquery.com/jquery-latest.min.js"></script>
<script src="https://cdn.datatables.net/1.10.16/js/jquery.dataTables.min.js"></script>
<script src="https://cdn.datatables.net/fixedcolumns/3.2.3/js/dataTables.fixedColumns.min.js"></script>
<link rel="stylesheet" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.min.css">
<script
  src="https://code.jquery.com/ui/1.12.1/jquery-ui.min.js"
  integrity="sha256-VazP97ZCwtekAsvgPBSUwPFKdrwD3unUfSGVYrahUqU="
  crossorigin="anonymous"></script>
14 <script>
$(document).ready(function() {
  // Option object shared by both result tables; only the height of the
  // scrollable body differs between them.
  function tableOptions(scrollHeight) {
    return {
      "sScrollY": scrollHeight,
      "bScrollCollapse": true,
      "bPaginate": false,
      "bJQueryUI": true,
      "dom": '<"top">rt<"bottom"flp><"clear">',
      "aoColumnDefs": [
        { "sWidth": "10%", "aTargets": [ -1 ] }
      ]
    };
  }

  // Re-fits the column widths of a DataTable inside the given tab panel.
  function adjustColumns(event, ui) {
    var oTable = $('div.dataTables_scrollBody>table.display', ui.panel).dataTable();
    if ( oTable.length > 0 ) {
      oTable.fnAdjustColumnSizing();
    }
  }

  // Neither "#xxxtabs" nor ".selector" matches an element of this template,
  // so these two calls currently operate on empty selections.
  $("#xxxtabs").tabs({ "show": adjustColumns });
  $(".selector").tabs({ active: 1 });

  $('#firsttable').DataTable(tableOptions("760px"));
  $('#secondtable').DataTable(tableOptions("800px"));
});
Marc Kupietz0af83e32017-11-27 09:31:37 +010049
// Turn the #tabs container into a jQuery UI tabs widget once the DOM is ready.
$(document).ready(function initTabs() {
  $("#tabs").tabs();
});

// Enable jQuery UI tooltips for the whole document. The tooltip body is the
// raw value of the element's title attribute.
$(document).ready(function initTooltips() {
  var titleAsContent = function() {
    return $(this).attr('title');
  };
  $(document).tooltip({ content: titleAsContent });
});
Marc Kupietz694610d2017-11-25 18:30:03 +010061
Marc Kupietz83305222016-04-28 09:57:22 +020062 </script>
%# d3 v3 plus the local t-SNE, SOM and label-placement scripts used by the
%# inline code of this page. d3 is requested over https explicitly instead of
%# through a protocol-relative URL.
<script src="https://d3js.org/d3.v3.min.js" charset="utf-8"></script>
<script src="/derekovecs/js/tsne.js"></script>
<script src="/derekovecs/js/som.js"></script>
<script src="/derekovecs/js/labeler.js"></script>
<style>
  /* ---- base typography ---- */
  body, input {
    font-family: Arial, sans-serif;
    font-size: 11pt;
  }

  .mono {
    font-family: "DejaVu Sans Mono", Inconsolata, SourceCodePro, Courier;
  }

  /* ---- jQuery UI tooltips ---- */
  .ui-tooltip-content {
    font-size: 9pt;
    color: #222222;
  }

  svg > .ui-tooltip-content {
    font-size: 8pt;
    color: #222222;
  }

  /* ---- word links (HTML table and SVG map) ---- */
  a.merged {
    color: green;
    fill: green;
  }

  #first a {
    text-decoration: none;
  }

  a.marked, #first a.marked {
    text-decoration: underline;
  }

  a.target {
    color: red;
    fill: red;
  }

  /* ---- DataTables ---- */
  table.display {
    /* width: 40% !important;
       This declaration used to be written "width: 40% important!", which is
       invalid CSS and was therefore ignored by browsers. It is kept disabled
       so that the rendering does not change. */
    margin: 0; /* <- works for me this way ****/
  }
  table.dataTable thead th, table.dataTable thead td, table.dataTable tbody td {
    padding: 2px 2px;
    /* border-bottom: 1px solid #111; */
  }

  /* ---- page layout ---- */
  #collocators {
    margin-bottom: 15px;
  }

  #wrapper {
    width: 100%;
    /* border: 1px solid red; */
    overflow: hidden; /* will contain if #first is longer than #second */
  }
  #first {
    margin-right: 20px;
    float: left;
    /* border: 1px solid green; */
  }
  #second {
    border: 1px solid #333;
    overflow: hidden; /* if you don't want #second to wrap below #first */
  }
  #som2 svg {
    border: 1px solid #333;
  }

  /* ---- status lines ---- */
  #cost {
    font-size: 8pt;
    color: #222222;
    margin-top: 4px;
    margin-bottom: 12px;
  }

  #sominfo1, #sominfo {
    font-size: 8pt;
    color: #222222;
    margin-top: 0px;
  }

  #somcolor1, #somcolor2, #somcolor3 {
    display: inline-block;
    height: 10px;
    width: 10px;
  }

  #third {
    border: 1px solid #333;
  }

</style>
159 <script>
160
161 var opt = {epsilon: <%= $epsilon %>, perplexity: <%= $perplexity %>},
162 mapWidth = 800, // width map
163 mapHeight = 800,
164 jitterRadius = 7;
165
166 var T = new tsnejs.tSNE(opt); // create a tSNE instance
167
168 var Y;
169
170 var data;
171 var labeler;
172
// Moves every ".tsnet" group to the position stored in the matching entry of
// `labels` (50 ms transition) and writes that position back into T.Y,
// converted from screen coordinates using the current pan and zoom.
function applyJitter() {
  var groups = svg.selectAll('.tsnet').data(labels);
  var moving = groups.transition().duration(50);

  moving.attr("transform", function(label, idx) {
    var point = T.Y[idx];
    point[0] = (label.x - mapWidth/2 - tx)/ss/20;
    point[1] = (label.y - mapHeight/2 - ty)/ss/20;
    return "translate(" + label.x + "," + label.y + ")";
  });
}
186
// Positions every ".tsnet" group at the current t-SNE solution, scaled by 20
// and the zoom factor, shifted by the pan offset and centred on the map.
function updateEmbedding() {
  var solution = T.getSolution();

  function placement(word, idx) {
    var px = (solution[idx][0]*20*ss + tx) + mapWidth/2;
    var py = (solution[idx][1]*20*ss + ty) + mapHeight/2;
    return "translate(" + px + "," + py + ")";
  }

  svg.selectAll('.tsnet')
    .data(data.words)
    .attr("transform", placement);
}
196
var svg;                // root <svg> selection, (re)created by drawEmbedding()
var labels = [];        // label records filled in by stopStep()
var anchor_array = [];  // label anchors filled in by stopStep()
var text;               // selection of the <text> nodes, set by stopStep()

// Rebuilds the SVG map inside #embed: one <g class="tsnet"> per entry of
// data.words, each holding a link with the word as its text, and installs the
// zoom/pan behaviour. Positions are assigned later by updateEmbedding().
function drawEmbedding() {
  $("#embed").empty();
  var div = d3.select("#embed");

  svg = div.append("svg") // svg is global
    .attr("width", mapWidth)
    .attr("height", mapHeight);

  var g = svg.selectAll(".b")
    .data(data.words)
    .enter().append("g")
    .attr("class", "tsnet");

  // Frequency rank of word i with "," as thousands separator.
  function formattedRank(i) {
    return data.ranks[i].toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
  }

  g.append("a")
    .attr("xlink:href", function(word) {
      // The word becomes the value of a query parameter, so it has to be
      // percent-encoded; otherwise characters such as "&", "#" or "+" would
      // change the meaning of the URL.
      return (data.urlprefix + encodeURIComponent(word));
    })
    .attr("class", function(d, i) {
      var res = "";
      if(data.marked[i]) {
        res = "marked ";
      }
      if(data.target.indexOf(" "+d+" ") >= 0) {
        return res+"target";
      } else if(data.ranks[i] < data.mergedEnd) {
        return res+"merged";
      } else {
        return res;
      }
    })
    .attr("title", function(d, i) {
      var title = "rank: "+i +" "+"freq. rank: "+formattedRank(i);
      // Only words ranked below mergedEnd get the suffix.
      if(data.mergedEnd > 0 && data.ranks[i] < data.mergedEnd) {
        title += " (merged vocab)";
      }
      return title;
    })
    .append("text")
    // "top" is not a valid text-anchor value and was ignored by browsers;
    // "start" is the initial value that applied instead.
    .attr("text-anchor", "start")
    .attr("font-size", 12)
    .text(function(d) { return d; });

  var zoomListener = d3.behavior.zoom()
    .scaleExtent([0.1, 10])
    .center([0,0])
    .on("zoom", zoomHandler);
  zoomListener(svg);
}
256
// Current pan offset (tx, ty) and zoom factor (ss) of the map, plus the id of
// the interval timer that is currently running (-1 before the first one).
var tx = 0,
    ty = 0;
var ss = 1;
var iter_id = -1;

// d3 zoom callback: stores the new pan/zoom state and redraws the map.
function zoomHandler() {
  var shift = d3.event.translate;
  tx = shift[0];
  ty = shift[1];
  ss = d3.event.scale;
  updateEmbedding();
}
267
var stepnum = 0;

// Ends the running interval and starts the label "jitter" phase: builds the
// label and anchor lists from the current embedding, measures the rendered
// labels, sets up the d3 labeler and schedules jitterStep().
function stopStep() {
  clearInterval(iter_id);
  text = svg.selectAll("text");

  // Start from a clean state. anchor_array is a global that is only ever
  // appended to, and jitter_i only counts upwards, so without this reset a
  // second run would see the anchors of the previous one and would not
  // jitter at all.
  anchor_array.length = 0;
  jitter_i = 0;

  // jitter function needs different data and co-ordinate representation
  labels = d3.range(data.words.length).map(function(i) {
    var x = (T.Y[i][0]*20*ss + tx) + mapWidth/2;
    var y = (T.Y[i][1]*20*ss + ty) + mapHeight/2;
    anchor_array.push({x: x, y: y, r: jitterRadius});
    return {
      x: x,
      y: y,
      name: data.words[i]
    };
  });

  // get the actual label bounding boxes for the jitter function
  var index = 0;
  text.each(function() {
    var box = this.getBBox(); // measure once per label
    labels[index].width = box.width;
    labels[index].height = box.height;
    index += 1;
  });

  labeler = d3.labeler()
    .label(labels)
    .anchor(anchor_array)
    .width(mapWidth)
    .height(mapHeight)
    .update(applyJitter);

  iter_id = setInterval(jitterStep, 1);
}
307
// Number of jitter rounds started so far.
var jitter_i = 0;

// One timer tick of the jitter phase: runs the labeler and applies the
// result, or cancels the timer once the counter has passed 100.
function jitterStep() {
  var round = jitter_i;
  jitter_i += 1;

  if (round > 100) {
    clearInterval(iter_id);
    return;
  }

  labeler.start2(10);
  applyJitter();
}
318
319 var last_cost=1000;
320
321 function step() {
322 var i = T.iter;
323
324 if(i > <%= $no_iterations %>) {
325 stopStep();
326 } else {
327 var cost = Math.round(T.step() * 100000) / 100000; // do a few steps
328 $("#cost").html("tsne iteration " + i + ", cost: " + cost.toFixed(5));
329 if(i % 250 == 0 && cost >= last_cost) {
330 stopStep();
331 } else {
332 last_cost = cost;
333 updateEmbedding();
334 }
335 }
336 }
337
338 function showMap(j) {
339 data=j;
340 T.iter=0;
341 T.initDataRaw(data.vecs); // init embedding
342 drawEmbedding(); // draw initial embedding
343
344 if(iter_id >= 0) {
345 clearInterval(iter_id);
346 }
347 //T.debugGrad();
348 iter_id = setInterval(step, 1);
349 if(<%= $show_som %>) {
350 makeSOM(j, <%= $no_iterations %>);
351 }
352 }
// The "word" text field of the search form; assigned by onload().
var queryword;

// Invoked through the onload attribute of <body>: remembers the search field.
function onload() {
  var field = document.getElementById('word');
  queryword = field;
}
358
// Opens KorAP (window name "KorAP") with the content of the word field as
// the query.
function queryKorAP() {
  // Fall back to a direct lookup in case onload() has not assigned
  // queryword (yet).
  var field = queryword || document.getElementById('word');
  // The query is a URL parameter value and must be percent-encoded;
  // otherwise "&", "#" or "+" in the input would cut off or alter the query.
  window.open('http://korap.ids-mannheim.de/kalamar/?q=' + encodeURIComponent(field.value), 'KorAP');
}
Marc Kupietz4dc270c2017-11-24 10:17:12 +0100362
// Opens KorAP (window name "KorAP") with `query` in the "cosmas2" query
// language.
function queryKorAPCII(query) {
  // Percent-encode the query so that spaces, "/", "&" etc. arrive unchanged.
  window.open('http://korap.ids-mannheim.de/kalamar/?ql=cosmas2&q=' + encodeURIComponent(query), 'KorAP');
}
Marc Kupietz83305222016-04-28 09:57:22 +0200366 </script>
367 </head>
Marc Kupietz39179ab2017-07-04 16:28:06 +0200368 <body onload="onload()">
%# Search form. All fields are sent as GET parameters to the current URL.
%# Every control sits inside a <label> so that its caption is associated
%# with it.
<form method="GET">
  <label>word(s):
    <input id="word" type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
  </label>
  <label>cut-off:
    <input id="cutoff" type="text" name="cutoff" size="10" value="<%= $cutoff %>" title="Only consider the most frequent x word forms.">
  </label>
  <label>dedupe <input type="checkbox" name="dedupe" value="1" <%= ($dedupe ? "checked" : "") %> title="radically filter out any near-duplicates"></label>
  % if($mergedEnd > 0) {
  <label>backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked base vocabulary will be searched first. Otherwise merged vocabulary will be searched first."></label>
  % }
  <label>max. neighbours: <input type="text" size="4" name="n" value="<%= $no_nbs %>"></label>
  <label>max. iterations: <input type="text" name="N" size="4" value="<%= $no_iterations %>"></label>
  <label>SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>></label>
  % if($collocators) {
  <span> </span><label>window/sort
    <select name="sort">
      <option value="0" <%= ($sort!=1 && $sort!=2? "selected":"") %>>auto focus</option>
      <option value="1" <%= ($sort==1? "selected":"") %>>any single position</option>
      <option value="2" <%= ($sort==2? "selected":"") %>>whole window</option>
    </select>
  </label>
  % }
  <span> </span><input type="submit" value="Show">
  <span> </span><input type="button" value="→ KorAP" onclick="queryKorAP();" title="query word with KorAP">
</form>
392 <br>
%# jQuery UI tabs container: the list holds the tab captions, the panels are
%# #tabs-1 and #tabs-2. #tabs, #tabs-1 and #mytable are closed further down.
<div id="tabs">
  <ul>
    <li><a href="#tabs-1">Semantics</a></li>
    <li><a href="#tabs-2">Syntagmatic (collocators)</a></li>
  </ul>
  <div id="tabs-1">
    %# A <div> cannot be self-closed in HTML: "<div .../>" is parsed as an
    %# opening tag, so it is written as one.
    <div id="mytable">
%# Neighbour list. If at least one result list exists, render the table of
%# paradigmatic neighbours and hand the collected words, vectors and ranks to
%# showMap(); otherwise show an error message. In both branches #wrapper is
%# left open: it is closed after the map container that follows this block.
% if($lists && (@$lists) > 0 && (@$lists)[0]) {
<div id="wrapper">
  <div id="first" style="width:220px">
    <table class="display compact nowrap" id="firsttable">
      <thead>
        <tr>
          <th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th>
        </tr>
      </thead>
      <tbody>
        %# @words/@vecs/@ranks/@marked collect every distinct word once, in
        %# order of first appearance; %seen tracks what was collected already.
        % my @words; my @vecs; my @ranks; my @marked; my %seen;
        % for my $list (@$lists) {
        %   my $i=0; while($list) {
        %     my $item = (@$list)[$i];
        %     my $c = ($collocators? (@$collocators)[$i] : 0);
        %     last if(!$c && !$item);
        <tr>
          <td align="right">
            <%= ++$i %>.
          </td>
          % if($item) {
          %   if(!$seen{$item->{word}}++) {
          %     push @vecs, $item->{vector};
          %     push @words, $item->{word};
          %     push @ranks, $item->{rank};
          %     push @marked, ($marked->{$item->{word}}? 1 : 0);
          %   }
          <td align="right">
            <%= sprintf("%.3f", $item->{dist}) %>
          </td>
          <td>
            % my $class = ($marked->{$item->{word}}? "marked " : "");
            % my $r = $item->{rank};
            %# NOTE(review): a rank equal to $mergedEnd is neither flagged as
            %# merged nor shifted -- confirm that this is intended.
            % if($r < $mergedEnd) {
            %   $class .= "merged";
            %   $r .= " (merged vocab)";
            % } elsif($mergedEnd!=0 && $r > $mergedEnd) {
            %   $r -= $mergedEnd;
            % }
            <a class="<%= $class =%>"
               title="freq. rank: <%= $r =%>"
               href="<%= url_with->query([word => $item->{word}]) =%>">
              <%= $item->{word} =%>
            </a>
          </td>
          % } else {
          <td colspan="2"></td>
          % }
        </tr>
        %     last if($i >= 100);
        %   }
        % }
      </tbody>
    </table>
  </div>
  <script>
    % use Mojo::ByteStream 'b';
    % my $urlprefix = url_with->query([word=>'']);
    // .on("load", ...) replaces the .load() shorthand, which no longer
    // exists in jQuery 3.
    $(window).on("load", function() {
      showMap(<%= b(Mojo::JSON::to_json({target => " $word ", mergedEnd=> $mergedEnd, words => \@words, vecs => \@vecs, ranks => \@ranks, marked => \@marked, urlprefix => $urlprefix})) %>);
    });
  </script>
% } else { # ($word && $word !~ /^\s*$/)
<div id="wrapper">
  <p>
    ERROR: "<%= $word %>" not found in vocabulary.
  </p>
  %# #wrapper stays open here exactly as in the branch above. Closing it in
  %# this branch only made the closing tags that follow end #tabs-1 and #tabs
  %# too early, which left the collocator panel outside the tabs widget.
% }
%# Container of the t-SNE map; drawEmbedding() puts the <svg> into #embed.
%# NOTE(review): the id "second" is used again for the collocator container
%# inside #tabs-2.
<div id="second" style="width:800px; height:800px; font-family: arial;">
  <div id="embed">
  </div>
</div>
%# This closes #wrapper when the result list above was rendered.
</div>
%# step() writes the iteration number and the cost into #cost.
<div id="cost"></div>
%# The next two close #mytable and #tabs-1 when the result list was rendered.
</div>
</div>
%# Second tab: up to 100 syntagmatic neighbours (collocators).
<div id="tabs-2">
  %# NOTE(review): the id "second" is also used by the t-SNE map container in
  %# the first tab; ids should be unique. It is kept because the #second style
  %# rule applies to this element as well.
  <div id="second" style="width:500px">
    <table class="display compact nowrap" id="secondtable">
      <thead>
        <tr>
          % if($collocators) {
          <th>#</th>
          <th align="right" title="The window around the target word that is considered for summation.">w'</th>
          <th align="right" title="Raw (max.) activation of the collocator in the output layers.">a</th>
          <th title="Σp(c<sub><small>@</small></sub>) – Sum of the probability approximations that the combination of the target word and the collocator at the relative position @ come from the training corpus. Single approximations can be distorted because of sub-sampling frequent words and the sum cannot itself be interpreted as probability." align="right">Σp</th>
          <th align="right">Σp/|w|</th>
          <th title="c" align="left">collocator</th>
          % }
        </tr>
      </thead>
      <tbody>
        % for(my $i=0; $i < 100; $i++) {
        %   my $c = ($collocators? (@$collocators)[$i] : 0);
        <tr>
          <td align="right">
            %# Print the 1-based position without changing $i. The loop header
            %# already increments it; incrementing it here as well skipped
            %# every second collocator.
            <%= $i + 1 %>.
          </td>
          % if($c) {
          <td align="right">
            <span class="mono"><%= bitvec2window($c->{pos}) %></span>
          </td>
          <td align="right">
            <%= sprintf("%.3f", $c->{dist}) %>
          </td>
          <td align="right">
            <%= sprintf("%.3e", $c->{norm}) %>
          </td>
          <td align="right">
            <%= sprintf("%.3e", $c->{sum}) %>
          </td>
          <td align="left">
            %# The query travels in a data attribute instead of being pasted
            %# into a JavaScript string literal, so a quote inside a word can
            %# neither break nor extend the handler code.
            <a data-query="<%= "$c->{word} /w5 $word" %>"
               onclick="queryKorAPCII(this.getAttribute('data-query'))"
               title="freq. rank: <%= $c->{rank} =%>">
              <%= $c->{word} %>
            </a>
          </td>
          % } else {
          <td colspan="5"></td>
          % }
        </tr>
        % }
      </tbody>
    </table>
  </div>
</div>
%# closes #tabs
</div>
%# Output area of the self-organising map, rendered only when SOM is enabled.
%# A closing </div> that used to follow this section had no matching open
%# element and has been removed.
% if($show_som) {
<div id="som2">
</div>
<div id="sominfo1"><span id="somcolor1"> </span> <span id="somword1"> </span> <span id="somcolor2"> </span> <span id="somword2"> </span> <span id="somcolor3"> </span></div>
<div id="sominfo">SOM iteration <span id="iterations">0</span></div>
% }
%# Footer with the training parameters of the model. <pre> may not be nested
%# inside <p> (the parser ends the paragraph in front of it and creates an
%# empty one for the stray end tag), so it follows the paragraph instead.
% if($training_args) {
<p>
  Word vector model trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters:
</p>
<pre><%= $training_args %></pre>
% }
539 </body>
540</html>