blob: 0f487182b04c78dcf1342cac8f51ed29b6dbc3eb [file] [log] [blame]
Marc Kupietz83305222016-04-28 09:57:22 +02001<!DOCTYPE html>
2<html>
3 <head>
4 <title>DeReKo-Word-Vector-Distances: <%= $word %></title>
%# Third-party assets. Everything is fetched over https so that the page also
%# works when it is served via https (browsers block http:// scripts there).
%# NOTE(review): "jquery-latest" is an unpinned alias; consider pinning an exact
%# jQuery version together with an integrity hash.
<link rel="stylesheet" href="https://code.jquery.com/ui/1.12.1/themes/base/jquery-ui.css">
<script src="https://code.jquery.com/jquery-latest.min.js"></script>
<script src="https://cdn.datatables.net/1.10.16/js/jquery.dataTables.min.js"></script>
<script src="https://cdn.datatables.net/fixedcolumns/3.2.3/js/dataTables.fixedColumns.min.js"></script>
<link rel="stylesheet" href="https://cdn.datatables.net/1.10.16/css/jquery.dataTables.min.css">
<script
  src="https://code.jquery.com/ui/1.12.1/jquery-ui.min.js"
  integrity="sha256-VazP97ZCwtekAsvgPBSUwPFKdrwD3unUfSGVYrahUqU="
  crossorigin="anonymous"></script>
14 <script>
// Once the DOM is ready: wire up the (optional) jQuery UI tabs widget and turn
// the two result tables into vertically scrolling DataTables.
$(document).ready(function() {
  // Both tables share the same DataTables configuration; only the height of
  // the scroll viewport differs.
  function scrollingTableOptions(scrollHeight) {
    return {
      "sScrollY": scrollHeight,
      "bScrollCollapse": true,
      "bPaginate": false,
      "bJQueryUI": true,
      "dom": '<"top">rt<"bottom"flp><"clear">',
      "aoColumnDefs": [
        { "sWidth": "10%", "aTargets": [ -1 ] }
      ]
    };
  }

  $("#tabs").tabs({
    // The page loads jQuery UI 1.12.1, where the old "show" event no longer
    // exists ("show" is an animation option there). "activate" is its
    // replacement and exposes the newly shown panel as ui.newPanel.
    "activate": function(event, ui) {
      var oTable = $('div.dataTables_scrollBody>table.display', ui.newPanel).dataTable();
      if (oTable.length > 0) {
        // Column widths cannot be measured while a panel is hidden, so
        // recompute them as soon as the panel becomes visible.
        oTable.fnAdjustColumnSizing();
      }
    }
  });

  $('#firsttable').DataTable(scrollingTableOptions("760px"));
  $('#secondtable').DataTable(scrollingTableOptions("800px"));
});
Marc Kupietz0af83e32017-11-27 09:31:37 +010048
// Enable jQuery UI tooltips for the whole document. The tooltip body is taken
// verbatim from the element's title attribute.
// NOTE(review): presumably this override exists so that markup inside title
// attributes is rendered instead of escaped - confirm against jQuery UI docs.
$(function() {
  var tooltipOptions = {
    content: function() {
      var hovered = $(this);
      return hovered.attr('title');
    }
  };
  $(document).tooltip(tooltipOptions);
});
Marc Kupietz694610d2017-11-25 18:30:03 +010056
Marc Kupietz83305222016-04-28 09:57:22 +020057 </script>
58 <script src="//d3js.org/d3.v3.min.js" charset="utf-8"></script>
Marc Kupietz554aff52017-11-09 14:42:09 +010059 <script src="/derekovecs/js/tsne.js"></script>
60 <script src="/derekovecs/js/som.js"></script>
61 <script src="/derekovecs/js/labeler.js"></script>
<style>
  /* Base typography for the page and its form controls. */
  body, input {
    font-family: Arial, sans-serif;
    font-size: 11pt;
  }

  .mono {
    font-family: "DejaVu Sans Mono", Inconsolata, SourceCodePro, Courier;
  }

  .ui-tooltip-content {
    font-size: 9pt;
    color: #222222;
  }

  svg > .ui-tooltip-content {
    font-size: 8pt;
    color: #222222;
  }

  /* Link colouring shared by the HTML table and the SVG map (fill). */
  a.merged {
    color: green;
    fill: green;
  }

  #first a {
    text-decoration: none;
  }

  a.marked, #first a.marked {
    text-decoration: underline;
  }

  a.target {
    color: red;
    fill: red;
  }

  table.display {
    /* A declaration "width: 40% important!" used to stand here. The misplaced
       "!" made it invalid, so browsers always ignored it. It is kept only as
       this comment so that the rendered layout stays exactly as before. */
    margin: 0; /* <- works for me this way ****/
  }
  table.dataTable thead th, table.dataTable thead td, table.dataTable tbody td {
    padding: 2px 2px;
    /* border-bottom: 1px solid #111; */
  }
  #collocators {
    margin-bottom: 15px;
  }

  #wrapper {
    width: 100%;
    /* border: 1px solid red; */
    overflow: hidden; /* will contain if #first is longer than #second */
  }
  #first {
    margin-right: 20px;
    float: left;
    /* border: 1px solid green; */
  }
  #second {
    border: 1px solid #333;
    overflow: hidden; /* if you don't want #second to wrap below #first */
  }
  #som2 svg {
    border: 1px solid #333;
  }

  #cost {
    font-size: 8pt;
    color: #222222;
    margin-top: 4px;
    margin-bottom: 12px;
  }

  #sominfo1, #sominfo {
    font-size: 8pt;
    color: #222222;
    margin-top: 0px;
  }

  #somcolor1, #somcolor2, #somcolor3 {
    display: inline-block;
    height: 10px;
    width: 10px;
  }

  #third {
    border: 1px solid #333;
  }

</style>
154 <script>
155
// t-SNE options (epsilon and perplexity are filled in by the server-side
// template) plus the fixed pixel geometry of the word map.
var opt = {epsilon: <%= $epsilon %>, perplexity: <%= $perplexity %>},
    mapWidth = 800, // width map
    mapHeight = 800,
    jitterRadius = 7; // used as radius "r" of every label anchor in stopStep()

var T = new tsnejs.tSNE(opt); // create a tSNE instance

// NOTE(review): this global is never assigned in the visible code; the
// functions below declare their own local Y.
var Y;

var data;    // payload handed to showMap() (words, vecs, ranks, marked, ...)
var labeler; // d3.labeler() instance created in stopStep()
167
// Move every word group to the pixel position stored in `labels` (50 ms
// transition) and write that position back into the t-SNE solution T.Y,
// undoing the pan (tx, ty), zoom (ss) and the fixed factor 20 used for drawing.
function applyJitter() {
  function placeLabel(label, idx) {
    var embedded = T.Y[idx];
    embedded[0] = (label.x - mapWidth/2 - tx)/ss/20;
    embedded[1] = (label.y - mapHeight/2 - ty)/ss/20;
    return "translate(" + (label.x) + "," + (label.y) + ")";
  }

  var moving = svg.selectAll('.tsnet')
    .data(labels)
    .transition()
    .duration(50);
  moving.attr("transform", placeLabel);
}
181
// Reposition every word group according to the current t-SNE solution,
// scaled by 20 and the zoom factor ss, shifted by the pan offset (tx, ty)
// and centred on the map.
function updateEmbedding() {
  var solution = T.getSolution();

  function toTransform(d, i) {
    var px = (solution[i][0]*20*ss + tx) + mapWidth/2;
    var py = (solution[i][1]*20*ss + ty) + mapHeight/2;
    return "translate(" + px + "," + py + ")";
  }

  svg.selectAll('.tsnet')
    .data(data.words)
    .attr("transform", toTransform);
}
191
var svg; // d3 selection of the map's <svg> element; assigned in drawEmbedding()
var labels = []; // one {x, y, name, width, height} record per word; rebuilt in stopStep()
var anchor_array = []; // label anchors {x, y, r}; filled in stopStep()
var text; // d3 selection of all <text> nodes of the map; assigned in stopStep()
196
/**
 * (Re)build the SVG word map inside #embed.
 *
 * One <g class="tsnet"> is created per entry of data.words. Each group holds
 * a link to the query page for that word (data.urlprefix + word) with the
 * word as its text. Link classes mirror the HTML table: "marked", "target"
 * and "merged". Finally a d3 zoom behaviour is attached to the SVG.
 * Positions are not set here; updateEmbedding() does that.
 */
function drawEmbedding() {
  $("#embed").empty();
  var div = d3.select("#embed");

  svg = div.append("svg") // svg is global
    .attr("width", mapWidth)
    .attr("height", mapHeight);

  // Insert thousands separators, e.g. 1234567 -> "1,234,567".
  function formatRank(rank) {
    return rank.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
  }

  // ".b" matches nothing in the fresh SVG, so every word ends up in enter().
  var g = svg.selectAll(".b")
    .data(data.words)
    .enter().append("g")
    .attr("class", "tsnet");

  g.append("a")
    .attr("xlink:href", function(word) {
      // Encode the word so that characters such as "&", "#" or "+" cannot
      // truncate or alter the query string.
      return data.urlprefix + encodeURIComponent(word);
    })
    .attr("class", function(d, i) {
      var res = "";
      if (data.marked[i]) {
        res = "marked ";
      }
      // data.target is the space-padded query string, so this is a
      // whole-word match.
      if (data.target.indexOf(" " + d + " ") >= 0) {
        return res + "target";
      } else if (data.ranks[i] < data.mergedEnd) {
        return res + "merged";
      } else {
        return res;
      }
    })
    .attr("title", function(d, i) {
      var title = "rank: " + i + " " + "freq. rank: " + formatRank(data.ranks[i]);
      if (data.mergedEnd > 0 && data.ranks[i] < data.mergedEnd) {
        title += " (merged vocab)";
      }
      return title;
    })
    .append("text")
    // "top" is not a valid text-anchor value and was ignored by browsers;
    // "start" is the default they fell back to.
    .attr("text-anchor", "start")
    .attr("font-size", 12)
    .text(function(d) { return d; });

  var zoomListener = d3.behavior.zoom()
    .scaleExtent([0.1, 10])
    .center([0,0])
    .on("zoom", zoomHandler);
  zoomListener(svg);
}
251
var tx=0, ty=0; // current pan offset in pixels; updated by zoomHandler()
var ss=1; // current zoom scale; updated by zoomHandler()
var iter_id=-1; // handle of the running setInterval timer; -1 until showMap() starts one
255
// d3 zoom callback: remember the new pan offset and scale, then redraw the
// map with them.
function zoomHandler() {
  var zoomEvent = d3.event;
  var offset = zoomEvent.translate;
  tx = offset[0];
  ty = offset[1];
  ss = zoomEvent.scale;
  updateEmbedding();
}
262
// NOTE(review): not read or written anywhere else in the visible code;
// the iteration counter actually used is T.iter (see step()).
var stepnum = 0;
264
/**
 * End the t-SNE phase and start the label de-overlapping ("jitter") phase.
 *
 * Stops the running timer, converts the current embedding into the pixel
 * based label/anchor records that d3.labeler() expects, measures the real
 * label sizes and finally starts a timer that calls jitterStep().
 */
function stopStep() {
  clearInterval(iter_id);
  text = svg.selectAll("text");

  // Start every jitter phase from a clean slate. Without this a second run
  // would append its anchors to those of the previous run and jitterStep()
  // would stop immediately because its counter was already exhausted.
  anchor_array = [];
  jitter_i = 0;

  // jitter function needs different data and co-ordinate representation
  labels = d3.range(data.words.length).map(function(i) {
    var x = (T.Y[i][0]*20*ss + tx) + mapWidth/2;
    var y = (T.Y[i][1]*20*ss + ty) + mapHeight/2;
    anchor_array.push({x: x, y: y, r: jitterRadius});
    return {
      x: x,
      y: y,
      name: data.words[i]
    };
  });

  // get the actual label bounding boxes for the jitter function
  text.each(function(d, i) {
    var box = this.getBBox(); // measure once, use for both dimensions
    labels[i].width = box.width;
    labels[i].height = box.height;
  });

  labeler = d3.labeler()
    .label(labels)
    .anchor(anchor_array)
    .width(mapWidth)
    .height(mapHeight)
    .update(applyJitter);

  iter_id = setInterval(jitterStep, 1);
}
302
// Number of jitter rounds executed so far.
var jitter_i = 0;

// Timer callback of the jitter phase: run another labeler round (start2(10))
// and redraw, until more than 100 rounds have been requested; then stop the
// timer.
function jitterStep() {
  var round = jitter_i++;
  if (round > 100) {
    clearInterval(iter_id);
    return;
  }
  labeler.start2(10);
  applyJitter();
}
313
314 var last_cost=1000;
315
316 function step() {
317 var i = T.iter;
318
319 if(i > <%= $no_iterations %>) {
320 stopStep();
321 } else {
322 var cost = Math.round(T.step() * 100000) / 100000; // do a few steps
323 $("#cost").html("tsne iteration " + i + ", cost: " + cost.toFixed(5));
324 if(i % 250 == 0 && cost >= last_cost) {
325 stopStep();
326 } else {
327 last_cost = cost;
328 updateEmbedding();
329 }
330 }
331 }
332
333 function showMap(j) {
334 data=j;
335 T.iter=0;
336 T.initDataRaw(data.vecs); // init embedding
337 drawEmbedding(); // draw initial embedding
338
339 if(iter_id >= 0) {
340 clearInterval(iter_id);
341 }
342 //T.debugGrad();
343 iter_id = setInterval(step, 1);
344 if(<%= $show_som %>) {
345 makeSOM(j, <%= $no_iterations %>);
346 }
347 }
// The "word" text input of the search form; set by onload().
var queryword;

// Invoked from <body onload="onload()">: cache the query input element.
// NOTE(review): a global function called "onload" shares its name with the
// onload properties of window/document, which inline handlers may resolve
// first - verify in the target browsers that this function is really called.
function onload() {
  var wordInput = document.getElementById('word');
  queryword = wordInput;
}
353
// Open the current content of the "word" input as a KorAP query in the
// window/tab named "KorAP".
function queryKorAP() {
  // Fall back to a direct lookup so the button also works if the cached
  // element was never set by the body's onload handler.
  var input = queryword || document.getElementById('word');
  // Encode the value: otherwise "&", "#" or "+" in the query would be
  // interpreted as URL syntax and cut off or alter the query.
  window.open('http://korap.ids-mannheim.de/kalamar/?q=' + encodeURIComponent(input.value), 'KorAP');
}
Marc Kupietz4dc270c2017-11-24 10:17:12 +0100357
// Open `query` as a COSMAS II query (ql=cosmas2) in the window/tab named
// "KorAP".
function queryKorAPCII(query) {
  // Encode the query: it contains spaces and "/" and may contain "&" or "#",
  // which would otherwise be interpreted as URL syntax.
  window.open('http://korap.ids-mannheim.de/kalamar/?ql=cosmas2&q=' + encodeURIComponent(query), 'KorAP');
}
Marc Kupietz83305222016-04-28 09:57:22 +0200361 </script>
362 </head>
Marc Kupietz39179ab2017-07-04 16:28:06 +0200363 <body onload="onload()">
Marc Kupietzb3422c12017-07-04 14:12:11 +0200364 <form method="GET">
Marc Kupietz83305222016-04-28 09:57:22 +0200365 word(s):
Marc Kupietz39179ab2017-07-04 16:28:06 +0200366 <input id="word" type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
Marc Kupietz2c79c5e2017-11-09 16:18:40 +0100367 cut-off:
368 <input id="cutoff" type="text" name="cutoff" size="10" value="<%= $cutoff %>" title="Only consider the most frequent x word forms.">
Marc Kupietz4ccb4892017-11-21 09:33:08 +0100369 dedupe <input type="checkbox" name="dedupe" value="1" <%= ($dedupe ? "checked" : "") %> title="radically filter out any near-duplicates">
%# The search-order switch is only offered when a merged vocabulary exists.
% if($mergedEnd > 0) {
backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked base vocabulary will be searched first. Otherwise merged vocabulary will be searched first.">
% }
Marc Kupietz2c79c5e2017-11-09 16:18:40 +0100373 max. neighbours: <input type="text" size="4" name="n" value="<%= $no_nbs %>">
374 max. iterations: <input type="text" name="N" size="4" value="<%= $no_iterations %>">
Marc Kupietz83305222016-04-28 09:57:22 +0200375 SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
376 % if($collocators) {
Marc Kupietz30ca4342017-11-22 21:21:20 +0100377 <span> </span>window/sort
Marc Kupietz83305222016-04-28 09:57:22 +0200378 <select name="sort">
Marc Kupietz30ca4342017-11-22 21:21:20 +0100379 <option value="0" <%= ($sort!=1 && $sort!=2? "selected":"") %>>auto focus</option>
380 <option value="1" <%= ($sort==1? "selected":"") %>>any single position</option>
381 <option value="2" <%= ($sort==2? "selected":"") %>>whole window</option>
Marc Kupietz83305222016-04-28 09:57:22 +0200382 </select>
383 % }
Marc Kupietz39179ab2017-07-04 16:28:06 +0200384 <span> </span><input type="submit" value="Show">
385 <span> </span><input type="button" value="→ KorAP" onclick="queryKorAP();" title="query word with KorAP"/>
Marc Kupietz83305222016-04-28 09:57:22 +0200386 </form>
<br>
%# <div> is not a void element: the former "<div id="mytable"/>" was parsed
%# as an opening tag that swallowed the rest of the page.
<div id="mytable"></div>
%# Result branch: table of paradigmatic neighbours plus the script that hands
%# the collected words/vectors to showMap(). Otherwise an error message.
%# The #wrapper opened in the result branch is closed further down the page,
%# after the map, the collocator table and the SOM block.
% if($lists && (@$lists) > 0 && (@$lists)[0]) {
<div id="wrapper">
  <div id="first" style="width:200px">
    <table class="display compact nowrap" id="firsttable">
      <thead>
        <tr>
          <th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th>
        </tr>
      </thead>
      <tbody>
        %# @words/@vecs/@ranks/@marked collect every distinct word once; they
        %# become the payload of showMap() below.
        % my @words; my @vecs; my @ranks; my @marked;
        % for my $list (@$lists) {
        %   my $i=0; while($list) {
        %     my $item = (@$list)[$i];
        %     my $c = ($collocators? (@$collocators)[$i] : 0);
        %     last if(!$c && !$item);
        <tr>
          <td align="right">
            <%= ++$i %>.
          </td>
          % if($item) {
          %   if(!grep{$_ eq $item->{word}} @words) {
          %     push @vecs, $item->{vector};
          %     push @words, $item->{word};
          %     push @ranks, $item->{rank};
          %     push @marked, ($marked->{$item->{word}}? 1 : 0);
          %   }
          <td align="right">
            <%= sprintf("%.3f", $item->{dist}) %>
          </td>
          <td>
            % my $class = ($marked->{$item->{word}}? "marked " : "");
            % my $r = $item->{rank};
            % if($r < $mergedEnd) {
            %   $class .= "merged";
            %   $r .= " (merged vocab)";
            % } elsif($mergedEnd!=0 && $r > $mergedEnd) {
            %   $r -= $mergedEnd;
            % }
            <a class="<%= $class =%>"
               title="freq. rank: <%= $r =%>"
               href="<%= url_with->query([word => $item->{word}]) =%>">
              <%= $item->{word} =%>
            </a>
          </td>
          % } else {
          <td colspan="2"></td>
          % }
        </tr>
        %     last if($i >= 100);
        %   }
        % }
      </tbody>
    </table>
  </div>
  <script>
    % use Mojo::ByteStream 'b';
    % my $urlprefix = url_with->query([word=>'']);
    // .on('load', ...) replaces the deprecated .load(handler) shorthand,
    // which no longer exists in jQuery 3.
    $(window).on('load', function() {
      showMap(<%= b(Mojo::JSON::to_json({target => " $word ", mergedEnd=> $mergedEnd, words => \@words, vecs => \@vecs, ranks => \@ranks, marked => \@marked, urlprefix => $urlprefix})); %>);
    });
  </script>
% } else { # ($word && $word !~ /^\s*$/)
<div id="wrapper">
  <p>
    ERROR: "<%= $word %>" not found in vocabulary.
  </p>
</div>
% }
%# Container of the t-SNE word map: drawEmbedding() empties #embed and
%# appends the SVG to it.
%# NOTE(review): the id "second" is used a second time by the wrapper of the
%# collocator table further down; ids are supposed to be unique per document.
<div id="second" style="width:800px; height:800px; font-family: arial;">
  <div id="embed">
  </div>
</div>
%# step() writes the current t-SNE iteration and cost into this element.
<div id="cost"></div>
%# Table of syntagmatic collocators (up to 100 rows). Each collocator links
%# to a COSMAS II proximity query "<collocator> /w5 <word>" in KorAP.
<div id="second" style="width:500px">
  <table class="display compact nowrap" id="secondtable">
    <thead>
      <tr>
        % if($collocators) {
        <th>#</th>
        <th align="right" title="The window around the target word that is considered for summation.">w'</th>
        <th align="right" title="Raw (max.) activation of the collocator in the output layers.">a</th>
        <th title="Σp(c<sub><small>@</small></sub>) – Sum of the probability approximations that the combination of the target word and the collocator at the relative position @ come from the training corpus. Single approximations can be distorted because of sub-sampling frequent words and the sum cannot itself be interpreted as probability." align="right">Σp</th>
        <th align="right">Σp/|w|</th>
        <th title="c" align="left">collocator</th>
        % }
      </tr>
    </thead>
    <tbody>
      % for(my $i=0; $i < 100; $i++) {
      %   my $c = ($collocators? (@$collocators)[$i] : 0);
      <tr>
        <td align="right">
          %# Print the 1-based row number WITHOUT modifying $i: the former
          %# "++$i" advanced the loop counter a second time per row, so every
          %# other collocator was skipped and rows were numbered 1, 3, 5, ...
          <%= $i + 1 %>.
        </td>
        % if($c) {
        <td align="right">
          <span class="mono"><%= bitvec2window($c->{pos}) %></span>
        </td>
        <td align="right">
          <%= sprintf("%.3f", $c->{dist}) %>
        </td>
        <td align="right">
          <%= sprintf("%.3e", $c->{norm}) %>
        </td>
        <td align="right">
          <%= sprintf("%.3e", $c->{sum}) %>
        </td>
        <td align="left">
          %# The query travels in a data attribute (HTML-escaped by the
          %# template) instead of being pasted into a JavaScript string
          %# literal, where a quote in either word could break out of it.
          <a data-query="<%= sprintf("%s /w5 %s", $c->{word}, $word) =%>"
             onclick="queryKorAPCII(this.getAttribute('data-query'));"
             title="freq. rank: <%= $c->{rank} =%>">
            <%= $c->{word} %>
          </a>
        </td>
        % } else {
        <td colspan="5"></td>
        % }
      </tr>
      % }
    </tbody>
  </table>
</div>
Marc Kupietz0af83e32017-11-27 09:31:37 +0100510
Marc Kupietz83305222016-04-28 09:57:22 +0200511 % if($show_som) {
512 <div id="som2">
513 </div>
514 <div id="sominfo1"><span id="somcolor1"> </span> <span id="somword1"> </span> <span id="somcolor2"> </span> <span id="somword2"> </span> <span id="somcolor3"> </span></div>
515 <div id="sominfo">SOM iteration <span id="iterations">0</span></div>
516 % }
517 </div>
%# Show the word2vec training parameters of the model, if known.
%# <pre> may not be nested inside <p> (the parser closes the paragraph
%# implicitly and the stray </p> creates an empty one), so it follows it.
% if($training_args) {
<p>
  Word vector model trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters:
</p>
<pre><%= $training_args %></pre>
% }
523 </body>
524</html>