blob: 7927bd8c9ab318df89e8bce23dc1b22cd3dfff70 [file] [log] [blame]
Marc Kupietz83305222016-04-28 09:57:22 +02001<!DOCTYPE html>
2<html>
3 <head>
4 <title>DeReKo-Word-Vector-Distances: <%= $word %></title>
Marc Kupietz80bd7b92017-07-04 16:25:54 +02005 <link rel="stylesheet" href="//code.jquery.com/ui/1.12.1/themes/base/jquery-ui.css">
<!-- jQuery and jQuery UI are fetched over HTTPS so that they are not blocked
     as mixed content when this page itself is served over HTTPS. -->
<script src="https://code.jquery.com/jquery-latest.min.js"></script>
<script
  src="https://code.jquery.com/ui/1.12.1/jquery-ui.min.js"
  integrity="sha256-VazP97ZCwtekAsvgPBSUwPFKdrwD3unUfSGVYrahUqU="
  crossorigin="anonymous"></script>
11 <script>
// Once the DOM is ready, switch on jQuery UI tooltips for the whole document.
$(document).ready(function () {
  var tooltipOptions = {
    // The tooltip content is whatever the hovered element's title attribute holds.
    content: function () {
      var hovered = $(this);
      return hovered.attr('title');
    }
  };
  $(document).tooltip(tooltipOptions);
});
19 </script>
20 <script src="//d3js.org/d3.v3.min.js" charset="utf-8"></script>
Marc Kupietz554aff52017-11-09 14:42:09 +010021 <script src="/derekovecs/js/tsne.js"></script>
22 <script src="/derekovecs/js/som.js"></script>
23 <script src="/derekovecs/js/labeler.js"></script>
Marc Kupietz83305222016-04-28 09:57:22 +020024 <style>
25 body, input {
26 font-family: Arial, sans-serif;
27 font-size: 11pt;
28 }
Marc Kupietz30ca4342017-11-22 21:21:20 +010029
30 .mono {
31 font-family: "DejaVu Sans Mono", Inconsolata, SourceCodePro, Courier;
32 }
33
Marc Kupietz83305222016-04-28 09:57:22 +020034 .ui-tooltip-content {
35 font-size: 9pt;
36 color: #222222;
37 }
38
39 svg > .ui-tooltip-content {
40 font-size: 8pt;
41 color: #222222;
42 }
43
44 a.merged {
45 color: green;
46 fill: green;
47 }
48
49 #first a {
50 text-decoration: none;
51 }
52
53 a.marked, #first a.marked {
54 text-decoration: underline;
55 }
Marc Kupietzf4b49392016-04-28 10:49:56 +020056
Marc Kupietz83305222016-04-28 09:57:22 +020057 a.target {
58 color: red;
59 fill: red;
60 }
61
62 #collocators {
63 margin-bottom: 15px;
64 }
65
#wrapper {
  width: 100%;
  /* debug outline: border: 1px solid red; */
  overflow: hidden; /* will contain if #first is longer than #second */
}
#first {
  margin-right: 20px;
  float: left;
  /* debug outline: border: 1px solid green; */
}
76 #second {
77 border: 1px solid #333;
78 overflow: hidden; /* if you don't want #second to wrap below #first */
79 }
80 #som2 svg {
81 border: 1px solid #333;
82 }
83
84 #cost {
85 font-size: 8pt;
86 color: #222222;
87 margin-top: 4px;
88 margin-bottom: 12px;
89 }
90
91 #sominfo1, #sominfo {
92 font-size: 8pt;
93 color: #222222;
94 margin-top: 0px;
95 }
96
97 #somcolor1, #somcolor2, #somcolor3 {
98 display: inline-block;
99 height: 10px;
100 width: 10px;
101 }
102
103 #third {
104 border: 1px solid #333;
105 }
106
107 </style>
108 <script>
109
110 var opt = {epsilon: <%= $epsilon %>, perplexity: <%= $perplexity %>},
111 mapWidth = 800, // width map
112 mapHeight = 800,
113 jitterRadius = 7;
114
115 var T = new tsnejs.tSNE(opt); // create a tSNE instance
116
117 var Y;
118
119 var data;
120 var labeler;
121
/**
 * Moves every ".tsnet" group to the position held by the matching entry of
 * `labels` (50 ms transition) and writes that position back into T.Y by
 * inverting the pan/zoom mapping that updateEmbedding() applies.
 */
function applyJitter() {
  // Stores one label position in the t-SNE solution and returns the SVG
  // transform that places the group at that label position.
  function place(label, idx) {
    var point = T.Y[idx];
    point[0] = (label.x - mapWidth / 2 - tx) / ss / 20;
    point[1] = (label.y - mapHeight / 2 - ty) / ss / 20;
    return "translate(" + label.x + "," + label.y + ")";
  }

  var groups = svg.selectAll('.tsnet').data(labels);
  groups.transition()
    .duration(50)
    .attr("transform", place);
}
135
/**
 * Repositions every ".tsnet" group according to the current t-SNE solution,
 * scaled by 20 and the zoom factor `ss`, shifted by the pan offset (tx, ty)
 * and centred on the map.
 */
function updateEmbedding() {
  var solution = T.getSolution();

  // Maps one solution coordinate to a screen coordinate along one axis.
  function toScreen(value, shift, extent) {
    return (value * 20 * ss + shift) + extent / 2;
  }

  var groups = svg.selectAll('.tsnet').data(data.words);
  groups.attr("transform", function (word, i) {
    var point = solution[i];
    var px = toScreen(point[0], tx, mapWidth);
    var py = toScreen(point[1], ty, mapHeight);
    return "translate(" + px + "," + py + ")";
  });
}
145
146 var svg;
147 var labels = [];
148 var anchor_array = [];
149 var text;
150
/**
 * (Re)builds the map inside #embed.
 *
 * Empties #embed, appends a fresh <svg> (stored in the global `svg`) and adds
 * one <g class="tsnet"> per entry of data.words. Each group holds a link to
 * the word's own query page, with a class reflecting its marked / target /
 * merged state, a title showing its ranks, and the word itself as text.
 * Finally a d3 zoom behaviour is attached that calls zoomHandler().
 */
function drawEmbedding() {
  $("#embed").empty();

  svg = d3.select("#embed").append("svg") // svg is global
    .attr("width", mapWidth)
    .attr("height", mapHeight);

  // ".b" matches nothing, so the enter selection yields one <g> per word.
  var g = svg.selectAll(".b")
    .data(data.words)
    .enter().append("g")
    .attr("class", "tsnet");

  // Renders a number with "," as thousands separator.
  function withThousands(n) {
    return n.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
  }

  g.append("a")
    .attr("xlink:href", function(word) {
      // Percent-encode the word so that characters such as "&", "#", "+" or
      // "%" cannot corrupt the query string it is appended to.
      return data.urlprefix + encodeURIComponent(word);
    })
    .attr("class", function(d, i) {
      var res = data.marked[i] ? "marked " : "";
      if (data.target.indexOf(" " + d + " ") >= 0) {
        return res + "target";
      }
      if (data.ranks[i] < data.mergedEnd) {
        return res + "merged";
      }
      return res;
    })
    .attr("title", function(d, i) {
      var title = "rank: " + i + " " + "freq. rank: " + withThousands(data.ranks[i]);
      // Only words below mergedEnd get the suffix, and only when a merged
      // vocabulary is in use at all.
      if (data.mergedEnd > 0 && data.ranks[i] < data.mergedEnd) {
        title += " (merged vocab)";
      }
      return title;
    })
    .append("text")
    // NOTE(review): "top" is not a defined text-anchor value
    // (start | middle | end); kept as-is so rendering does not change.
    .attr("text-anchor", "top")
    .attr("font-size", 12)
    .text(function(d) { return d; });

  var zoomListener = d3.behavior.zoom()
    .scaleExtent([0.1, 10])
    .center([0, 0])
    .on("zoom", zoomHandler);
  zoomListener(svg);
}
205
206 var tx=0, ty=0;
207 var ss=1;
208 var iter_id=-1;
209
/**
 * d3 zoom callback: copies the pan offset and zoom factor of the current
 * event into the globals tx, ty and ss, then redraws the embedding.
 */
function zoomHandler() {
  var zoomEvent = d3.event;
  var offset = zoomEvent.translate;
  tx = offset[0];
  ty = offset[1];
  ss = zoomEvent.scale;
  updateEmbedding();
}
216
217 var stepnum = 0;
218
/**
 * Ends the running interval and starts the label de-overlapping phase.
 *
 * Builds `labels` and `anchor_array` from the current t-SNE positions (in
 * screen coordinates), measures each label's bounding box, configures the
 * d3 labeler with them and schedules jitterStep() on a 1 ms interval.
 */
function stopStep() {
  clearInterval(iter_id);
  text = svg.selectAll("text");

  // Start from a clean slate so that a repeated call neither accumulates
  // stale anchors nor finds the jitter counter already exhausted.
  anchor_array = [];
  jitter_i = 0;

  // jitter function needs different data and co-ordinate representation
  labels = d3.range(data.words.length).map(function(i) {
    var x = (T.Y[i][0]*20*ss + tx) + mapWidth/2;
    var y = (T.Y[i][1]*20*ss + ty) + mapHeight/2;
    anchor_array.push({x: x, y: y, r: jitterRadius});
    return {
      x: x,
      y: y,
      name: data.words[i]
    };
  });

  // get the actual label bounding boxes for the jitter function
  var index = 0;
  text.each(function() {
    var box = this.getBBox(); // measure once, use for both dimensions
    labels[index].width = box.width;
    labels[index].height = box.height;
    index += 1;
  });

  labeler = d3.labeler()
    .label(labels)
    .anchor(anchor_array)
    .width(mapWidth)
    .height(mapHeight)
    .update(applyJitter);

  iter_id = setInterval(jitterStep, 1);
}
256
257 var jitter_i=0;
258
/**
 * One tick of the label-placement phase: runs labeler.start2(10) and applies
 * the result, or cancels the interval once jitter_i has exceeded 100.
 * jitter_i is incremented on every call.
 */
function jitterStep() {
  var round = jitter_i;
  jitter_i += 1;

  if (round > 100) {
    clearInterval(iter_id);
    return;
  }

  labeler.start2(10);
  applyJitter();
}
267
268 var last_cost=1000;
269
270 function step() {
271 var i = T.iter;
272
273 if(i > <%= $no_iterations %>) {
274 stopStep();
275 } else {
276 var cost = Math.round(T.step() * 100000) / 100000; // do a few steps
277 $("#cost").html("tsne iteration " + i + ", cost: " + cost.toFixed(5));
278 if(i % 250 == 0 && cost >= last_cost) {
279 stopStep();
280 } else {
281 last_cost = cost;
282 updateEmbedding();
283 }
284 }
285 }
286
287 function showMap(j) {
288 data=j;
289 T.iter=0;
290 T.initDataRaw(data.vecs); // init embedding
291 drawEmbedding(); // draw initial embedding
292
293 if(iter_id >= 0) {
294 clearInterval(iter_id);
295 }
296 //T.debugGrad();
297 iter_id = setInterval(step, 1);
298 if(<%= $show_som %>) {
299 makeSOM(j, <%= $no_iterations %>);
300 }
301 }
Marc Kupietz39179ab2017-07-04 16:28:06 +0200302 var queryword;
303
// Looks up the element with id "word" and keeps it in the global
// `queryword`, which queryKorAP() reads.
function onload() {
  var wordField = document.getElementById('word');
  queryword = wordField;
}
307
/**
 * Opens a KorAP query for the current value of the word input in the
 * window/tab named "KorAP".
 */
function queryKorAP() {
  // Fall back to a fresh lookup in case the cached element was never set.
  var field = queryword || document.getElementById('word');
  // Percent-encode the value so that spaces, "&", "#", "+" or "|" arrive at
  // KorAP as part of the query instead of altering the URL.
  window.open('http://korap.ids-mannheim.de/kalamar/?q=' + encodeURIComponent(field.value), 'KorAP');
}
Marc Kupietz83305222016-04-28 09:57:22 +0200311 </script>
312 </head>
Marc Kupietz39179ab2017-07-04 16:28:06 +0200313 <body onload="onload()">
Marc Kupietzb3422c12017-07-04 14:12:11 +0200314 <form method="GET">
Marc Kupietz83305222016-04-28 09:57:22 +0200315 word(s):
Marc Kupietz39179ab2017-07-04 16:28:06 +0200316 <input id="word" type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
Marc Kupietz2c79c5e2017-11-09 16:18:40 +0100317 cut-off:
318 <input id="cutoff" type="text" name="cutoff" size="10" value="<%= $cutoff %>" title="Only consider the most frequent x word forms.">
Marc Kupietz4ccb4892017-11-21 09:33:08 +0100319 dedupe <input type="checkbox" name="dedupe" value="1" <%= ($dedupe ? "checked" : "") %> title="radically filter out any near-duplicates">
Marc Kupietz83305222016-04-28 09:57:22 +0200320 % if($mergedEnd > 0) {
321 backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked, the base vocabulary will be searched first. Otherwise the merged vocabulary will be searched first.">
322 % }
Marc Kupietz2c79c5e2017-11-09 16:18:40 +0100323 max. neighbours: <input type="text" size="4" name="n" value="<%= $no_nbs %>">
324 max. iterations: <input type="text" name="N" size="4" value="<%= $no_iterations %>">
Marc Kupietz83305222016-04-28 09:57:22 +0200325 SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
326 % if($collocators) {
Marc Kupietz30ca4342017-11-22 21:21:20 +0100327 <span> </span>window/sort
Marc Kupietz83305222016-04-28 09:57:22 +0200328 <select name="sort">
Marc Kupietz30ca4342017-11-22 21:21:20 +0100329 <option value="0" <%= ($sort!=1 && $sort!=2? "selected":"") %>>auto focus</option>
330 <option value="1" <%= ($sort==1? "selected":"") %>>any single position</option>
331 <option value="2" <%= ($sort==2? "selected":"") %>>whole window</option>
Marc Kupietz83305222016-04-28 09:57:22 +0200332 </select>
333 % }
Marc Kupietz39179ab2017-07-04 16:28:06 +0200334 <span> </span><input type="submit" value="Show">
335 <span> </span><input type="button" value="→ KorAP" onclick="queryKorAP();" title="query word with KorAP"/>
Marc Kupietz83305222016-04-28 09:57:22 +0200336 </form>
337 <br>
Marc Kupietzf9ac54e2017-11-21 09:22:29 +0100338 % if($lists && (@$lists) > 0 && (@$lists)[0]) {
Marc Kupietz83305222016-04-28 09:57:22 +0200339 <div id="wrapper">
340 <table id="first">
341 <tr>
342 <th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th>
343 % if($collocators) {
Marc Kupietze7ffaf22017-11-24 10:13:08 +0100344 <th title="The window around the target word that is considered for summation.">w'</th>
345 <th align="center" title="Raw (max.) activation of the collocator in the output layers.">a</th>
346 <th title="Σp(c<sub><small>@</small></sub>) – Sum of the probability approximations that the combination of the target word and the collocator at the relative position @ come from the training corpus. Single approximations can be distorted because of sub-sampling frequent words and the sum cannot itself be interpreted as probability." align="center">Σp</th>
Marc Kupietz6fe7f392017-11-24 10:15:30 +0100347<!--
Marc Kupietze7ffaf22017-11-24 10:13:08 +0100348 <th align="right">Σp/|w|</th>
Marc Kupietz6fe7f392017-11-24 10:15:30 +0100349-->
Marc Kupietze7ffaf22017-11-24 10:13:08 +0100350 <th title="c" align="left">collocator</th>
Marc Kupietz83305222016-04-28 09:57:22 +0200351 % }
352 </tr>
353 % my $j=0; my @words; my @vecs; my @ranks; my @marked;
354 % for my $list (@$lists) {
355 % my $i=0; while($list) {
356 % my $item = (@$list)[$i];
357 % my $c = ($collocators? (@$collocators)[$i] : 0);
358 % last if(!$c && !$item);
359 <tr>
360 <td align="right">
361 <%= ++$i %>.
362 </td>
363 % if($item) {
364 % if(!grep{$_ eq $item->{word}} @words) {
365 % push @vecs, $item->{vector};
366 % push @words, $item->{word};
367 % push @ranks, $item->{rank};
368 % push @marked, ($marked->{$item->{word}}? 1 : 0);
369 % }
370 <td align="right">
371 <%= sprintf("%.3f", $item->{dist}) %>
372 </td>
373 <td>
374 % my $class = ($marked->{$item->{word}}? "marked " : "");
375 % my $r = $item->{rank};
376 % if($r < $mergedEnd) {
377 % $class .= "merged";
378 % $r .= " (merged vocab)";
379 % } elsif($mergedEnd!=0 && $r > $mergedEnd) {
380 % $r -= $mergedEnd;
381 % }
Marc Kupietzf4b49392016-04-28 10:49:56 +0200382 <a class="<%= $class =%>"
383 title="freq. rank: <%= $r =%>"
384 href="<%= url_with->query([word => $item->{word}]) =%>">
385 <%= $item->{word} =%>
386 </a>
Marc Kupietz83305222016-04-28 09:57:22 +0200387 </td>
388 % } else {
389 <td colspan="2"/>
390 % }
391 % if($c) {
392 <td align="right">
Marc Kupietz30ca4342017-11-22 21:21:20 +0100393 <span class="mono"><%= bitvec2window($c->{pos}) %></span>
Marc Kupietz83305222016-04-28 09:57:22 +0200394 </td>
395 <td align="right">
396 <%= sprintf("%.3f", $c->{dist}) %>
397 </td>
398 <td align="right">
399 <%= sprintf("%.3e", $c->{norm}) %>
400 </td>
Marc Kupietz6fe7f392017-11-24 10:15:30 +0100401<!--
Marc Kupietz83305222016-04-28 09:57:22 +0200402 <td align="right">
403 <%= sprintf("%.3e", $c->{sum}) %>
404 </td>
Marc Kupietz6fe7f392017-11-24 10:15:30 +0100405-->
Marc Kupietz83305222016-04-28 09:57:22 +0200406 <td align="left">
Marc Kupietzb18978b2017-11-09 14:51:17 +0100407 <a href="<%= url_with->query([word => $c->{word}]) =%>"
408 title="freq. rank: <%= $c->{rank} =%>">
Marc Kupietz83305222016-04-28 09:57:22 +0200409 <%= $c->{word} %></a>
410 </td>
411 % } else {
412 <td colspan="5"/>
413 % }
414 </tr>
415 % }
416 % }
417 </table>
418 <script>
419 % use Mojo::ByteStream 'b';
Marc Kupietzf4b49392016-04-28 10:49:56 +0200420 % my $urlprefix = url_with->query([word=>'']);
Marc Kupietz83305222016-04-28 09:57:22 +0200421 $(window).load(function() {
Marc Kupietzf4b49392016-04-28 10:49:56 +0200422 showMap(<%= b(Mojo::JSON::to_json({target => " $word ", mergedEnd=> $mergedEnd, words => \@words, vecs => \@vecs, ranks => \@ranks, marked => \@marked, urlprefix => $urlprefix})); %>);
Marc Kupietz83305222016-04-28 09:57:22 +0200423 });
424 </script>
Marc Kupietzf9ac54e2017-11-21 09:22:29 +0100425 % } else { # ($word && $word !~ /^\s*$/)
426 <div id="wrapper">
427 <p>
428 ERROR: "<%= $word %>" not found in vocabulary.
429 </p>
430 </div>
Marc Kupietz83305222016-04-28 09:57:22 +0200431 % }
432 <div id="second" style="width:800px; height:800px; font-family: arial;">
433 <div id="embed">
434 </div>
435 </div>
436 <div id="cost"></div>
437 % if($show_som) {
438 <div id="som2">
439 </div>
440 <div id="sominfo1"><span id="somcolor1"> </span> <span id="somword1"> </span> <span id="somcolor2"> </span> <span id="somword2"> </span> <span id="somcolor3"> </span></div>
441 <div id="sominfo">SOM iteration <span id="iterations">0</span></div>
442 % }
443 </div>
444 % if($training_args) {
445 <p>
446 Word vector model trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters: <pre><%= $training_args %></pre>
447 </p>
448 % }
449 </body>
450</html>