derekovecs: add tab for words with largest distances in reference space

commit: d7760b4d53069026fcb99397a096795a9490c6aa [log] [tgz]
author: Marc Kupietz <kupietz@ids-mannheim.de> Thu Feb 21 09:01:44 2019 +0100
committer: Marc Kupietz <kupietz@ids-mannheim.de> Thu Feb 21 09:01:44 2019 +0100
tree: 39385c064950380d5d52a22309a011c878ef7ef4
parent: 001bffda2195814b6f265287e7d1bdbf0388111f [diff]
diff --git a/templates/index.html.ep b/templates/index.html.ep
index b9dae8a..418fdc7 100644
--- a/templates/index.html.ep
+++ b/templates/index.html.ep

@@ -81,7 +81,7 @@
        var collocatorTable_activated = false;
        $( "#tabs" ).on( "tabsactivate", function( event, ui ) {
          if (localStorage) localStorage['tab'] = ui.newTab.index();
-         if(ui.newTab.index() == 2 && !collocatorTable_activated) {
+         if(ui.newTab.index() == 3 && !collocatorTable_activated) {
            classicCollocatorTable.columns.adjust();
            collocatorTable.columns.adjust();
            collocatorTable_activated = true;
@@ -215,6 +215,51 @@
        var collocatorData = <%= b(Mojo::JSON::to_json($collocators)) %>;
        var maxHeat; // = Math.max.apply(Math,collocatorData.map(function(o){return o.cprob;}))
 
+         vocabDistanceTable = $('#vocabdistt').DataTable({
+           ajax: {
+             method: "GET",
+             url: baseURL + '/getBiggestVocabDistances',
+             dataType: 'json',
+             dataSrc: function (result) {
+               return result;
+             },
+             timeout: 30000,
+           },
+           "initComplete":function(settings, json){
+             vocabDistanceTable.columns.adjust().draw();
+           },
+           "createdRow": function (row, data, rowIndex) {
+             $.each($('td.collocator', row), function (colIndex) {
+               $(this).attr('title', "f("+data.word+")="+data.f2.toLocaleString("en-GB") + " f1: "+ccResult.f1+ " total: "+ccResult.N);
+             });
+           },
+           "sScrollY": "780px",
+           "bScrollCollapse": true,
+           "bPaginate": false,
+           "bJQueryUI": true,
+           "dom": '<"top">rt<"bottom"flp><"clear">',
+           "columns": [
+             { "data": "rank", type: "allnumeric" },
+             { "data": "dist",  render: function ( data, type, row ) {return data.toFixed(3) }},
+             { "data": "word",  class: "paradigmator", render: function ( data, type, row ) {
+               urlprefix.set("word", data); return  '<a class="' + getMergedClass(row.rank) + '" href="?' + urlprefix + '">' + data + '</a>' 
+             }}
+           ],
+           "columnDefs": [
+             { className: "dt-right", "targets": [0,1] },
+             { "searchable": false,
+               "orderable": false,
+               "targets": 0
+             },
+             { "orderSequence": [ "desc" ], "targets": [ 1 ] },
+             { "orderSequence": [ "asc", "desc" ], "targets": [ 2 ] },
+           ],
+           "oLanguage": {
+             "sSearch": "Filter: "
+           },
+           "order": [[ 1, 'desc' ]]
+         });
+
        if (collocatorData != null) {
          maxHeat = Math.max.apply(Math,collocatorData.map(function(o){return Math.max.apply(Math,o.heat);}))
          collocatorTable = $('#secondtable').DataTable({
@@ -299,6 +344,7 @@
              "order": [[ 0, 'desc' ]],
            });
          }
+
          //         var filterQuot = /(^quot?=[A-Z])|(quot$)/g;
          var filterQuot = /^quot/;
          var ccResult;
@@ -563,7 +609,7 @@
      var text;
 
      function getMergedClass(i) {
-       if(data.mergedEnd && i > data.mergedEnd) {
+       if(typeof data !== 'undefined' && i > data.mergedEnd) {
          return " merged"
        } else {
          return "";
@@ -828,10 +874,33 @@
     <div id="topwrapper">
       <div style="visibility: hidden;" id="tabs">
         <ul>
+          % if($distantWords) {
+          <li><a href="#tabs-0" title="Cos offsets of the words furthest away from their position in the reference corpus.">Offsets</a></li>
+          % }
           <li><a href="#tabs-1">Semantics (TSNE-map)</a></li>
           <li><a href="#tabs-2">Semantics (SOM)</a></li>
           <li><a href="#tabs-3">Syntagmatic (collocates)</a></li>
         </ul>
+        <div id="tabs-0" style="display: flex;  padding: 5px; flex-flow: row wrap;">
+          <div id="vocabdist" style="width: 230px; margin-bottom: 15px;">
+            <table class="display compact nowrap" id="vocabdistt">
+              <thead>
+                <tr>
+                  <th align="right">#</th><th align="right">cos</th><th align="left">word</th>
+                </tr>
+              </thead>
+              <tbody>
+                <tr>
+                  <td align="right">
+                      </td>
+                  <td align="right">
+                  </td>
+                  <td></td>
+                </tr>
+              </tbody>
+            </table>
+          </div>
+        </div>
         <div id="tabs-1" style="display: flex;  padding: 5px; flex-flow: row wrap;">
           % if($lists && (@$lists) > 0 && (@$lists)[0]) {
             <div id="wrapper">

diff --git a/w2v-server.pl b/w2v-server.pl
index 2f402ec..38dc8e8 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl

@@ -186,6 +186,16 @@
   $self->render(data => getClassicCollocatorsCached($self, $self->param("w") ? $self->param("w") : $self->req->json), format=>'json');
 };
 
+any '/getBiggestVocabDistances' => sub {
+  my $self = shift;
+  $self->render(data => getBiggestMergedDifferences(), format=>'json');
+};
+
+any '*/getBiggestVocabDistances' => sub {
+  my $self = shift;
+  $self->render(data => getBiggestMergedDifferences(), format=>'json');
+};
+
 any '*/getSimilarProfiles' => sub {
   my $self = shift;
   $self->render(data => getSimilarProfilesCached($self, $self->param("w") ? $self->param("w") : $self->req->json), format=>'json');
@@ -268,7 +278,11 @@
     $csv_data .= "\n";
     return $c->render(text=>$csv_data);
   } else {
-    $c->render(template=>"index", title=>$title, word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, haveSProfiles=> $have_sprofiles, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
+    my $distantWords="";
+    if(!defined($word) || $word !~ /^\s*$/) {
+      $distantWords = getBiggestMergedDifferences();
+    }
+    $c->render(template=>"index", title=>$title, word=>$word, distantWords=>$distantWords, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, haveSProfiles=> $have_sprofiles, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
   }
 };
 
@@ -585,7 +599,7 @@
   words += merge_words;
   fclose(f);
 	printf("merged_end: %lld, words: %lld\n", merged_end, words);
-	printBiggestMergedDifferences();
+	//printBiggestMergedDifferences();
   return((long) merged_end);
 }
 
@@ -823,12 +837,16 @@
   return(wl);
 }
 
-void printBiggestMergedDifferences() {
+char  *getBiggestMergedDifferences() {
+  static char *result = NULL;
   float dist, len, vec[max_size];
   long long a, b, c, d, cn, *bi;
   char ch;
   knn *nbs = NULL;
-  int N = 100;
+  int N = 1000;
+
+  if(result != NULL)
+    return result;
 
   printf("Looking for biggest distances between main and merged vectors ...\n");
   collocator *best;
@@ -857,11 +875,14 @@
     }
   }
 
-  printf("Most distant vectors for:\n ");
+  result = malloc(N*max_w);
+  char *p = result;
+  *p++ = '['; *p = 0;
   for (a = 0; a < N; a++) {
-    printf("%s ", &vocab[best[a].wordi * max_w]);
+    p += sprintf(p, "{\"rank\":%d,\"word\":\"%s\",\"dist\":%.3f},", a, &vocab[best[a].wordi * max_w], 1-best[a].activation);
   }
-  printf("\n");
+  *--p = ']';
+  return(result);
 }
 
 void *_get_neighbours(void *arg) {
commit	d7760b4d53069026fcb99397a096795a9490c6aa	[log] [tgz]
author	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Feb 21 09:01:44 2019 +0100
committer	Marc Kupietz <kupietz@ids-mannheim.de>	Thu Feb 21 09:01:44 2019 +0100
tree	39385c064950380d5d52a22309a011c878ef7ef4
parent	001bffda2195814b6f265287e7d1bdbf0388111f [diff]