Make project a Perl package

Change-Id: I9cf3c99a1b849d939ace9c9a3b265267e45ab21c
diff --git a/script/derekovecs-server b/script/derekovecs-server
new file mode 100755
index 0000000..8f71355
--- /dev/null
+++ b/script/derekovecs-server
@@ -0,0 +1,380 @@
+#!/usr/local/bin/perl
+our $VERSION = '0.90';
+
+use IDS::DeReKoVecs::Read qw(init_net load_sprofiles getCollocationAssociation getClassicCollocatorsCached getClassicCollocatorsCached getBiggestMergedDifferences filter_garbage get_neighbours);
+use Mojolicious::Lite;
+use Mojo::JSON qw(decode_json encode_json to_json);
+use base 'Mojolicious::Plugin';
+
+use Encode qw(decode encode);
+use Getopt::Std;
+#use Mojo::Server::Daemon;
+use Cwd;
+
+my $mojo_config = $ENV{MOJO_CONFIG} // '../derekovecs-server.conf';
+plugin Config => {file => $mojo_config};
+
+my $DEFAULT_VECS = app->config->{w2v}->{vecs} // "../models/dereko-2021-i.vecs";
+my $DEFAULT_NET_NAME = "";
+if ($DEFAULT_VECS=~ /\.vecs/) {
+  $DEFAULT_NET_NAME = $DEFAULT_VECS;
+  $DEFAULT_NET_NAME =~ s/\.vecs/.net/;
+}
+my $DEFAULT_NET = app->config->{w2v}->{net} // $DEFAULT_NET_NAME;
+
+app->static->paths->[0] = getcwd;
+
+plugin 'Piwik';
+plugin "RemoteAddr";
+plugin 'Util::RandomString' => {
+  piwik_rand_id => {
+    alphabet => '0123456789abcdef',
+    length   => 16
+  }
+};
+
+plugin 'Log::Access';
+plugin "RequestBase";
+#plugin 'AutoReload';
+plugin  Localize => {
+  dict => {
+    _  => sub { $_->locale },
+  },
+  resources => ['../derekovecs-server.dict']
+};
+our $opt_i = 0; # latin1-input?
+our $opt_l = undef;
+our $opt_m;
+our $opt_M;
+our $opt_n = $DEFAULT_NET;
+our $opt_d;
+our $opt_D;
+our $opt_G = 1;
+
+our $mergedEnd=0;
+our %cache;
+our %cccache; # classic collocator cache
+our %spcache; # similar profile cache
+our $opt_p = 5676;
+our $opt_C;
+
+my %marked;
+my $title="";
+my $training_args="";
+
+getopts('d:D:Gil:p:m:n:M:C');
+
+if($opt_M) {
+  open my $handle, '<:encoding(UTF-8)', $opt_M
+    or die "Can't open '$opt_M' for reading: $!";
+  while(<$handle>) {
+    foreach my $mw (split /\s+/) {
+      $marked{$mw}=1
+    }
+  }
+  close($handle);
+}
+
+my $vecs_name = (@ARGV > 0 && -r $ARGV[0] ? $ARGV[0] : $DEFAULT_VECS);
+init_net($vecs_name, $opt_n, ($opt_i? 1 : 0), 1);
+if(open(FILE, "$vecs_name.args")) {
+  $training_args = <FILE>;
+}
+close(FILE);
+$title = fname2corpusname($vecs_name);
+
+my $have_sprofiles = load_sprofiles($vecs_name);
+
+if (app->config->{w2v}->{merge}) {
+  $opt_m = app->config->{w2v}->{merge};
+}
+
+if($opt_m) {
+  $mergedEnd = mergeVectors($opt_m);
+  $title = "<span class=\"merged\">" . $title . "</span> vs. " . fname2corpusname($opt_m);
+}
+
+
+if($opt_d) { # -d: dump  vecs and exit
+	dump_vecs($opt_d);
+	exit;
+}
+
+if($opt_D) { # -D: dump  vecs for numpy and exit
+	dump_for_numpy($opt_D);
+	exit;
+}
+
+#my $daemon = Mojo::Server::Daemon->new(
+#    app    => app,
+#    listen => ['http://'.($opt_l ? $opt_l : '*').":$opt_p"]
+#);
+
+if($opt_G) {
+  print "Filtering garbage\n";
+  filter_garbage();
+  print "Finished filtering garbage\n";
+}
+
+get '*/js/*' => sub {
+	my $c = shift;
+	my $url = $c->req->url;
+	$url =~ s@/derekovecs/@/@g;
+  $c->app->log->info("GET: " . $url);
+	$c->reply->static($url);
+} => 'js';
+
+get '*/css/*' => sub {
+	my $c = shift;
+	my $url = $c->req->url;
+	$url =~ s@/derekovecs/@/@g;
+  $c->app->log->info("GET: " . $url);
+	$c->reply->static($url);
+} => 'css';
+
+sub fname2corpusname {
+  ($_) = @_;
+  s@.*/@@;
+  s@\.en@-en@;
+  s@\..*@@;
+  return $_;
+}
+
+sub getWord {
+  ($_) = @_;
+  if ($_ =~ /^\d+/) {
+    return $_;
+  } else {
+    return getWordNumber($_);
+  }
+}
+
+
+post '/derekovecs/getVecsByRanks' => sub {
+  my $self = shift;
+  my $vec = getVecs($self->req->json);
+  $self->render(json => $vec);
+};
+
+any '*/getCollocationAssociation' => sub {
+  my $self = shift;
+  $self->render(data => getCollocationAssociation($self, getWord($self->param("w") ? $self->param("w") : $self->req->json), getWord($self->param("c"))), format=>'json');
+} => 'getCollocationAssociation';
+
+any '/getCollocationAssociation' => sub {
+  my $self = shift;
+  $self->render(data => getCollocationAssociation($self, getWord($self->param("w") ? $self->param("w") : $self->req->json), getWord($self->param("c"))), format=>'json');
+} => 'getCollocationAssociation1';
+
+any '*/getClassicCollocators' => sub {
+  my $self = shift;
+  $self->render(data => getClassicCollocatorsCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+} => 'getClassicCollocators1';
+
+any '/getClassicCollocators' => sub {
+  my $self = shift;
+  $self->render(data => getClassicCollocatorsCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+} => 'getClassicCollocators';
+
+any '/getBiggestVocabDistances' => sub {
+  my $self = shift;
+  $self->render(data => getBiggestMergedDifferences(), format=>'json');
+} => 'getBiggestVocabDistances1';
+
+any '*/getBiggestVocabDistances' => sub {
+  my $self = shift;
+  $self->render(data => getBiggestMergedDifferences(), format=>'json');
+} => 'getBiggestVocabDistances';
+
+any '*/getPosWiseW2VCollocators' => sub {
+  my $self = shift;
+  $self->render(data => getPosWiseW2VCollocatorsAsTsv($self->param("w"),
+                                            ($self->param("max")? $self->param("max") : 200),
+                                            ($self->param("cutoff")? $self->param("cutoff") :750000),
+                                            ($self->param("threshold")? $self->param("threshold") : 0.2)),
+                                            format=>'tsv');
+};
+
+any '/getPosWiseW2VCollocators' => sub {
+  my $self = shift;
+  $self->render(data => getPosWiseW2VCollocatorsAsTsv($self->param("w"), 
+                                            ($self->param("max")? $self->param("max") : 200),
+                                            ($self->param("cutoff")? $self->param("cutoff") : 750000),
+                                            ($self->param("threshold")? $self->param("threshold") : 0.2)),
+                                            format=>'tsv');
+};
+
+any '*/getSimilarProfiles' => sub {
+  my $self = shift;
+  $self->render(data => getSimilarProfilesCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+};
+
+any '/getSimilarProfiles' => sub {
+  my $self = shift;
+  $self->render(data => getSimilarProfilesCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+};
+
+any '*/getWord' => sub {
+  my $self = shift;
+  my $w = $self->param("w");
+  my $rank = getWord($w);
+  my $status = 200;
+  if ($rank <= 0) {
+    $rank = -1;
+    $status = 404;
+  }
+  $self->render(data => encode_json({word => $w, frequencyRank => $rank}), format => 'json', status => $status);
+};
+
+any '/getWord' => sub {
+  my $self = shift;
+  my $w = $self->param("w");
+  my $rank = getWord($w);
+  my $status = 200;
+  if ($rank <= 0) {
+    $rank = -1;
+    $status = 404;
+  }
+  $self->render(data => encode_json({word => $w, frequencyRank => $rank}), format => 'json', status => $status);
+};
+
+any '/getSimilarity' => sub {
+  my $self = shift;
+  my $w1 = $self->param("w1");
+  my $w2 = $self->param("w2");
+  $self->render(data => cos_similarity_as_json($w1, $w2), format=>'json');
+};
+
+any '*/getSimilarity' => sub {
+  my $self = shift;
+  my $w1 = $self->param("w1");
+  my $w2 = $self->param("w2");
+  $self->render(data => cos_similarity_as_json($w1, $w2), format=>'json');
+};
+
+get '*/img/*' => sub {
+	my $c = shift;
+	my $url = $c->req->url;
+	$url =~ s@/derekovecs@@g;
+  $c->app->log->info("GET: " . $url);
+	$c->reply->static($url);
+};
+
+get '/' => sub {
+  my $c    = shift;
+  $c->app->log->info("get: ".$c->req->url->to_abs);
+	my $word=$c->param('word');
+  my $no_nbs=$c->param('n') || ($opt_m? 50 : 100);
+  my $no_iterations=$c->param('N') || 2000;
+  my $perplexity=$c->param('perplexity') || 20;
+  my $epsilon=$c->param('epsilon') || 5;
+  my $som=$c->param('som') || 0;
+	my $searchBaseVocabFirst=$c->param('sbf') || 0;
+  my $sort=$c->param('sort') || 0;
+  my $csv=$c->param('csv') || 0;
+  my $json=$c->param('json') || 0;
+  my $cutoff=$c->param('cutoff') || 500000;
+  my $dedupe=$c->param('dedupe') || 0;
+  my $nosp=$c->param('nosp') || 0;
+  my $res;
+	my @lists;
+	my @collocations;
+	if(defined($word) && $word !~ /^\s*$/) {
+		$c->inactivity_timeout(300);
+		$word =~ s/\s+/ /g;
+    if($opt_m && $word !~ /\|/) {
+      $word .= "|$word";
+    }
+    for my $w (split(' *\| *', $word)) {
+      if($opt_m) {
+        if($searchBaseVocabFirst) {
+          $searchBaseVocabFirst=0;
+        } else {
+          $searchBaseVocabFirst=1;
+        }
+      }
+      if ($cache{$w.$cutoff.$no_nbs.$sort.$dedupe,$searchBaseVocabFirst}) {
+        $c->app->log->info("Getting $w results from cache");
+        $res = $cache{$w.$cutoff.$no_nbs.$sort.$dedupe.$searchBaseVocabFirst}
+      } else {
+        $c->app->log->info('Looking for neighbours of '.$w);
+        if($opt_i) {
+          $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe, $nosp);
+        } else {
+          $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe, $nosp);
+        }
+        $cache{$w.$cutoff.$no_nbs.$sort.$dedupe} = $res;
+      }
+      push(@lists, $res->{paradigmatic});
+    }
+  }
+  
+	$word =~ s/ *\| */ | /g;
+  if($json) {
+    return $c->render(json => {word => $word, list => \@lists, collocators=>$res->{syntagmatic}});
+  } elsif($csv) {
+    my $csv_data="";
+    for (my $i=0; $i <= $no_nbs; $i++) {
+      $csv_data .= $res->{paradigmatic}->[$i]->{word} . ", ";
+    }
+    for (my $i=0; $i < $no_nbs; $i++) {
+      $csv_data .= $res->{syntagmatic}->[$i]->{word} . ", ";
+    }
+    chop $csv_data;
+    chop $csv_data;
+    $csv_data .= "\n";
+    return $c->render(text=>$csv_data);
+  } else {
+    my $distantWords="";
+    if(!defined($word) || $word !~ /^\s*$/) {
+      $distantWords = getBiggestMergedDifferences();
+    }
+    $c->render(template=>"index", title=>$title, word=>$word, distantWords=>$distantWords, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, haveSProfiles=> $have_sprofiles, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
+  }
+} => "paradigmaticAndSyntagmaticNbs";
+
+helper(bitvec2window => sub {
+         my ($self, $n) = @_;
+         my $str = unpack("B32", pack("N", $n));
+         $str =~ s/^\d{22}//;
+         $str =~ s/^(\d{5})/$1x/;
+         $str =~ s/0/·/g;
+         $str =~ s/1/+/g;
+         return $str;
+      });
+
+hook(
+      after_render => sub {
+        my $c = shift;
+
+        # Only track valid routes
+        my $route = $c->current_route or return;
+
+        # This won't forward personalized information
+        my $hash = {
+          action_url =>  $c->req->url->to_abs,
+          action_name => $route,
+          ua => $c->req->headers->user_agent,
+          urlref => '',
+          send_image => 0,
+          dnt => 0,
+          cip => $c->remote_addr,
+          lang => $c->req->headers->accept_language,
+          uid => $c->random_string('piwik_rand_id')
+        };
+        # $c->app->log->info("PIWIK: counting " . $hash->{action_url} . "\nremote:" . $c->remote_addr);
+        # $c->app->log->info("PIWIK: tag " . $c->piwik_tag);
+
+        # Send track
+        $c->piwik->api_p(Track => $hash)->wait;
+
+        # $c->app->log->info("PIWIK: counted.");
+      }
+);
+
+app->renderer->paths([app->home->rel_file('../templates')]);
+app->start;
+#$daemon->run;
+# app->start;
+
+# exit;