Make project a Perl package
Change-Id: I9cf3c99a1b849d939ace9c9a3b265267e45ab21c
diff --git a/script/derekovecs-server b/script/derekovecs-server
new file mode 100755
index 0000000..8f71355
--- /dev/null
+++ b/script/derekovecs-server
@@ -0,0 +1,380 @@
+#!/usr/local/bin/perl
+our $VERSION = '0.90';
+
+use IDS::DeReKoVecs::Read qw(init_net load_sprofiles getCollocationAssociation getClassicCollocatorsCached getClassicCollocatorsCached getBiggestMergedDifferences filter_garbage get_neighbours);
+use Mojolicious::Lite;
+use Mojo::JSON qw(decode_json encode_json to_json);
+use base 'Mojolicious::Plugin';
+
+use Encode qw(decode encode);
+use Getopt::Std;
+#use Mojo::Server::Daemon;
+use Cwd;
+
+my $mojo_config = $ENV{MOJO_CONFIG} // '../derekovecs-server.conf';
+plugin Config => {file => $mojo_config};
+
+my $DEFAULT_VECS = app->config->{w2v}->{vecs} // "../models/dereko-2021-i.vecs";
+my $DEFAULT_NET_NAME = "";
+if ($DEFAULT_VECS=~ /\.vecs/) {
+ $DEFAULT_NET_NAME = $DEFAULT_VECS;
+ $DEFAULT_NET_NAME =~ s/\.vecs/.net/;
+}
+my $DEFAULT_NET = app->config->{w2v}->{net} // $DEFAULT_NET_NAME;
+
+app->static->paths->[0] = getcwd;
+
+plugin 'Piwik';
+plugin "RemoteAddr";
+plugin 'Util::RandomString' => {
+ piwik_rand_id => {
+ alphabet => '0123456789abcdef',
+ length => 16
+ }
+};
+
+plugin 'Log::Access';
+plugin "RequestBase";
+#plugin 'AutoReload';
+plugin Localize => {
+ dict => {
+ _ => sub { $_->locale },
+ },
+ resources => ['../derekovecs-server.dict']
+};
+our $opt_i = 0; # latin1-input?
+our $opt_l = undef;
+our $opt_m;
+our $opt_M;
+our $opt_n = $DEFAULT_NET;
+our $opt_d;
+our $opt_D;
+our $opt_G = 1;
+
+our $mergedEnd=0;
+our %cache;
+our %cccache; # classic collocator cache
+our %spcache; # similar profile cache
+our $opt_p = 5676;
+our $opt_C;
+
+my %marked;
+my $title="";
+my $training_args="";
+
+getopts('d:D:Gil:p:m:n:M:C');
+
+if($opt_M) {
+ open my $handle, '<:encoding(UTF-8)', $opt_M
+ or die "Can't open '$opt_M' for reading: $!";
+ while(<$handle>) {
+ foreach my $mw (split /\s+/) {
+ $marked{$mw}=1
+ }
+ }
+ close($handle);
+}
+
+my $vecs_name = (@ARGV > 0 && -r $ARGV[0] ? $ARGV[0] : $DEFAULT_VECS);
+init_net($vecs_name, $opt_n, ($opt_i? 1 : 0), 1);
+if(open(FILE, "$vecs_name.args")) {
+ $training_args = <FILE>;
+}
+close(FILE);
+$title = fname2corpusname($vecs_name);
+
+my $have_sprofiles = load_sprofiles($vecs_name);
+
+if (app->config->{w2v}->{merge}) {
+ $opt_m = app->config->{w2v}->{merge};
+}
+
+if($opt_m) {
+ $mergedEnd = mergeVectors($opt_m);
+ $title = "<span class=\"merged\">" . $title . "</span> vs. " . fname2corpusname($opt_m);
+}
+
+
+if($opt_d) { # -d: dump vecs and exit
+ dump_vecs($opt_d);
+ exit;
+}
+
+if($opt_D) { # -D: dump vecs for numpy and exit
+ dump_for_numpy($opt_D);
+ exit;
+}
+
+#my $daemon = Mojo::Server::Daemon->new(
+# app => app,
+# listen => ['http://'.($opt_l ? $opt_l : '*').":$opt_p"]
+#);
+
+if($opt_G) {
+ print "Filtering garbage\n";
+ filter_garbage();
+ print "Finished filtering garbage\n";
+}
+
+get '*/js/*' => sub {
+ my $c = shift;
+ my $url = $c->req->url;
+ $url =~ s@/derekovecs/@/@g;
+ $c->app->log->info("GET: " . $url);
+ $c->reply->static($url);
+} => 'js';
+
+get '*/css/*' => sub {
+ my $c = shift;
+ my $url = $c->req->url;
+ $url =~ s@/derekovecs/@/@g;
+ $c->app->log->info("GET: " . $url);
+ $c->reply->static($url);
+} => 'css';
+
+sub fname2corpusname {
+ ($_) = @_;
+ s@.*/@@;
+ s@\.en@-en@;
+ s@\..*@@;
+ return $_;
+}
+
+sub getWord {
+ ($_) = @_;
+ if ($_ =~ /^\d+/) {
+ return $_;
+ } else {
+ return getWordNumber($_);
+ }
+}
+
+
+post '/derekovecs/getVecsByRanks' => sub {
+ my $self = shift;
+ my $vec = getVecs($self->req->json);
+ $self->render(json => $vec);
+};
+
+any '*/getCollocationAssociation' => sub {
+ my $self = shift;
+ $self->render(data => getCollocationAssociation($self, getWord($self->param("w") ? $self->param("w") : $self->req->json), getWord($self->param("c"))), format=>'json');
+} => 'getCollocationAssociation';
+
+any '/getCollocationAssociation' => sub {
+ my $self = shift;
+ $self->render(data => getCollocationAssociation($self, getWord($self->param("w") ? $self->param("w") : $self->req->json), getWord($self->param("c"))), format=>'json');
+} => 'getCollocationAssociation1';
+
+any '*/getClassicCollocators' => sub {
+ my $self = shift;
+ $self->render(data => getClassicCollocatorsCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+} => 'getClassicCollocators1';
+
+any '/getClassicCollocators' => sub {
+ my $self = shift;
+ $self->render(data => getClassicCollocatorsCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+} => 'getClassicCollocators';
+
+any '/getBiggestVocabDistances' => sub {
+ my $self = shift;
+ $self->render(data => getBiggestMergedDifferences(), format=>'json');
+} => 'getBiggestVocabDistances1';
+
+any '*/getBiggestVocabDistances' => sub {
+ my $self = shift;
+ $self->render(data => getBiggestMergedDifferences(), format=>'json');
+} => 'getBiggestVocabDistances';
+
+any '*/getPosWiseW2VCollocators' => sub {
+ my $self = shift;
+ $self->render(data => getPosWiseW2VCollocatorsAsTsv($self->param("w"),
+ ($self->param("max")? $self->param("max") : 200),
+ ($self->param("cutoff")? $self->param("cutoff") :750000),
+ ($self->param("threshold")? $self->param("threshold") : 0.2)),
+ format=>'tsv');
+};
+
+any '/getPosWiseW2VCollocators' => sub {
+ my $self = shift;
+ $self->render(data => getPosWiseW2VCollocatorsAsTsv($self->param("w"),
+ ($self->param("max")? $self->param("max") : 200),
+ ($self->param("cutoff")? $self->param("cutoff") : 750000),
+ ($self->param("threshold")? $self->param("threshold") : 0.2)),
+ format=>'tsv');
+};
+
+any '*/getSimilarProfiles' => sub {
+ my $self = shift;
+ $self->render(data => getSimilarProfilesCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+};
+
+any '/getSimilarProfiles' => sub {
+ my $self = shift;
+ $self->render(data => getSimilarProfilesCached($self, getWord($self->param("w") ? $self->param("w") : $self->req->json)), format=>'json');
+};
+
+any '*/getWord' => sub {
+ my $self = shift;
+ my $w = $self->param("w");
+ my $rank = getWord($w);
+ my $status = 200;
+ if ($rank <= 0) {
+ $rank = -1;
+ $status = 404;
+ }
+ $self->render(data => encode_json({word => $w, frequencyRank => $rank}), format => 'json', status => $status);
+};
+
+any '/getWord' => sub {
+ my $self = shift;
+ my $w = $self->param("w");
+ my $rank = getWord($w);
+ my $status = 200;
+ if ($rank <= 0) {
+ $rank = -1;
+ $status = 404;
+ }
+ $self->render(data => encode_json({word => $w, frequencyRank => $rank}), format => 'json', status => $status);
+};
+
+any '/getSimilarity' => sub {
+ my $self = shift;
+ my $w1 = $self->param("w1");
+ my $w2 = $self->param("w2");
+ $self->render(data => cos_similarity_as_json($w1, $w2), format=>'json');
+};
+
+any '*/getSimilarity' => sub {
+ my $self = shift;
+ my $w1 = $self->param("w1");
+ my $w2 = $self->param("w2");
+ $self->render(data => cos_similarity_as_json($w1, $w2), format=>'json');
+};
+
+get '*/img/*' => sub {
+ my $c = shift;
+ my $url = $c->req->url;
+ $url =~ s@/derekovecs@@g;
+ $c->app->log->info("GET: " . $url);
+ $c->reply->static($url);
+};
+
+get '/' => sub {
+ my $c = shift;
+ $c->app->log->info("get: ".$c->req->url->to_abs);
+ my $word=$c->param('word');
+ my $no_nbs=$c->param('n') || ($opt_m? 50 : 100);
+ my $no_iterations=$c->param('N') || 2000;
+ my $perplexity=$c->param('perplexity') || 20;
+ my $epsilon=$c->param('epsilon') || 5;
+ my $som=$c->param('som') || 0;
+ my $searchBaseVocabFirst=$c->param('sbf') || 0;
+ my $sort=$c->param('sort') || 0;
+ my $csv=$c->param('csv') || 0;
+ my $json=$c->param('json') || 0;
+ my $cutoff=$c->param('cutoff') || 500000;
+ my $dedupe=$c->param('dedupe') || 0;
+ my $nosp=$c->param('nosp') || 0;
+ my $res;
+ my @lists;
+ my @collocations;
+ if(defined($word) && $word !~ /^\s*$/) {
+ $c->inactivity_timeout(300);
+ $word =~ s/\s+/ /g;
+ if($opt_m && $word !~ /\|/) {
+ $word .= "|$word";
+ }
+ for my $w (split(' *\| *', $word)) {
+ if($opt_m) {
+ if($searchBaseVocabFirst) {
+ $searchBaseVocabFirst=0;
+ } else {
+ $searchBaseVocabFirst=1;
+ }
+ }
+ if ($cache{$w.$cutoff.$no_nbs.$sort.$dedupe,$searchBaseVocabFirst}) {
+ $c->app->log->info("Getting $w results from cache");
+ $res = $cache{$w.$cutoff.$no_nbs.$sort.$dedupe.$searchBaseVocabFirst}
+ } else {
+ $c->app->log->info('Looking for neighbours of '.$w);
+ if($opt_i) {
+ $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe, $nosp);
+ } else {
+ $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe, $nosp);
+ }
+ $cache{$w.$cutoff.$no_nbs.$sort.$dedupe} = $res;
+ }
+ push(@lists, $res->{paradigmatic});
+ }
+ }
+
+ $word =~ s/ *\| */ | /g;
+ if($json) {
+ return $c->render(json => {word => $word, list => \@lists, collocators=>$res->{syntagmatic}});
+ } elsif($csv) {
+ my $csv_data="";
+ for (my $i=0; $i <= $no_nbs; $i++) {
+ $csv_data .= $res->{paradigmatic}->[$i]->{word} . ", ";
+ }
+ for (my $i=0; $i < $no_nbs; $i++) {
+ $csv_data .= $res->{syntagmatic}->[$i]->{word} . ", ";
+ }
+ chop $csv_data;
+ chop $csv_data;
+ $csv_data .= "\n";
+ return $c->render(text=>$csv_data);
+ } else {
+ my $distantWords="";
+ if(!defined($word) || $word !~ /^\s*$/) {
+ $distantWords = getBiggestMergedDifferences();
+ }
+ $c->render(template=>"index", title=>$title, word=>$word, distantWords=>$distantWords, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, haveSProfiles=> $have_sprofiles, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
+ }
+} => "paradigmaticAndSyntagmaticNbs";
+
+helper(bitvec2window => sub {
+ my ($self, $n) = @_;
+ my $str = unpack("B32", pack("N", $n));
+ $str =~ s/^\d{22}//;
+ $str =~ s/^(\d{5})/$1x/;
+ $str =~ s/0/·/g;
+ $str =~ s/1/+/g;
+ return $str;
+ });
+
+hook(
+ after_render => sub {
+ my $c = shift;
+
+ # Only track valid routes
+ my $route = $c->current_route or return;
+
+ # This won't forward personalized information
+ my $hash = {
+ action_url => $c->req->url->to_abs,
+ action_name => $route,
+ ua => $c->req->headers->user_agent,
+ urlref => '',
+ send_image => 0,
+ dnt => 0,
+ cip => $c->remote_addr,
+ lang => $c->req->headers->accept_language,
+ uid => $c->random_string('piwik_rand_id')
+ };
+ # $c->app->log->info("PIWIK: counting " . $hash->{action_url} . "\nremote:" . $c->remote_addr);
+ # $c->app->log->info("PIWIK: tag " . $c->piwik_tag);
+
+ # Send track
+ $c->piwik->api_p(Track => $hash)->wait;
+
+ # $c->app->log->info("PIWIK: counted.");
+ }
+);
+
+app->renderer->paths([app->home->rel_file('../templates')]);
+app->start;
+#$daemon->run;
+# app->start;
+
+# exit;