| #!/usr/local/bin/perl |
| use Inline C => "./w2v-server.c" => CLEAN_AFTER_BUILD => 0, BUILD_NOISY => 1, ccflags => $Config{ccflags}." -I/vol/work/kupietz/Work2/kl/trunk/CollocatorDB -I, -L. -Wall -O4", libs => "-shared -lpthread -lcollocatordb -lrt -lsnappy -lz -lbz2 -llz4 -lzstd -lrocksdb -lgomp"; |
| #use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9"; |
| #use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -Ofast -march k8 -mtune k8 "; |
| use Mojolicious::Lite; |
| use Mojo::JSON qw(decode_json encode_json to_json); |
| use base 'Mojolicious::Plugin'; |
| |
| use Encode qw(decode encode); |
| use Getopt::Std; |
| use Mojo::Server::Daemon; |
| use Cwd; |
| |
| app->static->paths->[0] = getcwd; |
| |
| plugin 'Log::Access'; |
| plugin "RequestBase"; |
| |
| our $opt_i = 0; # latin1-input? |
| our $opt_l = undef; |
| our $opt_p = 5676; |
| our $opt_m; |
| our $opt_M; |
| our $opt_n = ''; |
| our $opt_d; |
| our $opt_D; |
| our $opt_G; |
| |
| my %marked; |
| my $title=""; |
| my $training_args=""; |
| my $mergedEnd=0; |
| my %cache; |
| my %cccache; # classic collocator cache |
| my %spcache; # similar profile cache |
| |
| getopts('d:D:Gil:p:m:n:M:'); |
| |
| if($opt_M) { |
| open my $handle, '<:encoding(UTF-8)', $opt_M |
| or die "Can't open '$opt_M' for reading: $!"; |
| while(<$handle>) { |
| foreach my $mw (split /\s+/) { |
| $marked{$mw}=1 |
| } |
| } |
| close($handle); |
| } |
| |
| # -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15 |
| if(!$ARGV[0]) { |
| init_net("vectors15.bin", $opt_n, ($opt_i? 1 : 0)); |
| } else { |
| init_net($ARGV[0], $opt_n, ($opt_i? 1 : 0)); |
| if(open(FILE, "$ARGV[0].args")) { |
| $training_args = <FILE>; |
| } |
| close(FILE); |
| $title = fname2corpusname($ARGV[0]); |
| } |
| |
| my $have_sprofiles = load_sprofiles($ARGV[0]); |
| |
| if($opt_m) { |
| $mergedEnd = mergeVectors($opt_m); |
| $title = "<span class=\"merged\">" . $title . "</span> vs. " . fname2corpusname($opt_m); |
| } |
| |
| |
| if($opt_d) { # -d: dump vecs and exit |
| dump_vecs($opt_d); |
| exit; |
| } |
| |
| if($opt_D) { # -D: dump vecs for numpy and exit |
| dump_for_numpy($opt_D); |
| exit; |
| } |
| |
| my $daemon = Mojo::Server::Daemon->new( |
| app => app, |
| listen => ['http://'.($opt_l ? $opt_l : '*').":$opt_p"] |
| ); |
| |
| if($opt_G) { |
| print "Filtering garbage\n"; |
| filter_garbage(); |
| } |
| |
| get '*/js/*' => sub { |
| my $c = shift; |
| my $url = $c->req->url; |
| $url =~ s@/derekovecs@@g; |
| $c->app->log->info("GET: " . $url); |
| $c->reply->static($url); |
| }; |
| |
| get '*/css/*' => sub { |
| my $c = shift; |
| my $url = $c->req->url; |
| $url =~ s@/derekovecs/@/@g; |
| $c->app->log->info("GET: " . $url); |
| $c->reply->static($url); |
| }; |
| |
| sub fname2corpusname { |
| ($_) = @_; |
| s@.*/@@; |
| s@\.en@-en@; |
| s@\..*@@; |
| return $_; |
| } |
| |
| sub getClassicCollocatorsCached { |
| my ($c, $word) = @_; |
| my $s2 = ""; |
| if($word > $mergedEnd) { |
| $word-=$mergedEnd; |
| } |
| |
| if($opt_p >= 5000 && $opt_p < 5600) { # German non-reference |
| open PIPE, "GET http://compute:5673/getClassicCollocators?w=$word |" or |
| open PIPE, "GET http://klinux10:5673/getClassicCollocators?w=$word |"; |
| } |
| if(!$cccache{$word}) { |
| $c->app->log->info("Getting classic collocates of $word."); |
| $cccache{$word} = getClassicCollocators($word); |
| $cccache{$word} =~ s/:(-?)(nan|inf)/:"${1}${2}"/g; |
| } else { |
| $c->app->log->info("Getting classic collocates for $word from cache."); |
| } |
| if($opt_p >= 5000 && $opt_p < 5600) { # German non-reference |
| while(<PIPE>) { |
| $s2 .= $_; |
| } |
| close(PIPE); |
| } |
| |
| if(length($s2) > 2000) { |
| my $d1 = decode_json($cccache{$word}); |
| my $d2 = decode_json($s2); |
| my %d2ld; |
| my $minLd = 14; |
| foreach my $i (@{$d2->{collocates}}) { |
| $d2ld{$i->{word}}=$i->{ld}; |
| $minLd=$i->{ld} if($i->{ld} < $minLd); |
| } |
| foreach my $i (@{$d1->{collocates}}) { |
| my $w = $i->{word}; |
| $i->{delta} = $i->{ld} - (defined $d2ld{$w} ? $d2ld{$w} : $minLd-0.1); |
| } |
| return(encode_json($d1)); |
| } else { |
| my $d1 = decode_json($cccache{$word}); |
| foreach my $i (@{$d1->{collocates}}) { |
| $i->{delta} = 0; |
| } |
| return(encode_json($d1)); |
| } |
| } |
| |
| sub getSimilarProfilesCached { |
| my ($c, $word) = @_; |
| if(!$spcache{$word}) { |
| $spcache{$word} = getSimilarProfiles($word); |
| } else { |
| $c->app->log->info("Getting similar profiles for $word from cache:"); |
| } |
| return $spcache{$word}; |
| } |
| |
| post '/derekovecs/getVecsByRanks' => sub { |
| my $self = shift; |
| my $vec = getVecs($self->req->json); |
| $self->render(json => $vec); |
| }; |
| |
| any '*/getClassicCollocators' => sub { |
| my $self = shift; |
| $self->render(data => getClassicCollocatorsCached($self, $self->param("w") ? $self->param("w") : $self->req->json), format=>'json'); |
| }; |
| |
| any '/getClassicCollocators' => sub { |
| my $self = shift; |
| $self->render(data => getClassicCollocatorsCached($self, $self->param("w") ? $self->param("w") : $self->req->json), format=>'json'); |
| }; |
| |
| any '/getBiggestVocabDistances' => sub { |
| my $self = shift; |
| $self->render(data => getBiggestMergedDifferences(), format=>'json'); |
| }; |
| |
| any '*/getBiggestVocabDistances' => sub { |
| my $self = shift; |
| $self->render(data => getBiggestMergedDifferences(), format=>'json'); |
| }; |
| |
| any '*/getPosWiseW2VCollocators' => sub { |
| my $self = shift; |
| $self->render(data => getPosWiseW2VCollocatorsAsTsv($self->param("w"), |
| ($self->param("max")? $self->param("max") : 200), |
| ($self->param("cutoff")? $self->param("cutoff") :750000), |
| ($self->param("threshold")? $self->param("threshold") : 0.2)), |
| format=>'tsv'); |
| }; |
| |
| any '/getPosWiseW2VCollocators' => sub { |
| my $self = shift; |
| $self->render(data => getPosWiseW2VCollocatorsAsTsv($self->param("w"), |
| ($self->param("max")? $self->param("max") : 200), |
| ($self->param("cutoff")? $self->param("cutoff") : 750000), |
| ($self->param("threshold")? $self->param("threshold") : 0.2)), |
| format=>'tsv'); |
| }; |
| |
| any '*/getSimilarProfiles' => sub { |
| my $self = shift; |
| $self->render(data => getSimilarProfilesCached($self, $self->param("w") ? $self->param("w") : $self->req->json), format=>'json'); |
| }; |
| |
| any '/getSimilarProfiles' => sub { |
| my $self = shift; |
| $self->render(data => getSimilarProfilesCached($self, $self->param("w") ? $self->param("w") : $self->req->json), format=>'json'); |
| }; |
| |
| any '/getSimilarity' => sub { |
| my $self = shift; |
| my $w1 = $self->param("w1"); |
| my $w2 = $self->param("w2"); |
| $self->render(data => cos_similarity_as_json($w1, $w2), format=>'json'); |
| }; |
| |
| any '*/getSimilarity' => sub { |
| my $self = shift; |
| my $w1 = $self->param("w1"); |
| my $w2 = $self->param("w2"); |
| $self->render(data => cos_similarity_as_json($w1, $w2), format=>'json'); |
| }; |
| |
| get '*/img/*' => sub { |
| my $c = shift; |
| my $url = $c->req->url; |
| $url =~ s@/derekovecs@@g; |
| $c->app->log->info("GET: " . $url); |
| $c->reply->static($url); |
| }; |
| |
| get '/' => sub { |
| my $c = shift; |
| $c->app->log->info("get: ".$c->req->url->to_abs); |
| my $word=$c->param('word'); |
| my $no_nbs=$c->param('n') || ($opt_m? 50 : 100); |
| my $no_iterations=$c->param('N') || 2000; |
| my $perplexity=$c->param('perplexity') || 20; |
| my $epsilon=$c->param('epsilon') || 5; |
| my $som=$c->param('som') || 0; |
| my $searchBaseVocabFirst=$c->param('sbf') || 0; |
| my $sort=$c->param('sort') || 0; |
| my $csv=$c->param('csv') || 0; |
| my $json=$c->param('json') || 0; |
| my $cutoff=$c->param('cutoff') || 500000; |
| my $dedupe=$c->param('dedupe') || 0; |
| my $nosp=$c->param('nosp') || 0; |
| my $res; |
| my @lists; |
| my @collocations; |
| $word="idea" if(!defined($word)); |
| if(defined($word) && $word !~ /^\s*$/) { |
| $c->inactivity_timeout(300); |
| $word =~ s/\s+/ /g; |
| if($opt_m && $word !~ /\|/) { |
| $word .= "|$word"; |
| } |
| for my $w (split(' *\| *', $word)) { |
| if($opt_m) { |
| if($searchBaseVocabFirst) { |
| $searchBaseVocabFirst=0; |
| } else { |
| $searchBaseVocabFirst=1; |
| } |
| } |
| if ($cache{$w.$cutoff.$no_nbs.$sort.$dedupe,$searchBaseVocabFirst}) { |
| $c->app->log->info("Getting $w results from cache"); |
| $res = $cache{$w.$cutoff.$no_nbs.$sort.$dedupe.$searchBaseVocabFirst} |
| } else { |
| $c->app->log->info('Looking for neighbours of '.$w); |
| if($opt_i) { |
| $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe, $nosp); |
| } else { |
| $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe, $nosp); |
| } |
| $cache{$w.$cutoff.$no_nbs.$sort.$dedupe} = $res; |
| } |
| push(@lists, $res->{paradigmatic}); |
| } |
| } |
| |
| $word =~ s/ *\| */ | /g; |
| if($json) { |
| return $c->render(json => {word => $word, list => \@lists, collocators=>$res->{syntagmatic}}); |
| } elsif($csv) { |
| my $csv_data=""; |
| for (my $i=0; $i <= $no_nbs; $i++) { |
| $csv_data .= $res->{paradigmatic}->[$i]->{word} . ", "; |
| } |
| for (my $i=0; $i < $no_nbs; $i++) { |
| $csv_data .= $res->{syntagmatic}->[$i]->{word} . ", "; |
| } |
| chop $csv_data; |
| chop $csv_data; |
| $csv_data .= "\n"; |
| return $c->render(text=>$csv_data); |
| } else { |
| my $distantWords=""; |
| if(!defined($word) || $word !~ /^\s*$/) { |
| $distantWords = getBiggestMergedDifferences(); |
| } |
| $c->render(template=>"index", title=>$title, word=>$word, distantWords=>$distantWords, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, haveSProfiles=> $have_sprofiles, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic}); |
| } |
| }; |
| |
| helper(bitvec2window => sub { |
| my ($self, $n) = @_; |
| my $str = unpack("B32", pack("N", $n)); |
| $str =~ s/^\d{22}//; |
| $str =~ s/^(\d{5})/$1x/; |
| $str =~ s/0/ยท/g; |
| $str =~ s/1/+/g; |
| return $str; |
| }); |
| |
| $daemon->run; # app->start; |
| |
| exit; |