blob: 4aa687c4b79e5c4c144f1da1b23d5b3480a1d8d8 [file] [log] [blame]
Marc Kupietzdc22b982015-10-09 09:19:34 +02001#!/usr/local/bin/perl
Marc Kupietzf11d20c2019-08-02 15:42:04 +02002use Inline C => "./w2v-server.c" => CLEAN_AFTER_BUILD => 0, BUILD_NOISY => 1, ccflags => $Config{ccflags}." -I/vol/work/kupietz/Work2/kl/trunk/CollocatorDB -I, -L. -Wall -O4", libs => "-shared -lpthread -lcollocatordb -lrt -lsnappy -lz -lbz2 -llz4 -lzstd -lrocksdb -lgomp";
Marc Kupietza5f60042017-05-04 10:38:12 +02003#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
4#use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -Ofast -march k8 -mtune k8 ";
Marc Kupietzdc22b982015-10-09 09:19:34 +02005use Mojolicious::Lite;
Marc Kupietzc4893362016-02-25 08:04:46 +01006use Mojo::JSON qw(decode_json encode_json to_json);
Marc Kupietz30ca4342017-11-22 21:21:20 +01007use base 'Mojolicious::Plugin';
8
Marc Kupietz247500f2015-10-09 11:29:01 +02009use Encode qw(decode encode);
Marc Kupietza5b90152016-03-15 17:39:19 +010010use Getopt::Std;
Marc Kupietz7bc85fd2016-02-24 11:42:41 +010011use Mojo::Server::Daemon;
Marc Kupietzffef9302017-11-07 15:58:01 +010012use Cwd;
Marc Kupietz66bfd952017-12-11 09:59:45 +010013
# Serve static files from the directory the server was started in.
app->static->paths->[0] = getcwd();

plugin('Log::Access');
plugin('RequestBase');

# Command line switches; Getopt::Std's getopts() below fills these package
# variables.
our $opt_i = 0;     # latin1-input?
our $opt_l;         # listen address ('*' is used when unset)
our $opt_p = 5676;  # listen port
our $opt_m;         # handed to mergeVectors()
our $opt_M;         # file with whitespace-separated words to mark
our $opt_n = '';    # second argument of init_net()
our $opt_d;         # -d: dump vecs and exit
our $opt_D;         # -D: dump vecs for numpy and exit
our $opt_G;         # -G: run filter_garbage() at startup

# File-scoped state shared by the start-up code and the route handlers.
my %marked;              # words read from the -M file
my $title         = "";  # page title derived from the model file name(s)
my $training_args = "";  # first line of "<model>.args", if present
my $mergedEnd     = 0;   # return value of mergeVectors()
my %cache;               # neighbour query results
my %cccache;             # classic collocator cache
my %spcache;             # similar profile cache

getopts('d:D:Gil:p:m:n:M:');
Marc Kupietz6ed81872016-04-27 14:04:04 +020038
# -M <file>: read a UTF-8 file of whitespace-separated words and remember
# each of them in %marked.
if ($opt_M) {
    open my $handle, '<:encoding(UTF-8)', $opt_M
        or die "Can't open '$opt_M' for reading: $!";
    while (my $line = <$handle>) {
        # split ' ' skips leading whitespace, so indented lines no longer
        # add an empty-string entry to %marked (split /\s+/ did).
        foreach my $mw (split ' ', $line) {
            $marked{$mw} = 1;
        }
    }
    close($handle);
}
Marc Kupietza5b90152016-03-15 17:39:19 +010049
Marc Kupietz7bc85fd2016-02-24 11:42:41 +010050# -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
# Load the vector model: the first command line argument, or "vectors15.bin"
# when none is given. For an explicitly named model, the first line of
# "<model>.args" (if readable) becomes $training_args and the page title is
# derived from the model file name.
if (!$ARGV[0]) {
    init_net("vectors15.bin", $opt_n, ($opt_i ? 1 : 0));
}
else {
    init_net($ARGV[0], $opt_n, ($opt_i ? 1 : 0));
    # Three-argument open with a lexical handle: the file name can no longer
    # be interpreted as an open mode, and the handle is only closed when it
    # was actually opened.
    if (open(my $args_fh, '<', "$ARGV[0].args")) {
        my $first_line = <$args_fh>;
        # Keep the "" default when the file is empty instead of storing undef.
        $training_args = $first_line if defined $first_line;
        close($args_fh);
    }
    $title = fname2corpusname($ARGV[0]);
}

# NOTE(review): this receives undef when the default model is used - confirm
# that load_sprofiles() copes with that.
my $have_sprofiles = load_sprofiles($ARGV[0]);
63
# -m: merge a second model and show both corpus names in the title.
if ($opt_m) {
    $mergedEnd = mergeVectors($opt_m);
    my $merged_name = fname2corpusname($opt_m);
    $title = qq{<span class="merged">} . $title . qq{</span> vs. } . $merged_name;
}

# -d: dump vecs and exit
if ($opt_d) {
    dump_vecs($opt_d);
    exit;
}

# -D: dump vecs for numpy and exit
if ($opt_D) {
    dump_for_numpy($opt_D);
    exit;
}

# The HTTP daemon listens on -l (default: all interfaces) and port -p.
my $listen_host = $opt_l ? $opt_l : '*';
my $daemon = Mojo::Server::Daemon->new(
    app    => app,
    listen => ["http://$listen_host:$opt_p"],
);

# -G: filter garbage before serving
if ($opt_G) {
    print "Filtering garbage\n";
    filter_garbage();
}
89
# Static JavaScript files: drop every "/derekovecs" from the request URL,
# log the result and serve it as a static file.
get '*/js/*' => sub {
    my ($controller) = @_;
    my $path = $controller->req->url;
    $path =~ s@/derekovecs@@g;
    $controller->app->log->info("GET: $path");
    $controller->reply->static($path);
};

# Static style sheets: same as above, but "/derekovecs/" is collapsed to "/".
get '*/css/*' => sub {
    my ($controller) = @_;
    my $path = $controller->req->url;
    $path =~ s@/derekovecs/@/@g;
    $controller->app->log->info("GET: $path");
    $controller->reply->static($path);
};
105
# Derive a corpus name from a model file name.
#
# Strips the directory part, turns the first ".en" into "-en" and then cuts
# everything from the first remaining dot onwards.
# Example: "/data/dereko.en.vecs" -> "dereko-en".
#
# Works on a lexical copy, so the caller's $_ is left untouched (the previous
# implementation assigned to the global $_).
sub fname2corpusname {
    my ($name) = @_;
    $name =~ s@.*/@@;
    $name =~ s@\.en@-en@;
    $name =~ s@\..*@@;
    return $name;
}
113
# Normalise a word argument: a value consisting only of digits is returned
# unchanged, anything else is passed through getWordNumber().
#
# The digit test is anchored at both ends, so words that merely start with a
# digit (e.g. "3sat") are looked up instead of being returned verbatim, and
# the caller's $_ is no longer overwritten.
sub getWord {
    my ($word) = @_;
    if (defined $word && $word =~ /\A\d+\z/) {
        return $word;
    }
    else {
        return getWordNumber($word);
    }
}
122
# Start fetching the collocators of $word from the server on port 5673 via
# the "GET" command line tool. Tries host "compute" first, then "klinux10".
# Returns a read handle on the command's output, or nothing if the command
# could not be started.
#
# The list form of open() runs GET without a shell, so $word (which comes
# from the HTTP request) can no longer inject shell commands.
sub _open_reference_collocators {
    my ($word) = @_;
    foreach my $host (qw(compute klinux10)) {
        my $url = "http://$host:5673/getClassicCollocators?w=$word";
        my $pipe;
        return $pipe if open($pipe, '-|', 'GET', $url);
    }
    return;
}

# Return the classic collocators of $word as a JSON string, using %cccache.
#
#   $c    - controller, used for logging only
#   $word - word number; values above $mergedEnd are shifted down by
#           $mergedEnd before the lookup
#
# Every collocate gets a "delta" field. When the server listens on a port in
# [5000, 5600) ("German non-reference") and the server on port 5673 answers
# with more than 2000 characters of valid JSON, delta is the difference
# between the local "ld" value and the other server's "ld" for the same word;
# collocates the other server does not list are compared against a value
# slightly below the smallest "ld" it reported. In all other cases delta is 0.
sub getClassicCollocatorsCached {
    my ($c, $word) = @_;
    if ($word > $mergedEnd) {
        $word -= $mergedEnd;
    }

    # Start the remote request first so that it runs while the local
    # collocators are computed.
    my $is_german_non_reference = ($opt_p >= 5000 && $opt_p < 5600);
    my $pipe = $is_german_non_reference ? _open_reference_collocators($word) : undef;

    if (!$cccache{$word}) {
        $c->app->log->info("Getting classic collocates of $word.");
        $cccache{$word} = getClassicCollocators($word);
        # Bare nan/inf values are not valid JSON: quote them.
        $cccache{$word} =~ s/:(-?)(nan|inf)/:"${1}${2}"/g;
    }
    else {
        $c->app->log->info("Getting classic collocates for $word from cache.");
    }

    my $s2 = "";
    if ($pipe) {
        my $response = do { local $/; <$pipe> };
        $s2 = $response if defined $response;
        close($pipe);
    }

    my $d1 = decode_json($cccache{$word});

    # Short answers are treated as "no usable reference data".
    my $d2;
    if (length($s2) > 2000) {
        $d2 = eval { decode_json($s2) };
        if (ref($d2) ne 'HASH') {
            # An unparsable answer must not break the request.
            $c->app->log->warn("Ignoring unparsable reference collocates for $word.");
            $d2 = undef;
        }
    }

    if ($d2) {
        my %d2ld;
        my $minLd = 14;
        foreach my $i (@{$d2->{collocates}}) {
            $d2ld{$i->{word}} = $i->{ld};
            $minLd = $i->{ld} if ($i->{ld} < $minLd);
        }
        foreach my $i (@{$d1->{collocates}}) {
            my $w = $i->{word};
            $i->{delta} = $i->{ld} - (defined $d2ld{$w} ? $d2ld{$w} : $minLd - 0.1);
        }
    }
    else {
        foreach my $i (@{$d1->{collocates}}) {
            $i->{delta} = 0;
        }
    }
    return encode_json($d1);
}
170
# Return the similar profiles of $word, computing them with
# getSimilarProfiles() on the first request and serving them from %spcache
# afterwards. $c is the controller and is only used to log cache hits.
sub getSimilarProfilesCached {
    my ($c, $word) = @_;
    if ($spcache{$word}) {
        $c->app->log->info("Getting similar profiles for $word from cache:");
        return $spcache{$word};
    }
    $spcache{$word} = getSimilarProfiles($word);
    return $spcache{$word};
}
180
# POST /derekovecs/getVecsByRanks: pass the decoded JSON request body to
# getVecs() and render its result as JSON.
post '/derekovecs/getVecsByRanks' => sub {
    my ($self) = @_;
    my $vectors = getVecs($self->req->json);
    $self->render(json => $vectors);
};
186
# getClassicCollocators, with and without a path prefix: render the classic
# collocators of a word as JSON. The word is taken from the "w" parameter or,
# if that is missing or empty, from the JSON request body.
#
# The parameter is tested for presence rather than truth, so "w=0" is
# honoured instead of being silently replaced by the request body.
foreach my $pattern ('*/getClassicCollocators', '/getClassicCollocators') {
    any $pattern => sub {
        my $self = shift;
        my $w = $self->param("w");
        $w = $self->req->json unless defined $w && length $w;
        $self->render(data => getClassicCollocatorsCached($self, getWord($w)), format => 'json');
    };
}
196
# getBiggestVocabDistances, without and with a path prefix: render the result
# of getBiggestMergedDifferences() as JSON.
foreach my $pattern ('/getBiggestVocabDistances', '*/getBiggestVocabDistances') {
    any $pattern => sub {
        my ($self) = @_;
        $self->render(data => getBiggestMergedDifferences(), format => 'json');
    };
}
206
# getPosWiseW2VCollocators, with and without a path prefix: render the result
# of getPosWiseW2VCollocatorsAsTsv() as TSV.
#
# Parameters: w, max (default 200), cutoff (default 750000) and
# threshold (default 0.2).
# An explicitly given "threshold=0" is honoured; the previous truthiness test
# replaced it by the default. max and cutoff keep their "false means
# default" behaviour.
foreach my $pattern ('*/getPosWiseW2VCollocators', '/getPosWiseW2VCollocators') {
    any $pattern => sub {
        my $self = shift;
        my $w         = $self->param("w");
        my $max       = $self->param("max")    || 200;
        my $cutoff    = $self->param("cutoff") || 750000;
        my $threshold = $self->param("threshold");
        $threshold = 0.2 unless defined $threshold && length $threshold;
        $self->render(data => getPosWiseW2VCollocatorsAsTsv($w, $max, $cutoff, $threshold),
                      format => 'tsv');
    };
}
224
# getSimilarProfiles, with and without a path prefix: render the similar
# profiles of a word as JSON. The word is taken from the "w" parameter or, if
# that is missing or empty, from the JSON request body.
#
# The parameter is tested for presence rather than truth, so "w=0" is
# honoured instead of being silently replaced by the request body.
foreach my $pattern ('*/getSimilarProfiles', '/getSimilarProfiles') {
    any $pattern => sub {
        my $self = shift;
        my $w = $self->param("w");
        $w = $self->req->json unless defined $w && length $w;
        $self->render(data => getSimilarProfilesCached($self, getWord($w)), format => 'json');
    };
}
234
# getSimilarity, without and with a path prefix: render the result of
# cos_similarity_as_json() for the "w1" and "w2" parameters as JSON.
foreach my $pattern ('/getSimilarity', '*/getSimilarity') {
    any $pattern => sub {
        my ($self) = @_;
        my $first  = $self->param("w1");
        my $second = $self->param("w2");
        $self->render(data => cos_similarity_as_json($first, $second), format => 'json');
    };
}
248
# Static images: drop every "/derekovecs" from the request URL, log the
# result and serve it as a static file.
get '*/img/*' => sub {
    my ($controller) = @_;
    my $path = $controller->req->url;
    $path =~ s@/derekovecs@@g;
    $controller->app->log->info("GET: $path");
    $controller->reply->static($path);
};
256
# GET /: main query page.
#
# Request parameters (defaults in parentheses):
#   word        query; several words may be separated by "|"
#   n           number of neighbours (50 when a model was merged with -m,
#               otherwise 100)
#   N (2000), perplexity (20), epsilon (5), som (0)
#               only passed on to the template
#   sbf (0), sort (0), cutoff (500000), dedupe (0), nosp (0)
#               passed on to get_neighbours()
#   csv, json   select CSV or JSON output instead of the HTML template
#
# Results of get_neighbours() are kept in %cache. The key is built from every
# argument that is passed to get_neighbours(); the previous code checked,
# read and stored under three different keys, so the cache never produced
# a hit.
get '/' => sub {
    my $c = shift;
    $c->app->log->info("get: ".$c->req->url->to_abs);

    my $word                 = $c->param('word');
    my $no_nbs               = $c->param('n') || ($opt_m ? 50 : 100);
    my $no_iterations        = $c->param('N') || 2000;
    my $perplexity           = $c->param('perplexity') || 20;
    my $epsilon              = $c->param('epsilon') || 5;
    my $som                  = $c->param('som') || 0;
    my $searchBaseVocabFirst = $c->param('sbf') || 0;
    my $sort                 = $c->param('sort') || 0;
    my $csv                  = $c->param('csv') || 0;
    my $json                 = $c->param('json') || 0;
    my $cutoff               = $c->param('cutoff') || 500000;
    my $dedupe               = $c->param('dedupe') || 0;
    my $nosp                 = $c->param('nosp') || 0;
    my $res;
    my @lists;

    if (defined($word) && $word !~ /^\s*$/) {
        $c->inactivity_timeout(300);
        $word =~ s/\s+/ /g;
        # With a merged model a single word is queried twice ("w|w").
        if ($opt_m && $word !~ /\|/) {
            $word .= "|$word";
        }
        for my $w (split(' *\| *', $word)) {
            if ($opt_m) {
                # Flip the flag for every word of the query.
                $searchBaseVocabFirst = $searchBaseVocabFirst ? 0 : 1;
            }
            my $cache_key = join("\0", $w, $cutoff, $no_nbs, $sort, $dedupe, $nosp, $searchBaseVocabFirst);
            if ($cache{$cache_key}) {
                $c->app->log->info("Getting $w results from cache");
                $res = $cache{$cache_key};
            }
            else {
                $c->app->log->info('Looking for neighbours of '.$w);
                if ($opt_i) {
                    $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe, $nosp);
                }
                else {
                    $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe, $nosp);
                }
                $cache{$cache_key} = $res;
            }
            push(@lists, $res->{paradigmatic});
        }
    }

    # Normalise the separators for display; nothing to do without a query.
    $word =~ s/ *\| */ | /g if defined $word;

    # Only the result of the last queried word supplies the collocators.
    my $collocators = $res ? $res->{syntagmatic} : undef;

    if ($json) {
        return $c->render(json => {word => $word, list => \@lists, collocators => $collocators});
    }
    elsif ($csv) {
        # Missing entries are emitted as empty cells without creating them
        # inside $res, which may be shared through %cache.
        my $cell = sub {
            my ($kind, $i) = @_;
            my $entries = $res ? $res->{$kind} : undef;
            my $entry   = $entries ? $entries->[$i] : undef;
            return (defined $entry && defined $entry->{word}) ? $entry->{word} : "";
        };
        my @cells;
        # NOTE(review): the paradigmatic loop runs to $no_nbs inclusive while
        # the syntagmatic one stops before it - confirm this is intended.
        for (my $i = 0; $i <= $no_nbs; $i++) {
            push @cells, $cell->('paradigmatic', $i);
        }
        for (my $i = 0; $i < $no_nbs; $i++) {
            push @cells, $cell->('syntagmatic', $i);
        }
        return $c->render(text => join(", ", @cells) . "\n");
    }
    else {
        my $distantWords = "";
        # NOTE(review): this is true for a missing AND for a non-blank word;
        # if the list is only meant for the start page, the test should be
        # "=~" - confirm against the template before changing it.
        if (!defined($word) || $word !~ /^\s*$/) {
            $distantWords = getBiggestMergedDifferences();
        }
        $c->render(
            template             => "index",
            title                => $title,
            word                 => $word,
            distantWords         => $distantWords,
            cutoff               => $cutoff,
            no_nbs               => $no_nbs,
            no_iterations        => $no_iterations,
            epsilon              => $epsilon,
            perplexity           => $perplexity,
            show_som             => $som,
            searchBaseVocabFirst => $searchBaseVocabFirst,
            sort                 => $sort,
            training_args        => $training_args,
            mergedEnd            => $mergedEnd,
            haveSProfiles        => $have_sprofiles,
            dedupe               => $dedupe,
            marked               => \%marked,
            lists                => \@lists,
            collocators          => $collocators,
        );
    }
};
329
# Template helper: render the lowest 10 bits of $n as a window string.
# An "x" is inserted after the first five of these bits; set bits are shown
# as "+", cleared bits as a middle dot (U+00B7).
#
# The middle dot is written as an escape so that the source no longer
# depends on the file's encoding (the literal had turned into mojibake).
helper(bitvec2window => sub {
    my ($self, $n) = @_;
    my $str = unpack("B32", pack("N", $n));
    $str =~ s/^\d{22}//;         # keep the lowest 10 of the 32 bits
    $str =~ s/^(\d{5})/$1x/;
    $str =~ s/0/\x{B7}/g;
    $str =~ s/1/+/g;
    return $str;
});
339
# Serve requests with the daemon configured above (used instead of
# app->start) and exit normally once it returns.
$daemon->run();

exit 0;