w2v-server.pl: UTF-8 is now the default encoding
* use the -i option for latin1 (ISO-8859-1) models
* new command line syntax: perl w2v-server.pl [-i] [-l host] [-p port] [model-file]
e.g. perl w2v-server.pl -p 5676 models/wpdro.vecs
diff --git a/w2v-server.pl b/w2v-server.pl
index 2d1b874..b515c6b 100644
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -3,17 +3,29 @@
use Mojolicious::Lite;
use Mojo::JSON qw(decode_json encode_json to_json);
use Encode qw(decode encode);
+use Getopt::Std;
use Mojo::Server::Daemon;
plugin 'Log::Access';
+our $opt_i = 0; # latin1-input?
+our $opt_l = undef; # listen address; '*' is used when unset
+our $opt_p = 5676; # listen port
+
+getopts('il:p:'); # getopts, not getopt: -i is a plain flag, -l and -p take a value
+
print STDERR $ARGV[1];
# -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
-if(!$ARGV[1]) {
- init_net("vectors15.bin");
+if(!$ARGV[0]) {
+ init_net("vectors15.bin", ($opt_i? 1 : 0));
} else {
- init_net($ARGV[1]);
+ init_net($ARGV[0], ($opt_i? 1 : 0));
}
+my $daemon = Mojo::Server::Daemon->new(
+ app => app,
+ listen => ['http://'.($opt_l ? $opt_l : '*').":$opt_p"]
+);
+
get '/' => sub {
my $c = shift;
my $word=$c->param('word');
@@ -29,14 +41,18 @@
$word =~ s/\s+/ /g;
for my $w (split(' *\| *', $word)) {
$c->app->log->debug('Looking for neighbours of '.$w);
- push(@lists, get_neighbours(encode("iso-8859-1", $w), $no_nbs));
+ if($opt_i) {
+ push(@lists, get_neighbours(encode("iso-8859-1", $w), $no_nbs));
+ } else {
+ push(@lists, get_neighbours($w, $no_nbs));
+ }
}
}
$word =~ s/ *\| */ | /g;
$c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, lists=> \@lists);
};
-app->start;
+$daemon->run; # app->start;
exit;
@@ -78,8 +94,9 @@
char *vocab;
long long words, size;
int num_threads=20;
+int latin_enc=0;
-int init_net(char *file_name) {
+int init_net(char *file_name, int latin) {
FILE *f, *binvecs, *binwords;
int binwords_fd, binvecs_fd;
long long a, b, c, d, cn;
@@ -91,6 +108,7 @@
strcpy(binvecs_fname, file_name);
strcat(binvecs_fname, ".vecs");
+ latin_enc = latin;
f = fopen(file_name, "rb");
if (f == NULL) {
printf("Input file %s not found\n", file_name);
@@ -292,7 +310,9 @@
for (a = 0; a < N; a++) {
strcpy(bestw[a], &vocab[besti[a] * max_w]);
HV* hash = newHV();
- hv_store(hash, "word", strlen("word"), newSVpvf(bestw[a], 0), 0);
+ SV* word = newSVpv(bestw[a], 0); /* copy verbatim (len 0 = use strlen); newSVpvf would treat the word as a printf format */
+ if(latin_enc == 0) SvUTF8_on(word); /* non-latin1 model: flag the bytes as UTF-8 */
+ hv_store(hash, "word", strlen("word"), word , 0);
hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
hv_store(hash, "rank", strlen("rank"), newSVuv(besti[a]), 0);
AV *vector = newAV();