w2v-server: load and serve similar profiles if present
diff --git a/w2v-server.pl b/w2v-server.pl
index a8ef559..b1e542f 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -31,6 +31,7 @@
my $mergedEnd=0;
my %cache;
my %cccache; # classic collocator cache
+my %spcache; # similar profile cache
getopts('d:Gil:p:m:n:M:');
@@ -56,6 +57,8 @@
close(FILE);
}
+my $have_sprofiles = load_sprofiles($ARGV[0]); # 1 if similar profiles were mmapped, else 0; handed to the index template as haveSProfiles
+
if($opt_m) {
$mergedEnd = mergeVectors($opt_m);
}
@@ -102,6 +105,17 @@
return $cccache{$word};
}
+# Memoising front end for getSimilarProfiles().
+#   $c    - controller, used only to reach the application log
+#   $word - key handed through to getSimilarProfiles()
+# A cache hit is logged and echoed to STDOUT; a miss is computed and stored
+# in %spcache. Either way the stored value is returned.
+sub getSimilarProfilesCached {
+  my ($c, $word) = @_;
+  my $hit = $spcache{$word};
+  if ($hit) {
+    $c->app->log->info("Getting similar profiles for $word from cache:");
+    print $hit;
+    return $hit;
+  }
+  return $spcache{$word} = getSimilarProfiles($word);
+}
+
post '/derekovecs/getVecsByRanks' => sub {
my $self = shift;
my $vec = getVecs($self->req->json);
@@ -118,6 +132,11 @@
$self->render(data => getClassicCollocatorsCached($self, $self->param("w") ? $self->param("w") : $self->req->json), format=>'json');
};
+# Serve the (cached) similar profiles as JSON. The query is the "w"
+# parameter when it is set and true, otherwise the decoded JSON request body.
+any '*/getSimilarProfiles' => sub {
+  my $self  = shift;
+  my $query = $self->param("w") || $self->req->json;
+  my $json  = getSimilarProfilesCached($self, $query);
+  $self->render(data => $json, format => 'json');
+};
+
get '*/img/*' => sub {
my $c = shift;
my $url = $c->req->url;
@@ -179,7 +198,7 @@
$csv_data .= "\n";
return $c->render(text=>$csv_data);
} else {
- $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
+ $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, haveSProfiles=> $have_sprofiles, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
}
};
@@ -258,11 +277,23 @@
float *window_sums;
} knnpars;
+typedef struct { /* one (word, value) entry of a similar profile */
+ uint32_t index; /* word number; getSimilarProfiles() uses it as row index into vocab */
+ float value; /* emitted as "v" in the JSON output; presumably a similarity score - confirm */
+} sparse_t;
+
+typedef struct { /* similar profile of one word; the .sprofiles.bin file is mmapped and read as an array of these records */
+ uint32_t len; /* number of entries of nbr in use; readers additionally cap it at 100 */
+ sparse_t nbr[100]; /* the profile entries */
+} profile_t;
+
float *M, *M2=0L, *syn1neg_window, *expTable;
float *window_sums;
char *vocab;
char *garbage = NULL;
COLLOCATORDB *cdb = NULL;
+profile_t *sprofiles = NULL; /* mmapped array of similar profiles, indexed by word number; NULL if none loaded */
+size_t sprofiles_qty = 0; /* number of profile_t records available in sprofiles */
long long words, size, merged_end;
long long merge_words = 0;
@@ -270,6 +301,39 @@
int latin_enc=0;
int window;
+/*
+ * Load precomputed similar profiles belonging to a vectors file, if present.
+ *
+ * The profile file name is derived from vecsname by cutting it at the first
+ * occurrence of ".vecs" (if any) and appending ".sprofiles.bin". The file is
+ * mapped read-only and published through the globals sprofiles and
+ * sprofiles_qty.
+ *
+ * vecsname: path of the vectors file; NULL is treated as "no profiles".
+ * Returns 1 if the profiles were mapped, 0 otherwise. A missing file is not
+ * an error: the caller simply runs without similar profiles.
+ */
+int load_sprofiles(char *vecsname) {
+  char binsprofiles_fname[256];
+  char *pos;
+  size_t base_len;
+  int name_len, fd;
+  off_t sz;
+  void *map;
+
+  if (vecsname == NULL)
+    return 0;
+
+  /* Build the name with one bounded write directly into the fixed buffer,
+     so no heap copy has to be freed and an over-long path cannot overflow. */
+  pos = strstr(vecsname, ".vecs");
+  base_len = pos ? (size_t) (pos - vecsname) : strlen(vecsname);
+  name_len = base_len < sizeof(binsprofiles_fname)
+    ? snprintf(binsprofiles_fname, sizeof(binsprofiles_fname), "%.*s.sprofiles.bin", (int) base_len, vecsname)
+    : -1;
+  if (name_len < 0 || (size_t) name_len >= sizeof(binsprofiles_fname)) {
+    fprintf(stderr, "Name of similar profiles file for %s is too long\n", vecsname);
+    return 0;
+  }
+
+  /* Open once; the same descriptor yields the size and backs the mapping. */
+  fd = open(binsprofiles_fname, O_RDONLY);
+  if (fd < 0) {
+    printf("Collocation profiles %s not found. No problem.\n", binsprofiles_fname);
+    return 0;
+  }
+
+  /* An unknown (-1) or zero size cannot be mapped; treat it like a failed mmap. */
+  sz = lseek(fd, 0, SEEK_END);
+  map = (sz > 0) ? mmap(0, (size_t) sz, PROT_READ, MAP_SHARED, fd, 0) : MAP_FAILED;
+
+  /* The mapping stays valid after the descriptor is closed, so do not keep it open. */
+  close(fd);
+
+  if (map == MAP_FAILED) {
+    fprintf(stderr, "Cannot mmap %s\n", binsprofiles_fname);
+    sprofiles = NULL;
+    return 0;
+  }
+
+  sprofiles = map;
+  sprofiles_qty = (size_t) sz / sizeof(profile_t);
+  fprintf(stderr, "Successfully mmaped %s containing similar profiles for %zu word forms.\n", binsprofiles_fname, sprofiles_qty);
+  return 1;
+}
+
+
int init_net(char *file_name, char *net_name, int latin) {
FILE *f, *binvecs, *binwords;
int binwords_fd, binvecs_fd, net_fd, i;
@@ -393,7 +457,8 @@
expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
}
window_sums = malloc(sizeof(float) * (window+1) * 2);
- return 0;
+
+ return 0;
}
long mergeVectors(char *file_name){
@@ -463,7 +528,7 @@
previous = 0;
if(strncmp("quot", w, 4) == 0) {
garbage[i]=1;
- printf("Gargabe: %s\n", vocab + i * max_w);
+// printf("Gargabe: %s\n", vocab + i * max_w);
} else {
while((c = *w++) && !garbage[i]) {
if( ((c <= 90 && c >= 65) && (previous >= 97 && previous <= 122)) ||
@@ -597,6 +662,29 @@
return result;
}
+/*
+ * Render the precomputed similar profile of word number node as a JSON array
+ * of {"w": word, "v": value} objects, terminated by a newline.
+ *
+ * node: word number, used as index into sprofiles and vocab.
+ * Returns a strdup'ed string. If node is outside the loaded profiles, a
+ * one-element "not available" array is returned; a profile without entries
+ * yields "[]".
+ * NOTE(review): who frees the returned copy is not visible here - confirm.
+ */
+char *getSimilarProfiles(long node) {
+  char buffer[120000];
+  size_t used = 0; /* bytes written so far, excluding the terminating NUL */
+  uint32_t i, n;
+
+  /* Explicit lower bound instead of relying on signed-to-unsigned conversion. */
+  if(node < 0 || (size_t) node >= sprofiles_qty) {
+    printf("Not available in precomputed profile\n");
+    return(strdup("[{\"w\":\"not available\", \"v\":0}]\n"));
+  }
+
+  printf("******* %s ******\n", &vocab[max_w * node]);
+
+  n = sprofiles[node].len;
+  if(n > 100)
+    n = 100; /* nbr[] holds at most 100 entries */
+
+  buffer[used++] = '[';
+  for(i = 0; i < n; i++) {
+    const char *w = &vocab[max_w * (sprofiles[node].nbr[i].index)];
+    char escaped[2048];
+    size_t e = 0;
+    size_t room = sizeof(buffer) - used - 2; /* keep space for the closing "]\n" */
+    int written;
+
+    /* Escape quote and backslash so that such words cannot break the JSON. */
+    for(; *w && e + 2 < sizeof(escaped); w++) {
+      if(*w == '"' || *w == '\\')
+        escaped[e++] = '\\';
+      escaped[e++] = *w;
+    }
+    escaped[e] = 0;
+
+    /* The separator goes in front of every entry but the first, so there is
+       no trailing comma to patch afterwards and an empty profile stays "[]". */
+    written = snprintf(buffer + used, room, "%s{\"w\":\"%s\", \"v\":%f}", i ? "," : "", escaped, sprofiles[node].nbr[i].value);
+    if(written < 0 || (size_t) written >= room)
+      break; /* entry did not fit: drop it and close the array */
+    used += (size_t) written;
+  }
+  buffer[used++] = ']';
+  buffer[used++] = '\n';
+  buffer[used] = 0;
+
+  /* Never pass data as a format string: words may contain '%'. */
+  fputs(buffer, stdout);
+  return(strdup(buffer));
+}
+
+
char *getClassicCollocators(long node) {
char *res = (cdb? strdup(get_collocators_as_json(cdb, node)) : "[]");
return res;