w2v-server: add option -m to merge secondary models
diff --git a/w2v-server.pl b/w2v-server.pl
index df47adc..a255b2f 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -1,7 +1,7 @@
#!/usr/local/bin/perl
-#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -g";
use Inline C;
-use Inline C => Config => CLEAN_AFTER_BUILD => 0; #, ccflags => $Config{ccflags}." -g";
+#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
+use Config; # %Config must be imported before the next statement, which reads it at compile time
+use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -O4";
use Mojolicious::Lite;
use Mojo::JSON qw(decode_json encode_json to_json);
use Encode qw(decode encode);
@@ -12,12 +12,14 @@
our $opt_i = 0; # latin1-input?
our $opt_l = undef;
our $opt_p = 5676;
+our $opt_m; # -m: secondary model, handed to mergeVectors()
our $opt_n = '';
our $opt_d;
my $training_args="";
+my $mergedEnd=0; # value returned by mergeVectors(); stays 0 when nothing was merged
-getopt('d:il:p:n:');
+# Keep 'n:' in the list: $opt_n is still declared above, and dropping the switch
+# would stop -n from consuming its argument.
+# NOTE(review): if this is Getopt::Std::getopt, every listed character (including
+# 'i' and ':') is treated as an argument-taking switch; getopts() may have been
+# intended -- confirm before changing.
+getopt('d:il:p:m:n:');
# -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
if(!$ARGV[0]) {
@@ -30,6 +32,10 @@
close(FILE);
}
+# -m: merge the given secondary model and keep the value mergeVectors() returns
+$mergedEnd = mergeVectors($opt_m) if $opt_m;
+
if($opt_d) { # -d: dump vecs and exit
dump_vecs($opt_d);
exit;
@@ -48,6 +54,7 @@
my $perplexity=$c->param('perplexity') || 20;
my $epsilon=$c->param('epsilon') || 5;
my $som=$c->param('som') || 0;
+ my $searchBaseVocabFirst=$c->param('sbf') || 0;
my $sort=$c->param('sort') || 0;
my $res;
my @lists;
@@ -58,15 +65,15 @@
for my $w (split(' *\| *', $word)) {
$c->app->log->debug('Looking for neighbours of '.$w);
if($opt_i) {
- $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort);
+ $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst);
} else {
- $res = get_neighbours($w, $no_nbs, $sort);
+ $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst);
}
push(@lists, $res->{paradigmatic});
}
}
$word =~ s/ *\| */ | /g;
- $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, sort=>$sort, training_args=>$training_args, lists=> \@lists, collocators=> $res->{syntagmatic});
+ $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, lists=> \@lists, collocators=> $res->{syntagmatic});
};
$daemon->run; # app->start;
@@ -123,7 +130,7 @@
float *M, *M2=0L, *syn1neg_window, *expTable;
char *vocab;
-long long words, size;
+long long words, size, merged_end;
int num_threads=20;
int latin_enc=0;
int window;
@@ -241,6 +248,62 @@
return 0;
}
+/*
+ * Reads exactly `count` bytes from `fd` into `buf`, repeating read() as needed
+ * because a single call may deliver fewer bytes than requested.
+ * Returns 0 on success, -1 on a read error or premature end of file.
+ */
+static int readFully(int fd, void *buf, size_t count) {
+  char *dst = (char *) buf;
+  while (count > 0) {
+    ssize_t got = read(fd, dst, count);
+    if (got <= 0)
+      return -1;
+    dst += got;
+    count -= (size_t) got;
+  }
+  return 0;
+}
+
+/*
+ * Merges a secondary model into the model currently held in M / vocab.
+ *
+ * file_name: path of the secondary model. Its header supplies the number of
+ *            words and the vector size; the data itself is read from the
+ *            companion files <file_name>.vecs and <file_name>.words.
+ *
+ * The secondary entries are stored first and the current entries are appended
+ * after them, so every index below the returned value refers to the merged
+ * vocabulary. Updates the globals M, vocab, words and merged_end.
+ *
+ * Returns the number of merged words (merged_end). Terminates the process on
+ * any error.
+ */
+long mergeVectors(char *file_name){
+  FILE *f;
+  int binwords_fd, binvecs_fd;
+  float *merge_vecs;
+  char *merge_vocab;
+  long long merge_words, merge_size;
+  char binvecs_fname[256], binwords_fname[256];
+
+  /* Build the companion file names without overflowing the fixed-size buffers. */
+  if (snprintf(binwords_fname, sizeof(binwords_fname), "%s.words", file_name) >= (int) sizeof(binwords_fname) ||
+      snprintf(binvecs_fname, sizeof(binvecs_fname), "%s.vecs", file_name) >= (int) sizeof(binvecs_fname)) {
+    fprintf(stderr, "File name %s is too long\n", file_name);
+    exit(-1);
+  }
+
+  f = fopen(file_name, "rb");
+  if (f == NULL) {
+    printf("Input file %s not found\n", file_name);
+    exit(-1);
+  }
+  /* Only the two header numbers are taken from this file. */
+  if (fscanf(f, "%lld", &merge_words) != 1 || fscanf(f, "%lld", &merge_size) != 1 || merge_words <= 0) {
+    fprintf(stderr, "Cannot read header of %s\n", file_name);
+    exit(-1);
+  }
+  fclose(f);
+  if(merge_size != size){
+    fprintf(stderr, "vectors must have the same length\n");
+    exit(-1);
+  }
+
+  if( (binvecs_fd = open(binvecs_fname, O_RDONLY)) < 0 || (binwords_fd = open(binwords_fname, O_RDONLY)) < 0) {
+    fprintf(stderr, "Cannot open %s or %s\n", binwords_fname, binvecs_fname);
+    exit(-1);
+  }
+
+  /* Room for both models: merged entries first, current entries behind them. */
+  merge_vecs = malloc(sizeof(float) * (words + merge_words) * size);
+  merge_vocab = malloc(sizeof(char) * (words + merge_words) * max_w);
+  if (merge_vecs == NULL || merge_vocab == NULL) {
+    close(binvecs_fd);
+    close(binwords_fd);
+    fprintf(stderr, "Cannot reserve memory for %s or %s\n", binwords_fname, binvecs_fname);
+    exit(-1);
+  }
+  if (readFully(binvecs_fd, merge_vecs, (size_t) merge_words * size * sizeof(float)) != 0 ||
+      readFully(binwords_fd, merge_vocab, (size_t) merge_words * max_w) != 0) {
+    close(binvecs_fd);
+    close(binwords_fd);
+    fprintf(stderr, "Cannot read %s or %s\n", binwords_fname, binvecs_fname);
+    exit(-1);
+  }
+  close(binvecs_fd);
+  close(binwords_fd);
+
+  printf("Successfully reallocated memory\nMerging...\n");
+  fflush(stdout);
+  memcpy(merge_vecs + merge_words * size, M, words * size * sizeof(float));
+  memcpy(merge_vocab + merge_words * max_w, vocab, words * max_w);
+  /* NOTE(review): assumes M and vocab were obtained via mmap() -- confirm against the loader. */
+  munmap(M, words * size * sizeof(float));
+  munmap(vocab, words * max_w);
+  M = merge_vecs;
+  vocab = merge_vocab;
+  merged_end = merge_words;
+  words += merge_words;
+  printf("merged_end: %lld, words: %lld\n", merged_end, words);
+  return((long) merged_end);
+}
+
void *getCollocators(knnpars *pars) {
int N = pars->N;
int cc = pars->wl->wordi[0];
@@ -344,11 +407,12 @@
pthread_exit(nbs);
}
-wordlist *getTargetWords(char *st1) {
+wordlist *getTargetWords(char *st1, int search_backw) {
wordlist *wl = malloc(sizeof(wordlist));
char st[100][max_size], sep[100];
long a, b=0, c=0, cn=0;
-
+ int unmerged;
+
while (1) {
st[cn][b] = st1[c];
b++;
@@ -363,7 +427,11 @@
}
cn++;
for (a = 0; a < cn; a++) {
- for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
+ if(search_backw) {
+ for (b = words - 1; b >= 0; b--) if (!strcmp(&vocab[b * max_w], st[a])) break;
+ } else {
+ for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
+ }
if (b == words) b = -1;
wl->wordi[a] = b;
fprintf(stderr, "Word: \"%s\" Position in vocabulary: %lld\n", st[a], wl->wordi[a]);
@@ -448,7 +516,7 @@
}
-SV *get_neighbours(char *st1, int N, int sort_by) {
+SV *get_neighbours(char *st1, int N, int sort_by, int search_backw) {
HV *result = newHV();
float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
long besti[MAX_NEIGHBOURS], bestp[MAX_NEIGHBOURS], a, b, c, d, slice;
@@ -464,7 +532,7 @@
slice = words / para_threads;
- wl = getTargetWords(st1);
+ wl = getTargetWords(st1, search_backw);
if(wl->length < 1)
goto end;
@@ -674,12 +742,27 @@
.ui-tooltip-content {
font-size: 9pt;
- colour: #222222;
+ color: #222222;
}
svg > .ui-tooltip-content {
font-size: 8pt;
- colour: #222222;
+ color: #222222;
+}
+
+a.merged {
+ color: green;
+ fill: green;
+}
+
+a.marked {
+ color: orange;
+ fill: orange;
+}
+
+a.target {
+ color: red;
+ fill: red;
}
#collocators {
@@ -790,19 +873,29 @@
g.append("a")
.attr("xlink:href", function(word) {return "/?word="+word;})
+ .attr("class", function(d, i) {
+ if(data.target.indexOf(" "+d+" ") >= 0) {
+ return "target";
+ } else if(data.mergedEnd > 0 && data.ranks[i] < data.mergedEnd) {
+ return "merged";
+ } else {
+ return ""
+ }
+ })
.attr("title", function(d, i) {
- return "rank: "+i +" "+"freq. rank: "+data.ranks[i].toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
+      // Indices below data.mergedEnd are styled as merged vocabulary; the remaining
+      // entries are offset by data.mergedEnd, so subtract it to show the same
+      // frequency rank as the result table does.
+      var isMerged = data.mergedEnd > 0 && data.ranks[i] < data.mergedEnd;
+      var freqRank = isMerged ? data.ranks[i] : data.ranks[i] - data.mergedEnd;
+      return "rank: "+i +" "+"freq. rank: "+freqRank.toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",") + (isMerged ? " (merged vocab)" : "");
})
.append("text")
.attr("text-anchor", "top")
.attr("font-size", 12)
- .attr("fill", function(d) {
- if(data.target.indexOf(" "+d+" ") >= 0) {
- return "red";
- } else {
- return "#333"
- }
- })
.text(function(d) { return d; });
var zoomListener = d3.behavior.zoom()
@@ -915,6 +1008,9 @@
<form action="<%=url_for('/')->to_abs%>" method="GET">
word(s):
<input type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
+ % if($mergedEnd > 0) {
+ backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked, base vocabulary will be searched first. Otherwise merged vocabulary will be searched first.">
+ % }
max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
@@ -956,9 +1052,15 @@
<%= sprintf("%.3f", $item->{dist}) %>
</td>
<td>
- <a title="freq. rank: <%= $item->{rank} %>" href="/?word=<%= $item->{word} %>">
- <%= $item->{word} %>
- </a>
+ % my $class = "";
+ % my $r = $item->{rank};
+ % # Ranks below $mergedEnd are flagged as merged vocabulary; every other rank
+ % # (including the one equal to $mergedEnd) is shifted back by $mergedEnd.
+ % if($r < $mergedEnd) {
+ % $class="merged";
+ % $r .= " (merged vocab)";
+ % } elsif($mergedEnd!=0 && $r >= $mergedEnd) {
+ % $r -= $mergedEnd;
+ % }
+ <a class="<%= $class %>" title="freq. rank: <%= $r %>" href="/?word=<%= $item->{word} %>"><%= $item->{word} %></a>
</td>
% } else {
<td colspan="2"/>
@@ -990,7 +1092,7 @@
<script>
% use Mojo::ByteStream 'b';
$(window).load(function() {
- showMap(<%= b(Mojo::JSON::to_json({target => " $word ", words => \@words, vecs => \@vecs, ranks => \@ranks})); %>);
+ showMap(<%= b(Mojo::JSON::to_json({target => " $word ", mergedEnd=> $mergedEnd, words => \@words, vecs => \@vecs, ranks => \@ranks})); %>);
});
</script>
% }