w2v-server: add cutoff parameter (consider only the n most frequent forms)
WARNING: this might break mapped multi-lingual vector-spaces
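
Example: the new parameter travels as a plain GET query parameter, so the
restricted search can be tried directly against a running instance. Below is
a minimal sketch with Mojo::UserAgent; the host/port, the route, the example
word and the dump of the result structure are assumptions for illustration,
not part of this patch:

    #!/usr/bin/perl
    use strict;
    use warnings;
    use Mojo::UserAgent;
    use Data::Dumper;

    # Ask for the 10 nearest neighbours of "Haus", but only among the
    # 100000 most frequent word forms; json=1 selects the JSON rendering.
    my $ua  = Mojo::UserAgent->new;
    my $url = 'http://localhost:3000/?word=Haus&n=10&cutoff=100000&json=1';
    my $res = $ua->get($url)->result;
    die 'request failed: ' . $res->code unless $res->is_success;

    my $data = $res->json;
    print Dumper($data->{list});         # nearest-neighbour lists
    print Dumper($data->{collocators});  # syntagmatic collocators

The cutoff works as an index limit: word2vec vocabularies are stored sorted
by descending frequency, so searching only the first cutoff entries amounts
to searching the cutoff most frequent forms. The warning above presumably
follows from this: a merged, mapped multi-lingual vocabulary is no longer a
single frequency-ordered list, so an index cutoff can cut into the mapped
part of the space.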
diff --git a/templates/index.html.ep b/templates/index.html.ep
index 7270997..d42a282 100644
--- a/templates/index.html.ep
+++ b/templates/index.html.ep
@@ -310,11 +310,13 @@
<form method="GET">
word(s):
<input id="word" type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
+ cut-off:
+ <input id="cutoff" type="text" name="cutoff" size="10" value="<%= $cutoff %>" title="Only consider the most frequent x word forms.">
% if($mergedEnd > 0) {
backw. <input type="checkbox" name="sbf" value="1" <%= ($searchBaseVocabFirst ? "checked" : "") %> title="If checked, the base vocabulary will be searched first. Otherwise the merged vocabulary will be searched first.">
% }
- max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
- max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
+ max. neighbours: <input type="text" size="4" name="n" value="<%= $no_nbs %>">
+ max. iterations: <input type="text" name="N" size="4" value="<%= $no_iterations %>">
SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
% if($collocators) {
<span> </span>sort collocators by
diff --git a/w2v-server.pl b/w2v-server.pl
index c864774..402a0d5 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -92,6 +92,7 @@
my $searchBaseVocabFirst=$c->param('sbf') || 0;
my $sort=$c->param('sort') || 0;
my $json=$c->param('json') || 0;
+ my $cutoff=$c->param('cutoff') || 1000000;
my $res;
my @lists;
my @collocations;
@@ -105,9 +106,9 @@
} else {
$c->app->log->info('Looking for neighbours of '.$w);
if($opt_i) {
- $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst);
+ $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff);
} else {
- $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst);
+ $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff);
}
$cache{$w} = $res;
}
@@ -118,7 +119,7 @@
if($json) {
return $c->render(json => {word => $word, list => \@lists, collocators=>$res->{syntagmatic}});
} else {
- $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
+ $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
}
};
@@ -165,6 +166,7 @@
} wordlist;
typedef struct {
+ long cutoff;
wordlist *wl;
char *token;
int N;
@@ -388,7 +390,7 @@
if(M2 == NULL || cc == -1)
return NULL;
- a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
+ a = posix_memalign((void **) &target_sums, 128, pars->cutoff * sizeof(float));
besti = malloc(N * sizeof(long long));
bestp = malloc(N * sizeof(long long));
bestf = malloc(N * sizeof(float));
@@ -396,7 +398,7 @@
worstbest = MIN_RESP;
- for (b = 0; b < words; b++)
+ for (b = 0; b < pars->cutoff; b++)
target_sums[b]=0;
for (b = 0; b < N; b++) {
besti[b] = -1;
@@ -418,7 +420,8 @@
window_offset = a * size;
if (a > window)
window_offset -= size;
- for(target = 0; target < words; target ++) {
+ for(target = 0; target < pars->cutoff; target ++) {
+ if(garbage && garbage[target]) continue;
if(target == d)
continue;
f = 0;
@@ -463,7 +466,7 @@
}
}
- for (b = 0; b < words; b++)
+ for (b = 0; b < pars->cutoff; b++)
pars->target_sums[b] += (target_sums[b] / wpos_sum ) / (window * 2);
free(target_sums);
for(b=0; b<N && besti[b] >= 0; b++) // THIS LOOP IS NEEDED (b...)
@@ -588,7 +591,7 @@
}
-SV *get_neighbours(char *st1, int N, int sort_by, int search_backw) {
+SV *get_neighbours(char *st1, int N, int sort_by, int search_backw, long cutoff) {
HV *result = newHV();
float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
long long old_words;
@@ -603,32 +606,36 @@
if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
-
+ if(cutoff < 1)
+ cutoff=words;
+
wl = getTargetWords(st1, search_backw);
if(wl->length < 1)
goto end;
- old_words = words;
+ old_words = cutoff;
if(merge_words > 0)
- words = merge_words * 1.25; /* HACK */
- slice = words / para_threads;
+ cutoff = merge_words * 1.25; /* HACK */
+ slice = cutoff / para_threads;
- a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
- for(a = 0; a < words; a++)
+ a = posix_memalign((void **) &target_sums, 128, cutoff * sizeof(float));
+ for(a = 0; a < cutoff; a++)
target_sums[a] = 0;
printf("Starting %d threads\n", para_threads);
fflush(stdout);
for(a=0; a < para_threads; a++) {
+ pars[a].cutoff = cutoff;
pars[a].token = st1;
pars[a].wl = wl;
pars[a].N = N;
pars[a].from = a*slice;
- pars[a].upto = ((a+1)*slice > words? words:(a+1)*slice);
+ pars[a].upto = ((a+1)*slice > cutoff? cutoff:(a+1)*slice);
pthread_create(&pt[a], NULL, _get_neighbours, (void *) &pars[a]);
}
if(M2) {
for(a=0; a < syn_threads; a++) {
+ pars[a + para_threads].cutoff = cutoff;
pars[a + para_threads].target_sums = target_sums;
pars[a + para_threads].wl = wl;
pars[a + para_threads].N = N;