w2v-server: new param: dedupe
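When the dedupe parameter is set, removes from the similars and
collocators all words that contain the target word or that are
contained in the target word. In the implementation, the comparison is
a case-insensitive substring match (strcasestr) against the words
already accepted into the result list.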
diff --git a/w2v-server.pl b/w2v-server.pl
index 1173a18..a0e9e49 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -93,6 +93,7 @@
my $sort=$c->param('sort') || 0;
my $json=$c->param('json') || 0;
my $cutoff=$c->param('cutoff') || 1000000;
+ my $dedupe=$c->param('dedupe') || 0;
my $res;
my @lists;
my @collocations;
@@ -100,15 +101,15 @@
$c->inactivity_timeout(300);
$word =~ s/\s+/ /g;
for my $w (split(' *\| *', $word)) {
- if ($cache{$w.$cutoff.$no_nbs.$sort}) {
+ if ($cache{$w.$cutoff.$no_nbs.$sort.$dedupe}) {
$c->app->log->info("Getting $w results from cache");
- $res = $cache{$w.$cutoff.$no_nbs.$sort}
+ $res = $cache{$w.$cutoff.$no_nbs.$sort.$dedupe}
} else {
$c->app->log->info('Looking for neighbours of '.$w);
if($opt_i) {
- $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff);
+ $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe);
} else {
- $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff);
+ $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst, $cutoff, $dedupe);
}
$cache{$w} = $res;
}
@@ -119,7 +120,7 @@
if($json) {
return $c->render(json => {word => $word, list => \@lists, collocators=>$res->{syntagmatic}});
} else {
- $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
+ $c->render(template=>"index", word=>$word, cutoff=>$cutoff, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, dedupe=> $dedupe, marked=>\%marked, lists=> \@lists, collocators=> $res->{syntagmatic});
}
};
@@ -592,7 +593,7 @@
}
-SV *get_neighbours(char *st1, int N, int sort_by, int search_backw, long cutoff) {
+SV *get_neighbours(char *st1, int N, int sort_by, int search_backw, long cutoff, int dedupe) {
HV *result = newHV();
float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
long long old_words;
@@ -675,11 +676,23 @@
}
}
+ char *chosen[600];
AV* array = newAV();
- int i;
+ int i, j;
int l1_words=0, l2_words=0;
for (a = 0, i = 0; i < N && a < 600; a++) {
+ int filtered=0;
long long c = besti[a];
+ if (dedupe && i > 0) {
+ for (j=0; j<i; j++)
+ if (strcasestr(&vocab[c * max_w], chosen[j]) ||
+ strcasestr(chosen[j], &vocab[c * max_w])) {
+ printf("filtering %s %s\n", chosen[j], &vocab[c * max_w]);
+ filtered = 1;
+ }
+ if(filtered)
+ continue;
+ }
if(merge_words > 0) {
if(c >= merge_words) {
if(l1_words > N / 2)
@@ -698,6 +711,7 @@
HV* hash = newHV();
SV* word = newSVpvf(&vocab[c * max_w], 0);
+ chosen[i] = &vocab[c * max_w];
if(latin_enc == 0) SvUTF8_on(word);
fflush(stdout);
hv_store(hash, "word", strlen("word"), word , 0);
@@ -780,7 +794,20 @@
}
}
array = newAV();
- for (a = 0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
+ for (a = 0, i=0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
+ long long c = besti[a];
+ if (dedupe) {
+ int filtered=0;
+ for (j=0; j<i; j++)
+ if (strcasestr(&vocab[c * max_w], chosen[j]) ||
+ strcasestr(chosen[j], &vocab[c * max_w])) {
+ printf("filtering %s %s\n", chosen[j], &vocab[c * max_w]);
+ filtered = 1;
+ }
+ if(filtered)
+ continue;
+ }
+ chosen[i++]=&vocab[c * max_w];
HV* hash = newHV();
SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
if(latin_enc == 0) SvUTF8_on(word);