w2v-server: add option to filter out garbage: -G
diff --git a/w2v-server.pl b/w2v-server.pl
index 381e81b..6f39376 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -16,12 +16,13 @@
our $opt_M;
our $opt_n = '';
our $opt_d;
+our $opt_G; # -G: boolean switch, filter garbage-looking vocabulary entries (see filter_garbage)
my %marked;
my $training_args="";
my $mergedEnd=0;
-getopt('d:il:p:m:M:');
+getopts('d:Gil:p:m:M:'); # getopts: only letters followed by ':' take a value; G and i are plain flags
if($opt_M) {
open my $handle, '<:encoding(UTF-8)', $opt_M
@@ -60,6 +61,11 @@
listen => ['http://'.($opt_l ? $opt_l : '*').":$opt_p"]
);
+if($opt_G) { # -G given: build the garbage table once, before the routes are set up
+ print "Filtering garbage\n";
+ filter_garbage(); # C function in this file; fills garbage[], which the search loop checks to skip entries
+}
+
get '/' => sub {
my $c = shift;
my $word=$c->param('word');
@@ -143,6 +149,7 @@
float *M, *M2=0L, *syn1neg_window, *expTable;
char *vocab;
+char *garbage = NULL;
long long words, size, merged_end;
int num_threads=20;
@@ -318,6 +325,28 @@
return((long) merged_end);
}
+/* -G support: set garbage[i] = 1 for every vocabulary entry that looks like garbage.
+ * NOTE(review): bit 5 separates case only for single-byte letters - confirm for UTF-8. */
+void filter_garbage() {
+  long long i;                   /* same type as the global `words` */
+  char *w, previous, c;
+  free(garbage);                 /* avoid a leak if this is called more than once */
+  garbage = calloc(words, 1);    /* zero-filled: no entry is flagged yet */
+  if (garbage == NULL) return;   /* out of memory: leave filtering switched off */
+  for (i = 0; i < words; i++) {
+    previous = 0;                /* the first byte has no predecessor */
+    for (w = vocab + i * max_w; (c = *w) != 0; w++) {
+      if (((c & 32) == 0 && (previous & 32) != 0) ||  /* bit 5 set, then clear: "aB" */
+          (previous == '-' && (c & 32) != 0) ||       /* '-', then a byte with bit 5 */
+          c == '') {  /* NOTE(review): empty char constant - literal apparently lost */
+        garbage[i] = 1;
+        break;                   /* one hit is enough to flag the entry */
+      }
+      previous = c;
+    }
+  }
+}
+
void *getCollocators(knnpars *pars) {
int N = pars->N;
int cc = pars->wl->wordi[0];
@@ -501,6 +530,7 @@
for (a = 0; a < size; a++) vec[a] /= len;
for (a = 0; a < N; a++) bestd[a] = -1;
for (c = from; c < upto; c++) {
+ if(garbage && garbage[c]) continue;
a = 0;
// do not skip taget word
// for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
@@ -773,7 +803,7 @@
text-decoration: none;
}
-a.marked {
+a.marked, #first a.marked {
text-decoration: underline;
}