w2v-server: add option to filter out garbage: -G
diff --git a/w2v-server.pl b/w2v-server.pl
index 381e81b..6f39376 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -16,12 +16,13 @@
 our $opt_M;
 our $opt_n = '';
 our $opt_d;
+our $opt_G;  # set when -G is given on the command line; enables the garbage filter below
 
 my %marked;
 my $training_args="";
 my $mergedEnd=0;
 
-getopt('d:il:p:m:M:'); 
+getopts('d:Gil:p:m:M:');
 
 if($opt_M) {
   open my $handle, '<:encoding(UTF-8)', $opt_M
@@ -60,6 +61,11 @@
     listen => ['http://'.($opt_l ? $opt_l : '*').":$opt_p"]
 );
 
+if($opt_G) {  # -G given: flag garbage vocabulary entries once, before the routes are defined
+  print "Filtering garbage\n";  # notice on STDOUT
+  filter_garbage();  # C routine defined further down in this file; fills the per-word flag array
+}
+
 get '/' => sub {
   my $c    = shift;
 	my $word=$c->param('word');
@@ -143,6 +149,7 @@
 
 float *M, *M2=0L, *syn1neg_window, *expTable;
 char *vocab;
+char *garbage = NULL; /* one flag byte per vocab entry, filled by filter_garbage(); NULL = no filtering */
 
 long long words, size, merged_end;
 int num_threads=20;
@@ -318,6 +325,28 @@
   return((long) merged_end);
 }
 
+/* Fill `garbage` with one flag per vocab entry (1 = skipped by the `garbage[c]` check):
+ * ASCII lower->upper case flip, '-' followed by a byte with bit 0x20 set, or U+0192. */
+void filter_garbage() {
+  long long i;                      /* same width as the global `words` */
+  char *w, previous, c;
+  free(garbage);                    /* no-op on NULL; avoids a leak on a repeated call */
+  garbage = calloc((size_t) words, 1);
+  if (!garbage) return;             /* allocation failed: leave filtering disabled */
+  for (i = 0; i < words; i++) {
+    previous = 0;
+    for (w = vocab + i * max_w; (c = *w++) != 0; previous = c) {
+      /* Explicit ASCII ranges: a bare `& 32` test also fires on UTF-8 lead bytes. */
+      if ((previous >= 'a' && previous <= 'z' && c >= 'A' && c <= 'Z') ||
+          (previous == '-' && (c & 32)) ||
+          (previous == (char) 0xC6 && c == (char) 0x92)) { /* UTF-8 bytes of U+0192 */
+        garbage[i] = 1;
+        break;
+      }
+    }
+  }
+}
+
 void *getCollocators(knnpars *pars) {
 	int N = pars->N;
   int cc = pars->wl->wordi[0];
@@ -501,6 +530,7 @@
 	for (a = 0; a < size; a++) vec[a] /= len;
 	for (a = 0; a < N; a++) bestd[a] = -1;
 	for (c = from; c < upto; c++) {
+    if(garbage && garbage[c]) continue;
 		a = 0;
 // do not skip taget word
 //		for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
@@ -773,7 +803,7 @@
   text-decoration: none;
 }
 
-a.marked {
+a.marked, #first a.marked {
   text-decoration: underline;
 }