w2v-server: fix/improve garbage filter
diff --git a/w2v-server.pl b/w2v-server.pl
index b7c1f55..127c779 100755
--- a/w2v-server.pl
+++ b/w2v-server.pl
@@ -327,17 +327,17 @@
void filter_garbage() {
long i;
- char *w, previous, c;
+ unsigned char *w, previous, c;
garbage = malloc(words);
memset(garbage, 0, words);
for (i = 0; i < words; i++) {
w = vocab + i * max_w;
previous = 0;
- while((c=*w++) && !garbage[i]) {
- if( ((c & 32) == 0 && (previous & 32) == 32) ||
- previous == '-' && (c & 32) ||
- c == ''
- ) {
+ while((c = *w++) && !garbage[i]) {
+ if( ((c <= 90 && c >= 65) && (previous >= 97 && previous <= 122)) ||
+ (previous == '-' && (c & 32)) ||
+ (previous == 0xc2 && (c == 0xa4 || c == 0xb6 ))
+ ) {
garbage[i]=1;
continue;
}