| #!/usr/local/bin/perl |
| use Inline C; |
| use Mojolicious::Lite; |
| |
# Load the model once, before any request is served.
# The path is read from the second command-line argument; presumably the
# first one is the Mojolicious command (e.g. "daemon") -- confirm against
# how the script is launched.
my $model_file   = $ARGV[1];
my $model_loaded = 0;
if (defined $model_file && length $model_file) {
    # init_net() is the Inline C loader defined after __C__; it returns 0 on
    # success and -1 on failure.
    $model_loaded = init_net($model_file) == 0 ? 1 : 0;
}

# Warn instead of dying so that commands which need no model (routes, help,
# version, ...) keep working.
warn "word2vec model not loaded; usage: $0 <command> <model-file>\n"
    unless $model_loaded;

# Template helper: returns the HTML fragment describing the nearest
# neighbours of the query, or an empty string when there is no query.
helper print_neighbours => sub {
    my ($c, $word) = @_;

    # No query yet (first page load) or blank input: render nothing and,
    # importantly, never hand undef/empty strings to the C code.
    return '' unless defined $word;

    # The C tokenizer only understands single spaces; collapse every run of
    # whitespace and trim both ends.
    my $query = join ' ', split ' ', $word;
    return '' unless length $query;

    return '<pre>Model not loaded</pre>' unless $model_loaded;

    # The C side compares raw bytes against the vocabulary, so pass bytes in
    # and turn the returned bytes back into characters; otherwise the
    # renderer would encode them a second time.
    # NOTE(review): assumes the model vocabulary is UTF-8 -- confirm. If it
    # is not, utf8::decode leaves the string untouched.
    utf8::encode($query);
    my $html = print_neighbours($query);
    return '' unless defined $html;
    utf8::decode($html);

    return $html;
};

# Single page: shows the search form and, when ?word=... is given, the
# neighbour table produced by the helper above.
get '/' => sub {
    my $c    = shift;
    my $word = $c->param('word') // '';
    $c->render(template => 'index', word => $word);
};

app->start;

exit;
| |
| __END__ |
| |
| __C__ |
| #include <stdio.h> |
| #include <string.h> |
| #include <math.h> |
| #include <malloc.h> |
| #include <stdlib.h> //strlen |
| |
| #define max_size 2000 |
| #define max_w 50 |
| #define N 75 |
| |
| //the thread function |
| void *connection_handler(void *); |
| |
| char *bestw[N]; |
| char file_name[max_size], st[100][max_size]; |
| float dist, len, bestd[N], vec[max_size]; |
| long long words, size, a, b, c, d, cn, bi[100]; |
| char ch; |
| float *M; |
| char *vocab; |
| char *stringBuffer; |
| |
| int init_net(char *file_name) { |
| FILE *f; |
| |
| stringBuffer = malloc(64000); |
| f = fopen(file_name, "rb"); |
| if (f == NULL) { |
| printf("Input file %s not found\n", file_name); |
| return -1; |
| } |
| fscanf(f, "%lld", &words); |
| fscanf(f, "%lld", &size); |
| vocab = (char *)malloc((long long)words * max_w * sizeof(char)); |
| for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); |
| M = (float *)malloc((long long)words * (long long)size * sizeof(float)); |
| if (M == NULL) { |
| printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); |
| return -1; |
| } |
| for (b = 0; b < words; b++) { |
| a = 0; |
| while (1) { |
| vocab[b * max_w + a] = fgetc(f); |
| if (feof(f) || (vocab[b * max_w + a] == ' ')) break; |
| if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; |
| } |
| vocab[b * max_w + a] = 0; |
| for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); |
| len = 0; |
| for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; |
| len = sqrt(len); |
| for (a = 0; a < size; a++) M[a + b * size] /= len; |
| } |
| fclose(f); |
| return 0; |
| } |
| |
| char *print_neighbours(char *st1) { |
| FILE *out=stdout; |
| *stringBuffer=0; |
| |
| for (a = 0; a < N; a++) bestd[a] = 0; |
| for (a = 0; a < N; a++) bestw[a][0] = 0; |
| a = 0; |
| cn = 0; |
| b = 0; |
| c = 0; |
| while (1) { |
| st[cn][b] = st1[c]; |
| b++; |
| c++; |
| st[cn][b] = 0; |
| if (st1[c] == 0) break; |
| if (st1[c] == ' ') { |
| cn++; |
| b = 0; |
| c++; |
| } |
| } |
| cn++; |
| for (a = 0; a < cn; a++) { |
| for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; |
| if (b == words) b = -1; |
| bi[a] = b; |
| sprintf(stringBuffer, "\n<pre>Word: \"%s\" Position in vocabulary: %lld</pre>\n", st[a], bi[a]); |
| if (b == -1) { |
| sprintf(stringBuffer+strlen(stringBuffer), "Out of dictionary word!\n"); |
| break; |
| } |
| } |
| if (b == -1) return stringBuffer; |
| sprintf(stringBuffer+strlen(stringBuffer), "\n<table><tr><th>Word</th><th>Cosine distance</th></tr>\n"); |
| for (a = 0; a < size; a++) vec[a] = 0; |
| for (b = 0; b < cn; b++) { |
| if (bi[b] == -1) continue; |
| for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; |
| } |
| len = 0; |
| for (a = 0; a < size; a++) len += vec[a] * vec[a]; |
| len = sqrt(len); |
| for (a = 0; a < size; a++) vec[a] /= len; |
| for (a = 0; a < N; a++) bestd[a] = -1; |
| for (a = 0; a < N; a++) bestw[a][0] = 0; |
| for (c = 0; c < words; c++) { |
| a = 0; |
| for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; |
| if (a == 1) continue; |
| dist = 0; |
| for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; |
| for (a = 0; a < N; a++) { |
| if (dist > bestd[a]) { |
| for (d = N - 1; d > a; d--) { |
| bestd[d] = bestd[d - 1]; |
| strcpy(bestw[d], bestw[d - 1]); |
| } |
| bestd[a] = dist; |
| strcpy(bestw[a], &vocab[c * max_w]); |
| break; |
| } |
| } |
| } |
| for (a = 0; a < N; a++) sprintf(stringBuffer+strlen(stringBuffer), "<tr><td>%s</td><td align=\"right\">%f</td></tr>\n", bestw[a], bestd[a]); |
| sprintf(stringBuffer+strlen(stringBuffer), "</table>\n"); |
| return stringBuffer; |
| } |
| |
| __DATA__ |
| |
| @@ index.html.ep |
| <!DOCTYPE html> |
| <html> |
| <head><title>word2vec</title></head> |
| <body> |
| <form action="<%=url_for('/')->to_abs%>" method="GET"> |
| Word: <input type="text" name="word"> |
| <input type="submit" value="Show neighbours"> |
| </form> |
| <br> |
| <%== print_neighbours($word) %> |
| </body> |
| </html> |
| |