| #!/usr/local/bin/perl |
| use Inline C; |
| use Mojolicious::Lite; |
| use Encode qw(decode encode); |
| |
# Load the word2vec model once at startup. init_net() returns 0 on success
# and -1 on failure; without a model every later query would operate on
# unallocated C buffers, so refuse to start instead.
init_net("vectors14.bin") == 0
    or die "Cannot load word vectors from vectors14.bin\n";

# GET / : show the search form and, when a non-blank "word" parameter is
# given, the nearest neighbours of that word (or of the sum of several
# space-separated words).
get '/' => sub {
    my $c    = shift;
    my $word = $c->param('word');
    my $list;

    # The parameter is absent (undef) on a plain page load; only query the
    # model when there is at least one non-whitespace character.
    if (defined $word && $word =~ /\S/) {
        # The C side compares raw bytes, so the query is passed as
        # ISO-8859-1 encoded bytes.
        $list = print_neighbours(encode("iso-8859-1", $word));
    }

    $c->render(template => "index", word => $word, list => $list);
};

app->start;

exit;
| |
| __END__ |
| |
| __C__ |
| #include <stdio.h> |
| #include <string.h> |
| #include <math.h> |
| #include <malloc.h> |
#include <stdlib.h> // malloc, free
| |
/* Capacity limits shared by the loader and the neighbour search. */
#define max_size 2000 /* length of vec[], of each st[] row and of each bestw[] buffer */
#define max_w 50      /* bytes reserved per vocabulary entry in vocab */
#define N 75          /* number of neighbour slots in bestw[] / bestd[] */

//the thread function
/* Prototype only; no definition is visible in this file. */
void *connection_handler(void *);

/* Model data, filled in by init_net(). */
long long words;      /* number of vocabulary entries read from the file header */
long long size;       /* number of floats per word vector, read from the header */
char *vocab;          /* words * max_w bytes, one fixed-size slot per word */
float *M;             /* words * size floats, one row per word */

/* Result slots written by print_neighbours(). */
char *bestw[N];
float bestd[N];

/* Query scratch space used by print_neighbours(). */
char st[100][max_size]; /* query split into space-separated tokens */
long long bi[100];      /* vocabulary index of each token, -1 if not found */
long long cn;           /* number of tokens in the current query */
float vec[max_size];    /* summed query vector */
float dist, len;

/* Shared loop counters. */
long long a, b, c, d;

/* Text buffer allocated by init_net() and written by print_neighbours(). */
char *stringBuffer;

/* Not referenced by the functions visible in this file. */
char file_name[max_size];
char ch;
| |
| int init_net(char *file_name) { |
| FILE *f; |
| |
| stringBuffer = malloc(64000); |
| f = fopen(file_name, "rb"); |
| if (f == NULL) { |
| printf("Input file %s not found\n", file_name); |
| return -1; |
| } |
| fscanf(f, "%lld", &words); |
| fscanf(f, "%lld", &size); |
| vocab = (char *)malloc((long long)words * max_w * sizeof(char)); |
| for (a = 0; a < N; a++) bestw[a] = (char *)malloc(max_size * sizeof(char)); |
| M = (float *)malloc((long long)words * (long long)size * sizeof(float)); |
| if (M == NULL) { |
| printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size); |
| return -1; |
| } |
| for (b = 0; b < words; b++) { |
| a = 0; |
| while (1) { |
| vocab[b * max_w + a] = fgetc(f); |
| if (feof(f) || (vocab[b * max_w + a] == ' ')) break; |
| if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++; |
| } |
| vocab[b * max_w + a] = 0; |
| for (a = 0; a < size; a++) fread(&M[a + b * size], sizeof(float), 1, f); |
| len = 0; |
| for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size]; |
| len = sqrt(len); |
| for (a = 0; a < size; a++) M[a + b * size] /= len; |
| } |
| fclose(f); |
| return 0; |
| } |
| |
| SV *print_neighbours(char *st1) { |
| FILE *out=stdout; |
| *stringBuffer=0; |
| |
| for (a = 0; a < N; a++) bestd[a] = 0; |
| for (a = 0; a < N; a++) bestw[a][0] = 0; |
| a = 0; |
| cn = 0; |
| b = 0; |
| c = 0; |
| while (1) { |
| st[cn][b] = st1[c]; |
| b++; |
| c++; |
| st[cn][b] = 0; |
| if (st1[c] == 0) break; |
| if (st1[c] == ' ') { |
| cn++; |
| b = 0; |
| c++; |
| } |
| } |
| cn++; |
| for (a = 0; a < cn; a++) { |
| for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break; |
| if (b == words) b = -1; |
| bi[a] = b; |
| sprintf(stringBuffer, "\n<pre>Word: \"%s\" Position in vocabulary: %lld</pre>\n", st[a], bi[a]); |
| if (b == -1) { |
| sprintf(stringBuffer+strlen(stringBuffer), "Out of dictionary word!\n"); |
| break; |
| } |
| } |
| if (b == -1) goto end; |
| for (a = 0; a < size; a++) vec[a] = 0; |
| for (b = 0; b < cn; b++) { |
| if (bi[b] == -1) continue; |
| for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size]; |
| } |
| len = 0; |
| for (a = 0; a < size; a++) len += vec[a] * vec[a]; |
| len = sqrt(len); |
| for (a = 0; a < size; a++) vec[a] /= len; |
| for (a = 0; a < N; a++) bestd[a] = -1; |
| for (a = 0; a < N; a++) bestw[a][0] = 0; |
| for (c = 0; c < words; c++) { |
| a = 0; |
| for (b = 0; b < cn; b++) if (bi[b] == c) a = 1; |
| if (a == 1) continue; |
| dist = 0; |
| for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size]; |
| for (a = 0; a < N; a++) { |
| if (dist > bestd[a]) { |
| for (d = N - 1; d > a; d--) { |
| bestd[d] = bestd[d - 1]; |
| strcpy(bestw[d], bestw[d - 1]); |
| } |
| bestd[a] = dist; |
| strcpy(bestw[a], &vocab[c * max_w]); |
| break; |
| } |
| } |
| } |
| AV* array = newAV(); |
| for (a = 0; a < N; a++) { |
| HV* hash = newHV(); |
| hv_store(hash, "word", strlen("word"), newSVpvf(bestw[a], 0), 0); |
| hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0); |
| av_push(array, newRV_noinc((SV*)hash)); |
| } |
| end: |
| return newRV_noinc((SV*)array); |
| } |
| |
| __DATA__ |
| |
@@ index.html.ep
%# Search form plus the nearest-neighbour table for the queried word.
%# Stash values: $word is the raw query string (undef when none was sent),
%# $list is the array reference returned by print_neighbours (undef when
%# no query ran); each element is a hash with the keys "word" and "dist".
<!DOCTYPE html>
<html>
<head><title>DeReKo-Word-Vector-Distances</title></head>
<body>
<p>Word vector model based on a 1.9 billion word sample of DeReKo-2015-II (mainly Wikipedia including discussions, current newspapers and fiction). Trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters:</p>
<pre>
-cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
</pre>
<form action="<%=url_for('/')->to_abs%>" method="GET">
Word: <input type="text" name="word">
<input type="submit" value="Show neighbours">
</form>
<br>
% if ($list && @$list) {
<p>Target word: <b><%= $word%></b></p>
<h3>Nearest neighbours</h3>
<table>
<tr>
<th align="right">Pos.</th><th align="left">Word</th><th align="right">Cosine dist.</th>
</tr>
% my $i=1; for my $item (@$list) {
<tr>
<td align="right">
<%= $i++ %>.
</td>
<td>
%# Build the link with url_for/query so the word is URL-encoded and the
%# link keeps working when the app is mounted under a path prefix.
<a href="<%= url_for('/')->query(word => $item->{word}) %>">
<%= $item->{word} %>
</a>
</td>
<td align="right">
<%= sprintf("%.3f", $item->{dist}) %>
</td>
</tr>
% }
</table>
% } elsif (defined $word && $word =~ /\S/) {
<p>No neighbours found for <b><%= $word %></b>; the word may not be in the vocabulary.</p>
% }
</body>
</html>
| |