blob: df47adc6da47219c6998ff03d1e9caf1aa04ef80 [file] [log] [blame]
Marc Kupietzdc22b982015-10-09 09:19:34 +02001#!/usr/local/bin/perl
Marc Kupietz43ee87e2016-04-25 10:50:08 +02002#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -g";
Marc Kupietzdc22b982015-10-09 09:19:34 +02003use Inline C;
Marc Kupietz43ee87e2016-04-25 10:50:08 +02004use Inline C => Config => CLEAN_AFTER_BUILD => 0; #, ccflags => $Config{ccflags}." -g";
Marc Kupietzdc22b982015-10-09 09:19:34 +02005use Mojolicious::Lite;
Marc Kupietzc4893362016-02-25 08:04:46 +01006use Mojo::JSON qw(decode_json encode_json to_json);
Marc Kupietz247500f2015-10-09 11:29:01 +02007use Encode qw(decode encode);
Marc Kupietza5b90152016-03-15 17:39:19 +01008use Getopt::Std;
Marc Kupietz7bc85fd2016-02-24 11:42:41 +01009use Mojo::Server::Daemon;
Marc Kupietzd4227392016-03-01 16:45:12 +010010plugin 'Log::Access';
Marc Kupietzdc22b982015-10-09 09:19:34 +020011
# Command line switches (filled in by Getopt::Std).
our $opt_i = 0;        # -i       latin1-input?
our $opt_l = undef;    # -l HOST  address to listen on (default: all interfaces)
our $opt_p = 5676;     # -p PORT  port to listen on
our $opt_n = '';       # -n FILE  net file handed to init_net()
our $opt_d;            # -d FILE  dump the vectors to FILE and exit

my $training_args = "";

# getopts() -- not getopt() -- is the function that understands the "x:"
# notation. With getopt() every listed character takes an argument, so the
# boolean switch -i would swallow the following command line word.
getopts('d:il:p:n:')
    or die "Usage: $0 [-i] [-l host] [-p port] [-n net_file] [-d dump_file] [vectors_file]\n";

# -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
my $vectors_file = $ARGV[0] || "vectors15.bin";
init_net($vectors_file, $opt_n, ($opt_i ? 1 : 0)) == 0
    or die "Cannot load word vectors from $vectors_file\n";

# The optional "<vectors>.args" side file is only looked up for an explicitly
# named model; its first line is shown on the result page.
if ($ARGV[0] && open(my $args_fh, '<', "$ARGV[0].args")) {
    $training_args = <$args_fh> // '';
    close($args_fh);
}

if ($opt_d) {    # -d: dump vecs and exit
    exit(dump_vecs($opt_d) == 0 ? 0 : 1);
}

my $daemon = Mojo::Server::Daemon->new(
    app    => app,
    listen => ['http://' . ($opt_l ? $opt_l : '*') . ":$opt_p"]
);

# Returns request parameter $name if it is a plain non-negative, non-zero
# number and $default otherwise. The values are handed to C as integers and
# are interpolated into the JavaScript of the result page, so nothing but
# digits and a decimal point may pass.
sub _numeric_param {
    my ($c, $name, $default) = @_;
    my $value = $c->param($name);
    return $default unless defined $value;
    return $default unless $value =~ /\A(?:[0-9]+(?:\.[0-9]*)?|\.[0-9]+)\z/;
    return $value != 0 ? $value : $default;
}

get '/' => sub {
    my $c             = shift;
    my $word          = $c->param('word');
    my $no_nbs        = _numeric_param($c, 'n',          100);
    my $no_iterations = _numeric_param($c, 'N',          2000);
    my $perplexity    = _numeric_param($c, 'perplexity', 20);
    my $epsilon       = _numeric_param($c, 'epsilon',    5);
    my $som           = _numeric_param($c, 'som',        0);
    my $sort          = _numeric_param($c, 'sort',       0);
    my $res;
    my @lists;

    if (defined($word) && $word !~ /^\s*$/) {
        $c->inactivity_timeout(300);
        $word =~ s/\s+/ /g;

        # Several queries may be given at once, separated by "|".
        for my $w (split(/ *\| */, $word)) {
            $w =~ s/\A +| +\z//g;
            if ($w eq '') {    # e.g. "a||b": nothing to look up
                push(@lists, undef);
                next;
            }
            $c->app->log->debug('Looking for neighbours of ' . $w);
            if ($opt_i) {
                $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort);
            }
            else {
                $res = get_neighbours($w, $no_nbs, $sort);
            }
            push(@lists, $res->{paradigmatic});
        }
        $word =~ s/ *\| */ | /g;
    }

    $c->render(
        template      => "index",
        word          => $word,
        no_nbs        => $no_nbs,
        no_iterations => $no_iterations,
        epsilon       => $epsilon,
        perplexity    => $perplexity,
        show_som      => $som,
        sort          => $sort,
        training_args => $training_args,
        lists         => \@lists,
        # collocators of the last query only
        collocators   => ($res ? $res->{syntagmatic} : undef)
    );
};

$daemon->run;    # app->start;

exit;
75
76__END__
77
78__C__
79#include <stdio.h>
80#include <string.h>
81#include <math.h>
82#include <malloc.h>
83#include <stdlib.h> //strlen
Marc Kupietzf0809762016-02-26 10:13:47 +010084#include <sys/mman.h>
Marc Kupietz000ad862016-02-26 14:59:12 +010085#include <pthread.h>
Marc Kupietzdc22b982015-10-09 09:19:34 +020086
#define max_size 2000          /* capacity of fixed-size string / vector buffers */
#define max_w 50               /* bytes reserved per vocabulary entry */
#define MAX_NEIGHBOURS 1000    /* upper bound for the number of neighbours per query */
#define MAX_WORDS (-1)         /* vocabulary limit; values <= 0 disable the limit */
#define MAX_THREADS 100        /* capacity of the per-request thread tables */
#define MAX_CC 50
#define EXP_TABLE_SIZE 1000    /* number of entries of the sigmoid lookup table */
#define MAX_EXP 6              /* the lookup table covers [-MAX_EXP, MAX_EXP] */
#define MIN_RESP 0.50          /* minimum activation for a collocator candidate */

//the thread function
void *connection_handler(void *);

/* Result list produced by one worker thread. */
typedef struct {
  long long *index;   /* vocabulary indices of the hits */
  float *dist;        /* score of each hit */
  float *norm;        /* normalised score (set by getCollocators only) */
  long long *pos;     /* window position (set by getCollocators only) */
  int length;         /* number of entries the consumer should read */
} knn;

/* A tokenised query. */
typedef struct {
  long long wordi[MAX_NEIGHBOURS];   /* vocabulary index per token, -1 if unknown */
  char sep[MAX_NEIGHBOURS];          /* sep[i] is the separator following token i */
  int length;                        /* number of tokens */
} wordlist;

/* Arguments handed to a worker thread. */
typedef struct {
  wordlist *wl;
  char *token;
  int N;                 /* number of results wanted */
  long from;             /* first item of the thread's slice */
  unsigned long upto;    /* one past the last item of the slice */
  float *target_sums;    /* per-word accumulator shared by the collocator threads */
} knnpars;

/* M: word vectors; M2: mapping of the net file (NULL if no net is loaded);
   syn1neg_window: points into M2; expTable: sigmoid lookup table. */
float *M = NULL, *M2 = NULL, *syn1neg_window = NULL, *expTable = NULL;
char *vocab = NULL;     /* words entries of max_w bytes each */

long long words, size;  /* vocabulary size and vector dimensionality */
int num_threads=20;
int latin_enc=0;        /* non-zero: vocabulary is not UTF-8 */
int window;             /* window size derived from the net file */
Marc Kupietzdc22b982015-10-09 09:19:34 +0200130
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100131int init_net(char *file_name, char *net_name, int latin) {
Marc Kupietz67c20282016-02-26 09:42:00 +0100132 FILE *f, *binvecs, *binwords;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100133 int binwords_fd, binvecs_fd, net_fd, i;
Marc Kupietz82b02672016-02-26 12:32:25 +0100134 long long a, b, c, d, cn;
135 float len;
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200136 double val;
Marc Kupietz82b02672016-02-26 12:32:25 +0100137
Marc Kupietz67c20282016-02-26 09:42:00 +0100138 char binvecs_fname[256], binwords_fname[256];
139 strcpy(binwords_fname, file_name);
140 strcat(binwords_fname, ".words");
141 strcpy(binvecs_fname, file_name);
142 strcat(binvecs_fname, ".vecs");
Marc Kupietzdc22b982015-10-09 09:19:34 +0200143
Marc Kupietza5b90152016-03-15 17:39:19 +0100144 latin_enc = latin;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200145 f = fopen(file_name, "rb");
146 if (f == NULL) {
147 printf("Input file %s not found\n", file_name);
148 return -1;
149 }
150 fscanf(f, "%lld", &words);
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100151 if(MAX_WORDS > 0 && words > MAX_WORDS) words = MAX_WORDS;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200152 fscanf(f, "%lld", &size);
Marc Kupietz2cb667e2016-03-10 09:44:12 +0100153 if( (binvecs_fd = open(binvecs_fname, O_RDONLY)) < 0 || (binwords_fd = open(binwords_fname, O_RDONLY)) < 0) {
154 printf("Converting %s to memory mappable structures\n", file_name);
Marc Kupietzf0809762016-02-26 10:13:47 +0100155 vocab = (char *)malloc((long long)words * max_w * sizeof(char));
156 M = (float *)malloc((long long)words * (long long)size * sizeof(float));
157 if (M == NULL) {
158 printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
159 return -1;
160 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200161 if(strstr(file_name, ".txt")) {
162 for (b = 0; b < words; b++) {
163 a = 0;
164 while (1) {
165 vocab[b * max_w + a] = fgetc(f);
166 if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
167 if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
168 }
169 vocab[b * max_w + a] = 0;
170 len = 0;
171 for (a = 0; a < size; a++) {
172 fscanf(f, "%lf", &val);
173 M[a + b * size] = val;
174 len += val * val;
175 }
176 len = sqrt(len);
177 for (a = 0; a < size; a++) M[a + b * size] /= len;
178 }
179 } else {
180 for (b = 0; b < words; b++) {
181 a = 0;
182 while (1) {
183 vocab[b * max_w + a] = fgetc(f);
184 if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
185 if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
186 }
187 vocab[b * max_w + a] = 0;
188 fread(&M[b * size], sizeof(float), size, f);
189 len = 0;
190 for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
191 len = sqrt(len);
192 for (a = 0; a < size; a++) M[a + b * size] /= len;
193 }
194 }
Marc Kupietz67c20282016-02-26 09:42:00 +0100195 if( (binvecs = fopen(binvecs_fname, "wb")) != NULL && (binwords = fopen(binwords_fname, "wb")) != NULL) {
196 fwrite(M, sizeof(float), (long long)words * (long long)size, binvecs);
197 fclose(binvecs);
198 fwrite(vocab, sizeof(char), (long long)words * max_w, binwords);
199 fclose(binwords);
200 }
Marc Kupietz2cb667e2016-03-10 09:44:12 +0100201 }
202 if( (binvecs_fd = open(binvecs_fname, O_RDONLY)) >= 0 && (binwords_fd = open(binwords_fname, O_RDONLY)) >= 0) {
203 M = mmap(0, sizeof(float) * (long long)words * (long long)size, PROT_READ, MAP_SHARED, binvecs_fd, 0);
204 vocab = mmap(0, sizeof(char) * (long long)words * max_w, PROT_READ, MAP_SHARED, binwords_fd, 0);
205 if (M == MAP_FAILED || vocab == MAP_FAILED) {
206 close(binvecs_fd);
207 close(binwords_fd);
208 fprintf(stderr, "Cannot mmap %s or %s\n", binwords_fname, binvecs_fname);
209 exit(-1);
210 }
211 } else {
212 fprintf(stderr, "Cannot open %s or %s\n", binwords_fname, binvecs_fname);
213 exit(-1);
Marc Kupietz67c20282016-02-26 09:42:00 +0100214 }
Marc Kupietzdc22b982015-10-09 09:19:34 +0200215 fclose(f);
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100216
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200217 if(net_name && strlen(net_name) > 0) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100218 if( (net_fd = open(net_name, O_RDONLY)) >= 0) {
219 window = (lseek(net_fd, 0, SEEK_END) - sizeof(float) * words * size) / words / size / sizeof(float) / 2;
220 // lseek(net_fd, sizeof(float) * words * size, SEEK_SET);
Marc Kupietz10bec2b2016-03-23 09:41:31 +0100221 // munmap(M, sizeof(float) * words * size);
222 M2 = mmap(0, sizeof(float) * words * size + sizeof(float) * 2 * window * size * words, PROT_READ, MAP_SHARED, net_fd, 0);
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200223 if (M2 == MAP_FAILED) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100224 close(net_fd);
225 fprintf(stderr, "Cannot mmap %s\n", net_name);
226 exit(-1);
227 }
Marc Kupietz10bec2b2016-03-23 09:41:31 +0100228 syn1neg_window = M2 + words * size;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100229 } else {
230 fprintf(stderr, "Cannot open %s\n", net_name);
231 exit(-1);
232 }
233 fprintf(stderr, "Successfully memmaped %s. Determined window size: %d\n", net_name, window);
234 }
235
236 expTable = (float *) malloc((EXP_TABLE_SIZE + 1) * sizeof(float));
237 for (i = 0; i < EXP_TABLE_SIZE; i++) {
238 expTable[i] = exp((i / (float) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
239 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
240 }
Marc Kupietzdc22b982015-10-09 09:19:34 +0200241 return 0;
242}
243
Marc Kupietz271e2a42016-03-22 11:37:43 +0100244void *getCollocators(knnpars *pars) {
245 int N = pars->N;
246 int cc = pars->wl->wordi[0];
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100247 knn *nbs = NULL;
248 long window_layer_size = size * window * 2;
249 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
250 float f, max_f, maxmax_f;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100251 float *target_sums, *bestf, *bestn, worstbest, wpos_sum;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100252 long long *besti, *bestp;
Marc Kupietzd5642582016-03-19 22:23:13 +0100253
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200254 if(M2 == NULL || cc == -1)
Marc Kupietzd5642582016-03-19 22:23:13 +0100255 return NULL;
256
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100257 a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
258 besti = malloc(N * sizeof(long long));
259 bestp = malloc(N * sizeof(long long));
260 bestf = malloc(N * sizeof(float));
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100261 bestn = malloc(N * sizeof(float));
262
Marc Kupietz271e2a42016-03-22 11:37:43 +0100263 worstbest = MIN_RESP;
264
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100265 for (b = 0; b < words; b++)
266 target_sums[b]=0;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100267 for (b = 0; b < N; b++) {
Marc Kupietz271e2a42016-03-22 11:37:43 +0100268 besti[b] = -1;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100269 bestn[b] = 1;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100270 bestf[b] = worstbest;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100271 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100272
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100273 d = cc;
274 maxmax_f = -1;
275 maxmax_target = 0;
276
Marc Kupietz271e2a42016-03-22 11:37:43 +0100277 for (a = pars->from; a < pars->upto; a++) {
278 if(a >= window)
279 a++;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100280 wpos_sum = 0;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100281 printf("window pos: %ld\n", a);
282 if (a != window) {
283 max_f = -1;
284 window_offset = a * size;
285 if (a > window)
286 window_offset -= size;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100287 for(target = 0; target < words; target ++) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100288 if(target == d)
289 continue;
290 f = 0;
291 for (c = 0; c < size; c++)
Marc Kupietz10bec2b2016-03-23 09:41:31 +0100292 f += M2[d* size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100293 if (f < -MAX_EXP)
294 continue;
295 else if (f > MAX_EXP)
296 continue;
297 else
298 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100299 wpos_sum += f;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100300
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100301 target_sums[target] += f;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100302 if(f > worstbest) {
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100303 for (b = 0; b < N; b++) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100304 if (f > bestf[b]) {
Marc Kupietz33679a32016-03-22 08:49:39 +0100305 memmove(bestf + b + 1, bestf + b, (N - b -1) * sizeof(float));
306 memmove(besti + b + 1, besti + b, (N - b -1) * sizeof(long long));
307 memmove(bestp + b + 1, bestp + b, (N - b -1) * sizeof(long long));
308 bestf[b] = f;
309 besti[b] = target;
310 bestp[b] = window-a;
311 break;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100312 }
313 }
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100314 if(b == N - 1)
315 worstbest = bestf[N-1];
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100316 }
317 }
318 printf("%d %.2f\n", max_target, max_f);
319 printf("%s (%.2f) ", &vocab[max_target * max_w], max_f);
320 if(max_f > maxmax_f) {
321 maxmax_f = max_f;
322 maxmax_target = max_target;
323 }
Marc Kupietz33679a32016-03-22 08:49:39 +0100324 for (b = 0; b < N; b++)
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100325 if(bestp[b] == window-a)
326 bestn[b] = bestf[b] / wpos_sum;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100327 } else {
328 printf("\x1b[1m%s\x1b[0m ", &vocab[d*max_w]);
329 }
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100330
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100331 }
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100332 for (b = 0; b < words; b++)
333 pars->target_sums[b] += (target_sums[b] / wpos_sum ) / (window * 2);
334 free(target_sums);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100335 for(b=0; b<N && besti[b] >= 0; b++) // THIS LOOP IS NEEDED (b...)
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100336 printf("%s %.2f %d * ", &vocab[besti[b]*max_w], bestf[b], bestp[b]);
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100337 printf("\n");
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100338 nbs = malloc(sizeof(knn));
339 nbs->index = besti;
340 nbs->dist = bestf;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100341 nbs->norm = bestn;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100342 nbs->pos = bestp;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100343 nbs->length = b-1;
344 pthread_exit(nbs);
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100345}
346
Marc Kupietz48c29682016-03-19 11:30:43 +0100347wordlist *getTargetWords(char *st1) {
348 wordlist *wl = malloc(sizeof(wordlist));
349 char st[100][max_size], sep[100];
350 long a, b=0, c=0, cn=0;
351
Marc Kupietzdc22b982015-10-09 09:19:34 +0200352 while (1) {
353 st[cn][b] = st1[c];
354 b++;
355 c++;
356 st[cn][b] = 0;
357 if (st1[c] == 0) break;
Marc Kupietz95aa1c02016-03-15 09:40:43 +0100358 if (st1[c] == ' ' || st1[c] == '-') {
359 sep[cn++] = st1[c];
Marc Kupietzdc22b982015-10-09 09:19:34 +0200360 b = 0;
361 c++;
362 }
363 }
364 cn++;
365 for (a = 0; a < cn; a++) {
Marc Kupietz34a3ee92016-02-27 22:43:16 +0100366 for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
367 if (b == words) b = -1;
Marc Kupietz48c29682016-03-19 11:30:43 +0100368 wl->wordi[a] = b;
369 fprintf(stderr, "Word: \"%s\" Position in vocabulary: %lld\n", st[a], wl->wordi[a]);
Marc Kupietzdc22b982015-10-09 09:19:34 +0200370 if (b == -1) {
Marc Kupietze8da3062016-02-25 08:37:53 +0100371 fprintf(stderr, "Out of dictionary word!\n");
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100372 cn--;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200373 break;
374 }
375 }
Marc Kupietz48c29682016-03-19 11:30:43 +0100376 wl->length=cn;
377 return(wl);
378}
379
380void *_get_neighbours(knnpars *pars) {
381 char *st1 = pars->token;
382 int N = pars->N;
383 long from = pars -> from;
384 unsigned long upto = pars -> upto;
385 char file_name[max_size], st[100][max_size], *sep;
386 float dist, len, *bestd, vec[max_size];
387 long long a, b, c, d, cn, *bi, *besti;
388 char ch;
389 knn *nbs = NULL;
390 wordlist *wl = pars->wl;
391
392 besti = malloc(N * sizeof(long long));
393 bestd = malloc(N * sizeof(float));
394
395 float worstbest=-1;
396
397 for (a = 0; a < N; a++) bestd[a] = 0;
398 a = 0;
399 bi = wl->wordi;
400 cn = wl->length;
401 sep = wl->sep;
402 b = bi[0];
403 c = 0;
Marc Kupietz000ad862016-02-26 14:59:12 +0100404 if (b == -1) {
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100405 N = 0;
406 goto end;
407 }
Marc Kupietzdc22b982015-10-09 09:19:34 +0200408 for (a = 0; a < size; a++) vec[a] = 0;
409 for (b = 0; b < cn; b++) {
410 if (bi[b] == -1) continue;
Marc Kupietz95aa1c02016-03-15 09:40:43 +0100411 if(b>0 && sep[b-1] == '-')
412 for (a = 0; a < size; a++) vec[a] -= M[a + bi[b] * size];
413 else
414 for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
Marc Kupietzdc22b982015-10-09 09:19:34 +0200415 }
416 len = 0;
417 for (a = 0; a < size; a++) len += vec[a] * vec[a];
418 len = sqrt(len);
419 for (a = 0; a < size; a++) vec[a] /= len;
420 for (a = 0; a < N; a++) bestd[a] = -1;
Marc Kupietz000ad862016-02-26 14:59:12 +0100421 for (c = from; c < upto; c++) {
Marc Kupietzdc22b982015-10-09 09:19:34 +0200422 a = 0;
Marc Kupietz34020dc2016-02-25 08:44:19 +0100423// do not skip taget word
Marc Kupietze8da3062016-02-25 08:37:53 +0100424// for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
425// if (a == 1) continue;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200426 dist = 0;
427 for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
Marc Kupietzbe1b9fc2016-02-26 10:34:30 +0100428 if(dist > worstbest) {
429 for (a = 0; a < N; a++) {
430 if (dist > bestd[a]) {
Marc Kupietz33679a32016-03-22 08:49:39 +0100431 memmove(bestd + a + 1, bestd + a, (N - a -1) * sizeof(float));
432 memmove(besti + a + 1, besti + a, (N - a -1) * sizeof(long long));
Marc Kupietzbe1b9fc2016-02-26 10:34:30 +0100433 bestd[a] = dist;
434 besti[a] = c;
435 break;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200436 }
Marc Kupietzdc22b982015-10-09 09:19:34 +0200437 }
Marc Kupietzbe1b9fc2016-02-26 10:34:30 +0100438 worstbest = bestd[N-1];
Marc Kupietzdc22b982015-10-09 09:19:34 +0200439 }
440 }
Marc Kupietz34020dc2016-02-25 08:44:19 +0100441
Marc Kupietz000ad862016-02-26 14:59:12 +0100442 nbs = malloc(sizeof(knn));
443 nbs->index = besti;
444 nbs->dist = bestd;
445 nbs->length = N;
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100446end:
Marc Kupietz000ad862016-02-26 14:59:12 +0100447 pthread_exit(nbs);
Marc Kupietzdc22b982015-10-09 09:19:34 +0200448}
449
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100450
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100451SV *get_neighbours(char *st1, int N, int sort_by) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100452 HV *result = newHV();
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100453 float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
Marc Kupietz50485ba2016-03-23 09:13:14 +0100454 long besti[MAX_NEIGHBOURS], bestp[MAX_NEIGHBOURS], a, b, c, d, slice;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100455 knn *para_nbs[MAX_THREADS];
456 knn *syn_nbs[MAX_THREADS];
Marc Kupietz000ad862016-02-26 14:59:12 +0100457 knnpars pars[MAX_THREADS];
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100458 pthread_t *pt = (pthread_t *)malloc((num_threads+1) * sizeof(pthread_t));
Marc Kupietz48c29682016-03-19 11:30:43 +0100459 wordlist *wl;
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200460 int syn_threads = (M2? window * 2 : 0);
461 int para_threads = num_threads - syn_threads;
Marc Kupietz48c29682016-03-19 11:30:43 +0100462
Marc Kupietz000ad862016-02-26 14:59:12 +0100463 if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
464
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200465 slice = words / para_threads;
466
Marc Kupietz48c29682016-03-19 11:30:43 +0100467 wl = getTargetWords(st1);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100468 if(wl->length < 1)
469 goto end;
Marc Kupietz48c29682016-03-19 11:30:43 +0100470
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100471 a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
472 for(a = 0; a < words; a++)
473 target_sums[a] = 0;
474
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200475 printf("Starting %d threads\n", para_threads);
476 fflush(stdout);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100477 for(a=0; a < para_threads; a++) {
Marc Kupietz000ad862016-02-26 14:59:12 +0100478 pars[a].token = st1;
Marc Kupietz48c29682016-03-19 11:30:43 +0100479 pars[a].wl = wl;
Marc Kupietz000ad862016-02-26 14:59:12 +0100480 pars[a].N = N;
481 pars[a].from = a*slice;
482 pars[a].upto = ((a+1)*slice > words? words:(a+1)*slice);
483 pthread_create(&pt[a], NULL, _get_neighbours, (void *) &pars[a]);
484 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200485 if(M2) {
486 for(a=0; a < syn_threads; a++) {
487 pars[a + para_threads].target_sums = target_sums;
488 pars[a + para_threads].wl = wl;
489 pars[a + para_threads].N = N;
490 pars[a + para_threads].from = a;
491 pars[a + para_threads].upto = a+1;
492 pthread_create(&pt[a + para_threads], NULL, getCollocators, (void *) &pars[a + para_threads]);
493 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100494 }
495 printf("Waiting for para threads to join\n");
496 fflush(stdout);
497 for (a = 0; a < para_threads; a++) pthread_join(pt[a], &para_nbs[a]);
498 printf("Para threads joint\n");
499 fflush(stdout);
Marc Kupietz000ad862016-02-26 14:59:12 +0100500
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200501 /* if(!syn_nbs[0]) */
502 /* goto end; */
Marc Kupietz000ad862016-02-26 14:59:12 +0100503
504 for(b=0; b < N; b++) {
Marc Kupietz271e2a42016-03-22 11:37:43 +0100505 besti[b] = para_nbs[0]->index[b];
506 bestd[b] = para_nbs[0]->dist[b];
Marc Kupietz000ad862016-02-26 14:59:12 +0100507 }
508
Marc Kupietz271e2a42016-03-22 11:37:43 +0100509 for(a=1; a < para_threads; a++) {
510 for(b=0; b < para_nbs[a]->length && para_nbs[a]->index[b] >= 0; b++) {
Marc Kupietz000ad862016-02-26 14:59:12 +0100511 for(c=0; c < N; c++) {
Marc Kupietz271e2a42016-03-22 11:37:43 +0100512 if(para_nbs[a]->dist[b] > bestd[c]) {
Marc Kupietz000ad862016-02-26 14:59:12 +0100513 for(d=N-1; d>c; d--) {
514 bestd[d] = bestd[d-1];
515 besti[d] = besti[d-1];
516 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100517 besti[c] = para_nbs[a]->index[b];
518 bestd[c] = para_nbs[a]->dist[b];
Marc Kupietz000ad862016-02-26 14:59:12 +0100519 break;
520 }
521 }
522 }
523 }
524
Marc Kupietz271e2a42016-03-22 11:37:43 +0100525 AV* array = newAV();
526 for (a = 0; a < N; a++) {
Marc Kupietz271e2a42016-03-22 11:37:43 +0100527 HV* hash = newHV();
Marc Kupietz50485ba2016-03-23 09:13:14 +0100528 SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100529 if(latin_enc == 0) SvUTF8_on(word);
530 hv_store(hash, "word", strlen("word"), word , 0);
531 hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
532 hv_store(hash, "rank", strlen("rank"), newSVuv(besti[a]), 0);
533 AV *vector = newAV();
534 for (b = 0; b < size; b++) {
535 av_push(vector, newSVnv(M[b + besti[a] * size]));
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100536 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100537 hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV*)vector), 0);
538 av_push(array, newRV_noinc((SV*)hash));
539 }
540 hv_store(result, "paradigmatic", strlen("paradigmatic"), newRV_noinc((SV*)array), 0);
541
Marc Kupietz50485ba2016-03-23 09:13:14 +0100542 for(b=0; b < MAX_NEIGHBOURS; b++) {
543 besti[b] = -1L;
544 bestd[b] = 0;
545 bestn[b] = 0;
546 bestp[b] = 0;
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100547 bests[b] = 0;
Marc Kupietz50485ba2016-03-23 09:13:14 +0100548 }
549
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200550 if (M2) {
551 printf("Waiting for syn threads to join\n");
552 fflush(stdout);
553 for (a = 0; a < syn_threads; a++) pthread_join(pt[a+para_threads], &syn_nbs[a]);
554 printf("syn threads joint\n");
555 fflush(stdout);
Marc Kupietz50485ba2016-03-23 09:13:14 +0100556
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200557 for(b=0; b < syn_nbs[0]->length; b++) {
558 besti[b] = syn_nbs[0]->index[b];
559 bestd[b] = syn_nbs[0]->dist[b];
560 bestn[b] = syn_nbs[0]->norm[b];
561 bestp[b] = syn_nbs[0]->pos[b];
562 bests[b] = target_sums[syn_nbs[0]->index[b]];
563 }
564
565 if(sort_by != 1) { // sort by responsiveness
566 for(a=1; a < syn_threads; a++) {
567 for(b=0; b < syn_nbs[a]->length; b++) {
568 for(c=0; c < MAX_NEIGHBOURS; c++) {
569 if(syn_nbs[a]->dist[b] > bestd[c]) {
570 for(d=MAX_NEIGHBOURS-1; d>c; d--) {
571 bestd[d] = bestd[d-1];
572 besti[d] = besti[d-1];
573 bestn[d] = bestn[d-1];
574 bestp[d] = bestp[d-1];
575 }
576 besti[c] = syn_nbs[a]->index[b];
577 bestd[c] = syn_nbs[a]->dist[b];
578 bestn[c] = syn_nbs[a]->norm[b];
579 bestp[c] = syn_nbs[a]->pos[b];
580 break;
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100581 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200582 }
583 }
584 }
585 } else { // sort by mean p
586 for(a=1; a < syn_threads; a++) {
587 for(b=0; b < syn_nbs[a]->length; b++) {
588 for(c=0; c < MAX_NEIGHBOURS; c++) {
589 if(target_sums[syn_nbs[a]->index[b]] > bests[c]) {
590 for(d=MAX_NEIGHBOURS-1; d>c; d--) {
591 bestd[d] = bestd[d-1];
592 besti[d] = besti[d-1];
593 bestn[d] = bestn[d-1];
594 bestp[d] = bestp[d-1];
595 bests[d] = bests[d-1];
596 }
597 besti[c] = syn_nbs[a]->index[b];
598 bestd[c] = syn_nbs[a]->dist[b];
599 bestn[c] = syn_nbs[a]->norm[b];
600 bestp[c] = syn_nbs[a]->pos[b];
601 bests[c] = target_sums[syn_nbs[a]->index[b]];
602 break;
603 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100604 }
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100605 }
606 }
607 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200608 array = newAV();
609 for (a = 0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
610 HV* hash = newHV();
611 SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
612 if(latin_enc == 0) SvUTF8_on(word);
613 hv_store(hash, "word", strlen("word"), word , 0);
614 hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
615 hv_store(hash, "norm", strlen("norm"), newSVnv(bestn[a]), 0);
616 hv_store(hash, "sum", strlen("sum"), newSVnv(target_sums[besti[a]]), 0);
617 hv_store(hash, "pos", strlen("pos"), newSVnv(bestp[a]), 0);
618 av_push(array, newRV_noinc((SV*)hash));
Marc Kupietz271e2a42016-03-22 11:37:43 +0100619 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200620 hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100621 }
Marc Kupietz000ad862016-02-26 14:59:12 +0100622end:
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100623 return newRV_noinc((SV*)result);
Marc Kupietz000ad862016-02-26 14:59:12 +0100624}
Marc Kupietz7bc85fd2016-02-24 11:42:41 +0100625
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200626int dump_vecs(char *fname) {
627 long i, j;
628 FILE *f;
629 /* if(words>200000) */
630 /* words=200000; */
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100631
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200632 if((f=fopen(fname, "w")) == NULL) {
633 fprintf(stderr, "cannot open %s for writing\n", fname);
634 return(-1);
635 }
636 fprintf(f, "%lld %lld\n", words, size);
637 for (i=0; i < words; i++) {
638 fprintf(f, "%s ", &vocab[i * max_w]);
639 for(j=0; j < size - 1; j++)
640 fprintf(f, "%f ", M[i*size + j]);
641 fprintf(f, "%f\n", M[i*size + j]);
642 }
643 fclose(f);
644 return(0);
645}
Marc Kupietzdc22b982015-10-09 09:19:34 +0200646__DATA__
647
648@@ index.html.ep
649<!DOCTYPE html>
650<html>
Marc Kupietzc4893362016-02-25 08:04:46 +0100651<head>
652 <title>DeReKo-Word-Vector-Distances</title>
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100653 <link rel="stylesheet" href="//code.jquery.com/ui/1.11.4/themes/smoothness/jquery-ui.css">
Marc Kupietzc4893362016-02-25 08:04:46 +0100654 <script src="http://code.jquery.com/jquery-latest.min.js"></script>
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100655 <script src="//code.jquery.com/ui/1.11.4/jquery-ui.js"></script>
656 <script>
657 $(function() {
Marc Kupietz5f780672016-02-25 17:15:54 +0100658 $( document ).tooltip({
659 content: function() {
660 return $(this).attr('title');
661 }}
662 )
663 })
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100664 </script>
Marc Kupietzc4893362016-02-25 08:04:46 +0100665 <script src="//d3js.org/d3.v3.min.js" charset="utf-8"></script>
666 <script src="http://klinux10/word2vec/tsne.js"></script>
Marc Kupietzd7aea722016-03-02 11:59:12 +0100667 <script src="http://klinux10/word2vec/som.js"></script>
Marc Kupietzc5990da2016-02-26 08:47:12 +0100668 <script src="http://klinux10/word2vec/labeler.js"></script>
Marc Kupietzc4893362016-02-25 08:04:46 +0100669<style>
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100670body, input {
Marc Kupietz7b2cbeb2016-02-25 11:22:00 +0100671 font-family: Arial, sans-serif;
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100672 font-size: 11pt;
673}
674
675.ui-tooltip-content {
676 font-size: 9pt;
677 colour: #222222;
Marc Kupietz7b2cbeb2016-02-25 11:22:00 +0100678}
Marc Kupietz5f780672016-02-25 17:15:54 +0100679
680svg > .ui-tooltip-content {
Marc Kupietzb1029362016-02-27 21:38:55 +0100681 font-size: 8pt;
Marc Kupietz5f780672016-02-25 17:15:54 +0100682 colour: #222222;
683}
684
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100685#collocators {
686 margin-bottom: 15px;
687}
688
Marc Kupietzc4893362016-02-25 08:04:46 +0100689#wrapper {
690 width: 100%;
691// border: 1px solid red;
692 overflow: hidden; /* will contain if #first is longer than #second */
693}
694#first {
Marc Kupietzb1029362016-02-27 21:38:55 +0100695 margin-right: 20px;
696 float: left;
697 // border: 1px solid green;
Marc Kupietzc4893362016-02-25 08:04:46 +0100698}
699#second {
700 border: 1px solid #333;
701 overflow: hidden; /* if you don't want #second to wrap below #first */
702}
Marc Kupietzd7aea722016-03-02 11:59:12 +0100703#som2 svg {
704 border: 1px solid #333;
705}
706
Marc Kupietz4aa62172016-02-25 10:39:27 +0100707#cost {
Marc Kupietzb1029362016-02-27 21:38:55 +0100708 font-size: 8pt;
709 color: #222222;
710 margin-top: 4px;
Marc Kupietzd7aea722016-03-02 11:59:12 +0100711 margin-bottom: 12px;
Marc Kupietz4aa62172016-02-25 10:39:27 +0100712}
Marc Kupietzd7aea722016-03-02 11:59:12 +0100713
Marc Kupietz6c1ca442016-03-03 09:35:18 +0100714#sominfo1, #sominfo {
Marc Kupietzd7aea722016-03-02 11:59:12 +0100715 font-size: 8pt;
716 color: #222222;
717 margin-top: 0px;
718}
719
Marc Kupietz6c1ca442016-03-03 09:35:18 +0100720#somcolor1, #somcolor2, #somcolor3 {
721 display: inline-block;
722 height: 10px;
723 width: 10px;
724}
725
Marc Kupietzd7aea722016-03-02 11:59:12 +0100726#third {
727 border: 1px solid #333;
728}
729
Marc Kupietzc4893362016-02-25 08:04:46 +0100730</style>
731<script>
732
Marc Kupietzc4d62f82016-03-01 11:04:24 +0100733var opt = {epsilon: <%= $epsilon %>, perplexity: <%= $perplexity %>},
Marc Kupietz9fca1732016-02-29 09:07:04 +0100734 mapWidth = 800, // width map
735 mapHeight = 800,
736 jitterRadius = 7;
737
Marc Kupietzc4893362016-02-25 08:04:46 +0100738var T = new tsnejs.tSNE(opt); // create a tSNE instance
739
740var Y;
741
742var data;
Marc Kupietzc5990da2016-02-26 08:47:12 +0100743var labeler;
Marc Kupietzc4893362016-02-25 08:04:46 +0100744
Marc Kupietzc5990da2016-02-26 08:47:12 +0100745
/**
 * Move every '.tsnet' group to the position currently stored in `labels`
 * and write that position back into T.Y, converted from screen pixels to
 * embedding coordinates (the inverse of the mapping in updateEmbedding()).
 */
function applyJitter() {
  // Returns the transform for one label; as a side effect it updates T.Y.
  function placeLabel(label, idx) {
    var point = T.Y[idx];
    point[0] = (label.x - mapWidth/2 - tx)/ss/20;
    point[1] = (label.y - mapHeight/2 - ty)/ss/20;
    return "translate(" + (label.x) + "," + (label.y) + ")";
  }

  var groups = svg.selectAll('.tsnet').data(labels);
  groups.transition()
        .duration(50)
        .attr("transform", placeLabel);
}
759
/**
 * Reposition every '.tsnet' group according to the current t-SNE solution,
 * scaled by 20 and the zoom factor `ss`, shifted by the pan offset (tx, ty)
 * and centred on the map.
 */
function updateEmbedding() {
  var Y = T.getSolution();

  function toTransform(word, idx) {
    var px = (Y[idx][0]*20*ss + tx) + mapWidth/2;
    var py = (Y[idx][1]*20*ss + ty) + mapHeight/2;
    return "translate(" + px + "," + py + ")";
  }

  svg.selectAll('.tsnet')
     .data(data.words)
     .attr("transform", toTransform);
}
769
// Shared drawing state: the <svg> root (set in drawEmbedding), the label and
// anchor arrays handed to the labeler, and the selection of <text> elements
// (the latter three are filled in stopStep).
var svg;
var labels = [];
var anchor_array = [];
var text;

/**
 * (Re)create the SVG map inside #embed: one <g class="tsnet"> per word in
 * data.words, each holding a link to the query page for that word with the
 * word as its text. Words contained in data.target are drawn in red.
 * Finally a zoom/pan behaviour is attached that calls zoomHandler().
 */
function drawEmbedding() {
  $("#embed").empty();
  var div = d3.select("#embed");

  svg = div.append("svg") // svg is global
      .attr("width", mapWidth)
      .attr("height", mapHeight);

  // The fresh <svg> is empty, so the enter selection has one entry per word.
  var g = svg.selectAll(".b")
      .data(data.words)
      .enter().append("g")
      .attr("class", "tsnet");

  g.append("a")
      // Encode the word so that characters such as '&', '#', '+' or spaces
      // do not corrupt the query string.
      .attr("xlink:href", function(word) {
        return "/?word=" + encodeURIComponent(word);
      })
      .attr("title", function(d, i) {
        // The regex inserts thousands separators into the frequency rank.
        var freqRank = data.ranks[i].toString().replace(/\B(?=(\d{3})+(?!\d))/g, ",");
        return "rank: "+i +" "+"freq. rank: "+freqRank;
      })
    .append("text")
      // "top" is not a valid text-anchor value; "start" is the default that
      // browsers fall back to, so rendering is unchanged.
      .attr("text-anchor", "start")
      .attr("font-size", 12)
      .attr("fill", function(d) {
        // data.target is padded with spaces, so this matches whole words only.
        return data.target.indexOf(" "+d+" ") >= 0 ? "red" : "#333";
      })
      .text(function(d) { return d; });

  var zoomListener = d3.behavior.zoom()
      .scaleExtent([0.1, 10])
      .center([0,0])
      .on("zoom", zoomHandler);
  zoomListener(svg);
}
Marc Kupietzc5990da2016-02-26 08:47:12 +0100814
// Current pan offset (tx, ty) and zoom factor (ss) of the map.
var tx = 0,
    ty = 0,
    ss = 1;

// Id of the currently running setInterval() timer; -1 until one is started.
var iter_id = -1;

/**
 * d3 zoom callback: record the new translation and scale from d3.event
 * and redraw the embedding with them.
 */
function zoomHandler() {
  var evt = d3.event,
      shift = evt.translate;

  tx = shift[0];
  ty = shift[1];
  ss = evt.scale;
  updateEmbedding();
}
825
var stepnum = 0;

/**
 * Stop the running timer and switch to the label "jitter" phase:
 * build the label and anchor arrays from the current embedding, measure the
 * rendered label sizes, create the labeler and start a timer that calls
 * jitterStep().
 */
function stopStep() {
  clearInterval(iter_id);
  text = svg.selectAll("text");

  // The jitter function needs a different data and co-ordinate
  // representation. Both arrays are rebuilt from scratch so that labels and
  // anchors stay index-aligned even if this function runs more than once
  // (previously anchor_array was only ever appended to).
  var anchors = [];
  labels = d3.range(data.words.length).map(function(i) {
    var x = (T.Y[i][0]*20*ss + tx) + mapWidth/2;
    var y = (T.Y[i][1]*20*ss + ty) + mapHeight/2;
    anchors.push({x: x, y: y, r: jitterRadius});
    return {
      x: x,
      y: y,
      name: data.words[i]
    };
  });
  anchor_array = anchors;

  // Get the actual label bounding boxes for the jitter function
  // (one getBBox() call per label instead of two).
  text.each(function(d, i) {
    var box = this.getBBox();
    labels[i].width = box.width;
    labels[i].height = box.height;
  });

  labeler = d3.labeler()
      .label(labels)
      .anchor(anchor_array)
      .width(mapWidth)
      .height(mapHeight)
      .update(applyJitter);

  // Re-arm the jitter counter so a repeated run gets its full set of rounds.
  jitter_i = 0;
  iter_id = setInterval(jitterStep, 1);
}
Marc Kupietzc5990da2016-02-26 08:47:12 +0100865
// Number of jitter rounds performed so far.
var jitter_i = 0;

/**
 * Timer callback of the jitter phase: run one labeler round (start2(10)) and
 * apply the result, or stop the timer once the counter has passed 100.
 */
function jitterStep() {
  var round = jitter_i;
  jitter_i += 1;

  if (round > 100) {
    clearInterval(iter_id);
    return;
  }

  labeler.start2(10);
  applyJitter();
}
Marc Kupietzb1029362016-02-27 21:38:55 +0100876
877var last_cost=1000;
878
Marc Kupietz9fca1732016-02-29 09:07:04 +0100879function step() {
880 var i = T.iter;
881
882 if(i > <%= $no_iterations %>) {
883 stopStep();
884 } else {
885 var cost = Math.round(T.step() * 100000) / 100000; // do a few steps
886 $("#cost").html("tsne iteration " + i + ", cost: " + cost.toFixed(5));
887 if(i % 250 == 0 && cost >= last_cost) {
888 stopStep();
889 } else {
890 last_cost = cost;
891 updateEmbedding();
892 }
893 }
894}
Marc Kupietzc5990da2016-02-26 08:47:12 +0100895
Marc Kupietz9fca1732016-02-29 09:07:04 +0100896function showMap(j) {
897 data=j;
898 T.iter=0;
899 T.initDataRaw(data.vecs); // init embedding
900 drawEmbedding(); // draw initial embedding
901
902 if(iter_id >= 0) {
903 clearInterval(iter_id);
904 }
905 //T.debugGrad();
906 iter_id = setInterval(step, 1);
Marc Kupietzd7aea722016-03-02 11:59:12 +0100907 if(<%= $show_som %>) {
908 makeSOM(j, <%= $no_iterations %>);
909 }
Marc Kupietz9fca1732016-02-29 09:07:04 +0100910}
Marc Kupietza350bce2016-02-25 09:34:25 +0100911
Marc Kupietzc4893362016-02-25 08:04:46 +0100912</script>
913</head>
<body>
  %# Query form: target word(s), number of neighbours, iteration limit, SOM
  %# switch and (only when collocators are available) the collocator sort order.
  <form action="<%=url_for('/')->to_abs%>" method="GET">
    word(s):
    <input type="text" name="word" size="20" value="<%= $word %>" title="When looking for multiple words use spaces as separators to search around the average vector and | as separator to get the neighbours for each word.">
    max. neighbours: <input type="text" size="8" name="n" value="<%= $no_nbs %>">
    max. iterations: <input type="text" name="N" size="8" value="<%= $no_iterations %>">
    SOM <input type="checkbox" name="som" value="1" <%= ($show_som ? "checked" : "") %>>
    % if($collocators) {
    <span> </span>sort collocators by
    <select name="sort">
      <option value="0" <%= ($sort!=1? "selected":"") %>>responsiveness</option>
      <option value="1" <%= ($sort==1? "selected":"") %>>mean p</option>
    </select>
    % }
    <span> </span><input type="submit" value="Show">
  </form>
  <br>
  %# The wrapper is opened unconditionally because it is closed unconditionally
  %# further down; previously it was opened only when $lists was set.
  <div id="wrapper">
  % if($lists) {
    <table id="first">
      <tr>
        <th align="right">#</th><th align="right">cos</th><th align="left">paradigmatic</th>
        % if($collocators) {
        <th title="Position in window around target word. Absolute value can be too low because of sub-sampling frequent words.">@</th>
        <th align="right" title="&#34;Responsiveness&#34; of the collocator at the relative position @. Approximation of the probability that the combination of the target word and the collocator at the relative position @ come from the corpus.">resp.</th>
        <th title="Probability of the collocator at window location @." align="right">p(c<sub><small>@</small></sub>)</th>
        <th align="right">p(c<sub><small>@</small></sub>)/|w|</th>
        <th align="left">syntagmatic</th>
        % }
      </tr>
      %# One table row per index: neighbour $i of the current list on the left,
      %# collocator $i on the right. @words/@vecs/@ranks collect each distinct
      %# neighbour once; they are handed to showMap() below.
      % my @words; my @vecs; my @ranks; for my $list (@$lists) {
      %   my $i=0; while($list) {
      %     my $item = (@$list)[$i];
      %     my $c = ($collocators? (@$collocators)[$i] : 0);
      %     last if(!$c && !$item);
      <tr>
        <td align="right">
          <%= ++$i %>.
        </td>
        % if($item) {
        %   if(!grep{$_ eq $item->{word}} @words) {
        %     push @vecs, $item->{vector};
        %     push @words, $item->{word};
        %     push @ranks, $item->{rank};
        %   }
        <td align="right">
          <%= sprintf("%.3f", $item->{dist}) %>
        </td>
        <td>
          %# url_for(...)->query(...) percent-encodes the word for the link.
          <a title="freq. rank: <%= $item->{rank} %>" href="<%= url_for('/')->query(word => $item->{word}) %>">
            <%= $item->{word} %>
          </a>
        </td>
        % } else {
        <td colspan="2"></td>
        % }
        % if($c) {
        <td align="right">
          <%= $c->{pos} %>:
        </td>
        <td align="right">
          <%= sprintf("%.3f", $c->{dist}) %>
        </td>
        <td align="right">
          <%= sprintf("%.3e", $c->{norm}) %>
        </td>
        <td align="right">
          <%= sprintf("%.3e", $c->{sum}) %>
        </td>
        <td align="left">
          <a href="<%= url_for('/')->query(word => $c->{word}) %>">
            <%= $c->{word} %>
          </a>
        </td>
        % } elsif($collocators) {
        %# Pad the collocator columns only when the header actually has them.
        <td colspan="5"></td>
        % }
      </tr>
      %   }
      % }
    </table>
    <script>
      % use Mojo::ByteStream 'b';
      $(window).load(function() {
        showMap(<%= b(Mojo::JSON::to_json({target => " $word ", words => \@words, vecs => \@vecs, ranks => \@ranks})); %>);
      });
    </script>
  % }
    <div id="second" style="width:800px; height:800px; font-family: arial;">
      <div id="embed">
      </div>
    </div>
    <div id="cost"></div>
    % if($show_som) {
    <div id="som2">
    </div>
    <div id="sominfo1"><span id="somcolor1"> </span> <span id="somword1"> </span> <span id="somcolor2"> </span> <span id="somword2"> </span> <span id="somcolor3"> </span></div>
    <div id="sominfo">SOM iteration <span id="iterations">0</span></div>
    % }
  </div>
  % if($training_args) {
  %# <pre> may not be nested inside <p>, so it follows the paragraph instead.
  <p>
    Word vector model trained with <a href="https://code.google.com/p/word2vec/">word2vec</a> using the following parameters:
  </p>
  <pre><%= $training_args %></pre>
  % }
</body>
Marc Kupietzdc22b982015-10-09 09:19:34 +02001015</html>
1016