blob: 6ed83a8ed47bd35dc7cbc2d3d0e55e6cb8db7758 [file] [log] [blame]
#!/usr/local/bin/perl
# %Config has to be imported before it is interpolated into the Inline
# configuration below. Without this import $Config{ccflags} is empty and the
# ccflags handed to Inline consist of " -O4" only.
use Config;
use Inline C;
use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -O4";
#use Inline C => Config => BUILD_NOISY => 1, CFLAGS => $Config{cflags}." -O4 -mtune k9";
#use Inline C => Config => CLEAN_AFTER_BUILD => 0, ccflags => $Config{ccflags}." -Ofast -march k8 -mtune k8 ";
use Mojolicious::Lite;
use Mojo::JSON qw(decode_json encode_json to_json);
use Encode qw(decode encode);
use Getopt::Std;
use Mojo::Server::Daemon;
plugin 'Log::Access';
Marc Kupietzdc22b982015-10-09 09:19:34 +020012
# Command line options, filled in by getopts() below:
#   -i        passed to init_net() as its "latin" flag; the request handler
#             then encodes query words as iso-8859-1
#   -l ADDR   address to listen on (default: '*')
#   -p PORT   port to listen on
#   -m FILE   vector file handed to mergeVectors()
#   -M FILE   file with whitespace separated words to put into %marked
#   -n FILE   net file handed to init_net()
#   -d ARG    handed to dump_vecs(); the program exits afterwards
#   -G        run filter_garbage() before serving requests
our $opt_i = 0; # latin1-input?
our $opt_l = undef;
our $opt_p = 5676;
our $opt_m;
our $opt_M;
our $opt_n = '';
our $opt_d;
our $opt_G;

my %marked;             # words listed in the -M file; passed to the template
my $training_args="";   # first line of "<vector file>.args", if present
my $mergedEnd=0;        # return value of mergeVectors(), 0 without -m

getopts('d:Gil:p:m:n:M:');

if($opt_M) {
  open my $handle, '<:encoding(UTF-8)', $opt_M
    or die "Can't open '$opt_M' for reading: $!";
  while(my $line = <$handle>) {
    # split ' ' ignores leading whitespace, whereas split /\s+/ would produce
    # an empty first field for indented lines and thereby mark the word ''.
    $marked{$_} = 1 for split ' ', $line;
  }
  close($handle);
}
Marc Kupietza5b90152016-03-15 17:39:19 +010038
# -cbow 1 -size 200 -window 8 -negative 25 -hs 0 -sample 1e-4 -threads 40 -binary 1 -iter 15
# Load the vector file given as first argument (default: vectors15.bin).
# init_net() returns a non-zero value when it could not load the model; going
# on in that case would leave the C side without vectors, so stop right here.
if(!$ARGV[0]) {
  init_net("vectors15.bin", $opt_n, ($opt_i? 1 : 0)) == 0
    or die "Cannot initialise vectors from 'vectors15.bin'\n";
} else {
  init_net($ARGV[0], $opt_n, ($opt_i? 1 : 0)) == 0
    or die "Cannot initialise vectors from '$ARGV[0]'\n";
  # The optional "<vector file>.args" file holds the training arguments in its
  # first line. Only close the handle if it was actually opened.
  if(open(my $args_fh, '<', "$ARGV[0].args")) {
    my $first_line = <$args_fh>;
    $training_args = $first_line if defined $first_line;
    close($args_fh);
  }
}
Marc Kupietzdc22b982015-10-09 09:19:34 +020049
# -m: merge a second vector file into the loaded one and remember the value
# returned by mergeVectors() for the template.
$mergedEnd = mergeVectors($opt_m) if $opt_m;

# -d: dump vecs and exit
if($opt_d) {
  dump_vecs($opt_d);
  exit;
}

# Listen on the address given with -l (all interfaces by default) and the
# port given with -p.
my $daemon = Mojo::Server::Daemon->new(
  app    => app,
  listen => ['http://' . ($opt_l || '*') . ":$opt_p"]
);

# -G: flag garbage vocabulary entries before the first request is served.
if($opt_G) {
  print "Filtering garbage\n";
  filter_garbage();
}
69
# GET /
# Query parameters:
#   word        one or more queries separated by '|'
#   n           number of neighbours to fetch (default 100)
#   N, perplexity, epsilon, som
#               passed through to the "index" template unchanged
#   sbf         passed to get_neighbours() as its search_backw argument
#   sort        passed to get_neighbours() as its sort_by argument
#   json        if true, answer with JSON instead of rendering the template
# For every query the "paradigmatic" list returned by get_neighbours() is
# collected in @lists; the collocators are those of the last query only.
get '/' => sub {
  my $c = shift;
  $c->app->log->info("get: ".$c->req->url->to_abs);
  my $word=$c->param('word');
  my $no_nbs=$c->param('n') || 100;
  my $no_iterations=$c->param('N') || 2000;
  my $perplexity=$c->param('perplexity') || 20;
  my $epsilon=$c->param('epsilon') || 5;
  my $som=$c->param('som') || 0;
  my $searchBaseVocabFirst=$c->param('sbf') || 0;
  my $sort=$c->param('sort') || 0;
  my $json=$c->param('json') || 0;
  my $res;
  my @lists;

  # n comes straight from the request and is used as an array length in the C
  # code. Only accept a positive integer that fits into a C int (at most nine
  # digits); anything else falls back to the default.
  $no_nbs = 100 unless $no_nbs =~ /\A[0-9]{1,9}\z/ && $no_nbs > 0;

  if(defined($word) && $word !~ /^\s*$/) {
    $c->inactivity_timeout(300);
    $word =~ s/\s+/ /g;
    for my $w (split(' *\| *', $word)) {
      $c->app->log->info('Looking for neighbours of '.$w);
      if($opt_i) {
        $res = get_neighbours(encode("iso-8859-1", $w), $no_nbs, $sort, $searchBaseVocabFirst);
      } else {
        $res = get_neighbours($w, $no_nbs, $sort, $searchBaseVocabFirst);
      }
      push(@lists, $res->{paradigmatic});
    }
  }
  # Without a word parameter $word is undef; do not run the substitution then.
  $word =~ s/ *\| */ | /g if defined $word;
  my $collocators = ref $res ? $res->{syntagmatic} : undef;
  if($json) {
    return $c->render(json => {word => $word, list => \@lists, collocators=>$collocators});
  } else {
    $c->render(template=>"index", word=>$word, no_nbs=>$no_nbs, no_iterations => $no_iterations, epsilon=> $epsilon, perplexity=> $perplexity, show_som=>$som, searchBaseVocabFirst=>$searchBaseVocabFirst, sort=>$sort, training_args=>$training_args, mergedEnd=> $mergedEnd, marked=>\%marked, lists=> \@lists, collocators=> $collocators);
  }
};

$daemon->run; # app->start;

exit;
109
110__END__
111
112__C__
113#include <stdio.h>
114#include <string.h>
115#include <math.h>
116#include <malloc.h>
117#include <stdlib.h> //strlen
Marc Kupietzf0809762016-02-26 10:13:47 +0100118#include <sys/mman.h>
Marc Kupietz000ad862016-02-26 14:59:12 +0100119#include <pthread.h>
Marc Kupietzdc22b982015-10-09 09:19:34 +0200120
121#define max_size 2000
122#define max_w 50
Marc Kupietz7bc85fd2016-02-24 11:42:41 +0100123#define MAX_NEIGHBOURS 1000
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100124#define MAX_WORDS -1
Marc Kupietz000ad862016-02-26 14:59:12 +0100125#define MAX_THREADS 100
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100126#define MAX_CC 50
127#define EXP_TABLE_SIZE 1000
128#define MAX_EXP 6
Marc Kupietz271e2a42016-03-22 11:37:43 +0100129#define MIN_RESP 0.50
Marc Kupietzdc22b982015-10-09 09:19:34 +0200130
131//the thread function
132void *connection_handler(void *);
Marc Kupietz000ad862016-02-26 14:59:12 +0100133
134typedef struct {
135 long long *index;
136 float *dist;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100137 float *norm;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100138 long long *pos;
Marc Kupietz80abb442016-03-23 21:04:08 +0100139 int length;
Marc Kupietz000ad862016-02-26 14:59:12 +0100140} knn;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100141
Marc Kupietz000ad862016-02-26 14:59:12 +0100142typedef struct {
Marc Kupietz48c29682016-03-19 11:30:43 +0100143 long long wordi[MAX_NEIGHBOURS];
144 char sep[MAX_NEIGHBOURS];
145 int length;
146} wordlist;
147
148typedef struct {
149 wordlist *wl;
Marc Kupietz000ad862016-02-26 14:59:12 +0100150 char *token;
151 int N;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100152 long from;
Marc Kupietz000ad862016-02-26 14:59:12 +0100153 unsigned long upto;
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100154 float *target_sums;
Marc Kupietz000ad862016-02-26 14:59:12 +0100155} knnpars;
156
/* Model state shared by all functions below. */
float *M;                 /* word vectors: words rows of size floats, length-normalised on conversion */
float *M2 = 0L;           /* mapping of the net file given to init_net, NULL if none */
float *syn1neg_window;    /* M2 + words * size */
float *expTable;          /* table of exp(x) / (exp(x) + 1), filled by init_net */
char *vocab;              /* words slots of max_w bytes each, zero terminated */
char *garbage = NULL;     /* one flag per word, set by filter_garbage; NULL if unused */

long long words;          /* number of vocabulary entries */
long long size;           /* number of floats per vector */
long long merged_end;     /* set by mergeVectors to the number of merged-in words */
long long merge_words = 0;
int num_threads=20;
int latin_enc=0;          /* copy of the latin argument of init_net */
int window;               /* window size derived from the size of the net file */
Marc Kupietzdc22b982015-10-09 09:19:34 +0200166
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100167int init_net(char *file_name, char *net_name, int latin) {
Marc Kupietz67c20282016-02-26 09:42:00 +0100168 FILE *f, *binvecs, *binwords;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100169 int binwords_fd, binvecs_fd, net_fd, i;
Marc Kupietz82b02672016-02-26 12:32:25 +0100170 long long a, b, c, d, cn;
171 float len;
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200172 double val;
Marc Kupietz82b02672016-02-26 12:32:25 +0100173
Marc Kupietz67c20282016-02-26 09:42:00 +0100174 char binvecs_fname[256], binwords_fname[256];
175 strcpy(binwords_fname, file_name);
176 strcat(binwords_fname, ".words");
177 strcpy(binvecs_fname, file_name);
178 strcat(binvecs_fname, ".vecs");
Marc Kupietzdc22b982015-10-09 09:19:34 +0200179
Marc Kupietza5b90152016-03-15 17:39:19 +0100180 latin_enc = latin;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200181 f = fopen(file_name, "rb");
182 if (f == NULL) {
183 printf("Input file %s not found\n", file_name);
184 return -1;
185 }
186 fscanf(f, "%lld", &words);
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100187 if(MAX_WORDS > 0 && words > MAX_WORDS) words = MAX_WORDS;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200188 fscanf(f, "%lld", &size);
Marc Kupietz2cb667e2016-03-10 09:44:12 +0100189 if( (binvecs_fd = open(binvecs_fname, O_RDONLY)) < 0 || (binwords_fd = open(binwords_fname, O_RDONLY)) < 0) {
190 printf("Converting %s to memory mappable structures\n", file_name);
Marc Kupietzf0809762016-02-26 10:13:47 +0100191 vocab = (char *)malloc((long long)words * max_w * sizeof(char));
192 M = (float *)malloc((long long)words * (long long)size * sizeof(float));
193 if (M == NULL) {
194 printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
195 return -1;
196 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200197 if(strstr(file_name, ".txt")) {
198 for (b = 0; b < words; b++) {
199 a = 0;
200 while (1) {
201 vocab[b * max_w + a] = fgetc(f);
202 if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
203 if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
204 }
205 vocab[b * max_w + a] = 0;
206 len = 0;
207 for (a = 0; a < size; a++) {
208 fscanf(f, "%lf", &val);
209 M[a + b * size] = val;
210 len += val * val;
211 }
212 len = sqrt(len);
213 for (a = 0; a < size; a++) M[a + b * size] /= len;
214 }
215 } else {
216 for (b = 0; b < words; b++) {
217 a = 0;
218 while (1) {
219 vocab[b * max_w + a] = fgetc(f);
220 if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
221 if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
222 }
223 vocab[b * max_w + a] = 0;
224 fread(&M[b * size], sizeof(float), size, f);
225 len = 0;
226 for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
227 len = sqrt(len);
228 for (a = 0; a < size; a++) M[a + b * size] /= len;
229 }
230 }
Marc Kupietz67c20282016-02-26 09:42:00 +0100231 if( (binvecs = fopen(binvecs_fname, "wb")) != NULL && (binwords = fopen(binwords_fname, "wb")) != NULL) {
232 fwrite(M, sizeof(float), (long long)words * (long long)size, binvecs);
233 fclose(binvecs);
234 fwrite(vocab, sizeof(char), (long long)words * max_w, binwords);
235 fclose(binwords);
236 }
Marc Kupietz2cb667e2016-03-10 09:44:12 +0100237 }
238 if( (binvecs_fd = open(binvecs_fname, O_RDONLY)) >= 0 && (binwords_fd = open(binwords_fname, O_RDONLY)) >= 0) {
239 M = mmap(0, sizeof(float) * (long long)words * (long long)size, PROT_READ, MAP_SHARED, binvecs_fd, 0);
240 vocab = mmap(0, sizeof(char) * (long long)words * max_w, PROT_READ, MAP_SHARED, binwords_fd, 0);
241 if (M == MAP_FAILED || vocab == MAP_FAILED) {
242 close(binvecs_fd);
243 close(binwords_fd);
244 fprintf(stderr, "Cannot mmap %s or %s\n", binwords_fname, binvecs_fname);
245 exit(-1);
246 }
247 } else {
248 fprintf(stderr, "Cannot open %s or %s\n", binwords_fname, binvecs_fname);
249 exit(-1);
Marc Kupietz67c20282016-02-26 09:42:00 +0100250 }
Marc Kupietzdc22b982015-10-09 09:19:34 +0200251 fclose(f);
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100252
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200253 if(net_name && strlen(net_name) > 0) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100254 if( (net_fd = open(net_name, O_RDONLY)) >= 0) {
255 window = (lseek(net_fd, 0, SEEK_END) - sizeof(float) * words * size) / words / size / sizeof(float) / 2;
256 // lseek(net_fd, sizeof(float) * words * size, SEEK_SET);
Marc Kupietz10bec2b2016-03-23 09:41:31 +0100257 // munmap(M, sizeof(float) * words * size);
258 M2 = mmap(0, sizeof(float) * words * size + sizeof(float) * 2 * window * size * words, PROT_READ, MAP_SHARED, net_fd, 0);
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200259 if (M2 == MAP_FAILED) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100260 close(net_fd);
261 fprintf(stderr, "Cannot mmap %s\n", net_name);
262 exit(-1);
263 }
Marc Kupietz10bec2b2016-03-23 09:41:31 +0100264 syn1neg_window = M2 + words * size;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100265 } else {
266 fprintf(stderr, "Cannot open %s\n", net_name);
267 exit(-1);
268 }
269 fprintf(stderr, "Successfully memmaped %s. Determined window size: %d\n", net_name, window);
270 }
271
272 expTable = (float *) malloc((EXP_TABLE_SIZE + 1) * sizeof(float));
273 for (i = 0; i < EXP_TABLE_SIZE; i++) {
274 expTable[i] = exp((i / (float) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
275 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
276 }
Marc Kupietzdc22b982015-10-09 09:19:34 +0200277 return 0;
278}
279
Marc Kupietza2e64502016-04-27 09:53:51 +0200280long mergeVectors(char *file_name){
281 FILE *f, *binvecs, *binwords;
282 int binwords_fd, binvecs_fd, net_fd, i;
283 long long a, b, c, d, cn;
284 float len;
285 float *merge_vecs;
286 char *merge_vocab;
Marc Kupietza5f60042017-05-04 10:38:12 +0200287 /* long long merge_words, merge_size; */
288 long long merge_size;
Marc Kupietza2e64502016-04-27 09:53:51 +0200289
290 char binvecs_fname[256], binwords_fname[256];
291 strcpy(binwords_fname, file_name);
292 strcat(binwords_fname, ".words");
293 strcpy(binvecs_fname, file_name);
294 strcat(binvecs_fname, ".vecs");
295
296 f = fopen(file_name, "rb");
297 if (f == NULL) {
298 printf("Input file %s not found\n", file_name);
299 exit -1;
300 }
301 fscanf(f, "%lld", &merge_words);
302 fscanf(f, "%lld", &merge_size);
303 if(merge_size != size){
304 fprintf(stderr, "vectors must have the same length\n");
305 exit(-1);
306 }
307 if( (binvecs_fd = open(binvecs_fname, O_RDONLY)) >= 0 && (binwords_fd = open(binwords_fname, O_RDONLY)) >= 0) {
308 merge_vecs = malloc(sizeof(float) * (words + merge_words) * size);
309 merge_vocab = malloc(sizeof(char) * (words + merge_words) * max_w);
310 if (merge_vecs == NULL || merge_vocab == NULL) {
311 close(binvecs_fd);
312 close(binwords_fd);
313 fprintf(stderr, "Cannot reserve memory for %s or %s\n", binwords_fname, binvecs_fname);
314 exit(-1);
315 }
316 read(binvecs_fd, merge_vecs, merge_words * size * sizeof(float));
317 read(binwords_fd, merge_vocab, merge_words * max_w);
318 } else {
319 fprintf(stderr, "Cannot open %s or %s\n", binwords_fname, binvecs_fname);
320 exit(-1);
321 }
322 printf("Successfully reallocated memory\nMerging...\n");
323 fflush(stdout);
324 memcpy(merge_vecs + merge_words * size, M, words * size * sizeof(float));
325 memcpy(merge_vocab + merge_words * max_w, vocab, words * max_w);
326 munmap(M, words * size * sizeof(float));
327 munmap(vocab, words * max_w);
328 M = merge_vecs;
329 vocab = merge_vocab;
330 merged_end = merge_words;
331 words += merge_words;
332 fclose(f);
333 printf("merged_end: %lld, words: %lld\n", merged_end, words);
334 return((long) merged_end);
335}
336
Marc Kupietz5c3887d2016-04-28 08:53:35 +0200337void filter_garbage() {
338 long i;
Marc Kupietzab591a82016-04-28 14:08:49 +0200339 unsigned char *w, previous, c;
Marc Kupietz5c3887d2016-04-28 08:53:35 +0200340 garbage = malloc(words);
341 memset(garbage, 0, words);
342 for (i = 0; i < words; i++) {
343 w = vocab + i * max_w;
344 previous = 0;
Marc Kupietzab591a82016-04-28 14:08:49 +0200345 while((c = *w++) && !garbage[i]) {
346 if( ((c <= 90 && c >= 65) && (previous >= 97 && previous <= 122)) ||
347 (previous == '-' && (c & 32)) ||
348 (previous == 0xc2 && (c == 0xa4 || c == 0xb6 ))
349 ) {
Marc Kupietz5c3887d2016-04-28 08:53:35 +0200350 garbage[i]=1;
351 continue;
352 }
353 previous = c;
354 }
355 }
356 return;
357}
358
Marc Kupietz271e2a42016-03-22 11:37:43 +0100359void *getCollocators(knnpars *pars) {
360 int N = pars->N;
361 int cc = pars->wl->wordi[0];
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100362 knn *nbs = NULL;
363 long window_layer_size = size * window * 2;
364 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
365 float f, max_f, maxmax_f;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100366 float *target_sums, *bestf, *bestn, worstbest, wpos_sum;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100367 long long *besti, *bestp;
Marc Kupietzd5642582016-03-19 22:23:13 +0100368
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200369 if(M2 == NULL || cc == -1)
Marc Kupietzd5642582016-03-19 22:23:13 +0100370 return NULL;
371
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100372 a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
373 besti = malloc(N * sizeof(long long));
374 bestp = malloc(N * sizeof(long long));
375 bestf = malloc(N * sizeof(float));
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100376 bestn = malloc(N * sizeof(float));
377
Marc Kupietz271e2a42016-03-22 11:37:43 +0100378 worstbest = MIN_RESP;
379
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100380 for (b = 0; b < words; b++)
381 target_sums[b]=0;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100382 for (b = 0; b < N; b++) {
Marc Kupietz271e2a42016-03-22 11:37:43 +0100383 besti[b] = -1;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100384 bestn[b] = 1;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100385 bestf[b] = worstbest;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100386 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100387
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100388 d = cc;
389 maxmax_f = -1;
390 maxmax_target = 0;
391
Marc Kupietz271e2a42016-03-22 11:37:43 +0100392 for (a = pars->from; a < pars->upto; a++) {
393 if(a >= window)
394 a++;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100395 wpos_sum = 0;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100396 printf("window pos: %ld\n", a);
397 if (a != window) {
398 max_f = -1;
399 window_offset = a * size;
400 if (a > window)
401 window_offset -= size;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100402 for(target = 0; target < words; target ++) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100403 if(target == d)
404 continue;
405 f = 0;
406 for (c = 0; c < size; c++)
Marc Kupietz10bec2b2016-03-23 09:41:31 +0100407 f += M2[d* size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100408 if (f < -MAX_EXP)
409 continue;
410 else if (f > MAX_EXP)
411 continue;
412 else
413 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100414 wpos_sum += f;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100415
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100416 target_sums[target] += f;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100417 if(f > worstbest) {
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100418 for (b = 0; b < N; b++) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100419 if (f > bestf[b]) {
Marc Kupietz33679a32016-03-22 08:49:39 +0100420 memmove(bestf + b + 1, bestf + b, (N - b -1) * sizeof(float));
421 memmove(besti + b + 1, besti + b, (N - b -1) * sizeof(long long));
422 memmove(bestp + b + 1, bestp + b, (N - b -1) * sizeof(long long));
423 bestf[b] = f;
424 besti[b] = target;
425 bestp[b] = window-a;
426 break;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100427 }
428 }
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100429 if(b == N - 1)
430 worstbest = bestf[N-1];
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100431 }
432 }
433 printf("%d %.2f\n", max_target, max_f);
434 printf("%s (%.2f) ", &vocab[max_target * max_w], max_f);
435 if(max_f > maxmax_f) {
436 maxmax_f = max_f;
437 maxmax_target = max_target;
438 }
Marc Kupietz33679a32016-03-22 08:49:39 +0100439 for (b = 0; b < N; b++)
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100440 if(bestp[b] == window-a)
441 bestn[b] = bestf[b] / wpos_sum;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100442 } else {
443 printf("\x1b[1m%s\x1b[0m ", &vocab[d*max_w]);
444 }
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100445
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100446 }
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100447 for (b = 0; b < words; b++)
448 pars->target_sums[b] += (target_sums[b] / wpos_sum ) / (window * 2);
449 free(target_sums);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100450 for(b=0; b<N && besti[b] >= 0; b++) // THIS LOOP IS NEEDED (b...)
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100451 printf("%s %.2f %d * ", &vocab[besti[b]*max_w], bestf[b], bestp[b]);
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100452 printf("\n");
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100453 nbs = malloc(sizeof(knn));
454 nbs->index = besti;
455 nbs->dist = bestf;
Marc Kupietzb864ccf2016-03-21 22:40:03 +0100456 nbs->norm = bestn;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100457 nbs->pos = bestp;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100458 nbs->length = b-1;
459 pthread_exit(nbs);
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100460}
461
Marc Kupietza2e64502016-04-27 09:53:51 +0200462wordlist *getTargetWords(char *st1, int search_backw) {
Marc Kupietz48c29682016-03-19 11:30:43 +0100463 wordlist *wl = malloc(sizeof(wordlist));
464 char st[100][max_size], sep[100];
465 long a, b=0, c=0, cn=0;
Marc Kupietza2e64502016-04-27 09:53:51 +0200466 int unmerged;
467
Marc Kupietzdc22b982015-10-09 09:19:34 +0200468 while (1) {
469 st[cn][b] = st1[c];
470 b++;
471 c++;
472 st[cn][b] = 0;
473 if (st1[c] == 0) break;
Marc Kupietz95aa1c02016-03-15 09:40:43 +0100474 if (st1[c] == ' ' || st1[c] == '-') {
475 sep[cn++] = st1[c];
Marc Kupietzdc22b982015-10-09 09:19:34 +0200476 b = 0;
477 c++;
478 }
479 }
480 cn++;
481 for (a = 0; a < cn; a++) {
Marc Kupietza2e64502016-04-27 09:53:51 +0200482 if(search_backw) {
483 for (b = words - 1; b >= 0; b--) if (!strcmp(&vocab[b * max_w], st[a])) break;
484 } else {
485 for (b = 0; b < words; b++) if (!strcmp(&vocab[b * max_w], st[a])) break;
486 }
Marc Kupietz34a3ee92016-02-27 22:43:16 +0100487 if (b == words) b = -1;
Marc Kupietz48c29682016-03-19 11:30:43 +0100488 wl->wordi[a] = b;
489 fprintf(stderr, "Word: \"%s\" Position in vocabulary: %lld\n", st[a], wl->wordi[a]);
Marc Kupietzdc22b982015-10-09 09:19:34 +0200490 if (b == -1) {
Marc Kupietze8da3062016-02-25 08:37:53 +0100491 fprintf(stderr, "Out of dictionary word!\n");
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100492 cn--;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200493 break;
494 }
495 }
Marc Kupietz48c29682016-03-19 11:30:43 +0100496 wl->length=cn;
497 return(wl);
498}
499
500void *_get_neighbours(knnpars *pars) {
501 char *st1 = pars->token;
502 int N = pars->N;
503 long from = pars -> from;
504 unsigned long upto = pars -> upto;
505 char file_name[max_size], st[100][max_size], *sep;
506 float dist, len, *bestd, vec[max_size];
507 long long a, b, c, d, cn, *bi, *besti;
508 char ch;
509 knn *nbs = NULL;
510 wordlist *wl = pars->wl;
511
512 besti = malloc(N * sizeof(long long));
513 bestd = malloc(N * sizeof(float));
514
515 float worstbest=-1;
516
517 for (a = 0; a < N; a++) bestd[a] = 0;
518 a = 0;
519 bi = wl->wordi;
520 cn = wl->length;
521 sep = wl->sep;
522 b = bi[0];
523 c = 0;
Marc Kupietz000ad862016-02-26 14:59:12 +0100524 if (b == -1) {
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100525 N = 0;
526 goto end;
527 }
Marc Kupietzdc22b982015-10-09 09:19:34 +0200528 for (a = 0; a < size; a++) vec[a] = 0;
529 for (b = 0; b < cn; b++) {
530 if (bi[b] == -1) continue;
Marc Kupietz95aa1c02016-03-15 09:40:43 +0100531 if(b>0 && sep[b-1] == '-')
532 for (a = 0; a < size; a++) vec[a] -= M[a + bi[b] * size];
533 else
534 for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
Marc Kupietzdc22b982015-10-09 09:19:34 +0200535 }
536 len = 0;
537 for (a = 0; a < size; a++) len += vec[a] * vec[a];
538 len = sqrt(len);
539 for (a = 0; a < size; a++) vec[a] /= len;
540 for (a = 0; a < N; a++) bestd[a] = -1;
Marc Kupietz000ad862016-02-26 14:59:12 +0100541 for (c = from; c < upto; c++) {
Marc Kupietz5c3887d2016-04-28 08:53:35 +0200542 if(garbage && garbage[c]) continue;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200543 a = 0;
Marc Kupietz34020dc2016-02-25 08:44:19 +0100544// do not skip taget word
Marc Kupietze8da3062016-02-25 08:37:53 +0100545// for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
546// if (a == 1) continue;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200547 dist = 0;
548 for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
Marc Kupietzbe1b9fc2016-02-26 10:34:30 +0100549 if(dist > worstbest) {
550 for (a = 0; a < N; a++) {
551 if (dist > bestd[a]) {
Marc Kupietz33679a32016-03-22 08:49:39 +0100552 memmove(bestd + a + 1, bestd + a, (N - a -1) * sizeof(float));
553 memmove(besti + a + 1, besti + a, (N - a -1) * sizeof(long long));
Marc Kupietzbe1b9fc2016-02-26 10:34:30 +0100554 bestd[a] = dist;
555 besti[a] = c;
556 break;
Marc Kupietzdc22b982015-10-09 09:19:34 +0200557 }
Marc Kupietzdc22b982015-10-09 09:19:34 +0200558 }
Marc Kupietzbe1b9fc2016-02-26 10:34:30 +0100559 worstbest = bestd[N-1];
Marc Kupietzdc22b982015-10-09 09:19:34 +0200560 }
561 }
Marc Kupietz34020dc2016-02-25 08:44:19 +0100562
Marc Kupietz000ad862016-02-26 14:59:12 +0100563 nbs = malloc(sizeof(knn));
564 nbs->index = besti;
565 nbs->dist = bestd;
566 nbs->length = N;
Marc Kupietz44bee3c2016-02-25 16:26:29 +0100567end:
Marc Kupietz000ad862016-02-26 14:59:12 +0100568 pthread_exit(nbs);
Marc Kupietzdc22b982015-10-09 09:19:34 +0200569}
570
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100571
Marc Kupietza2e64502016-04-27 09:53:51 +0200572SV *get_neighbours(char *st1, int N, int sort_by, int search_backw) {
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100573 HV *result = newHV();
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100574 float *target_sums, bestd[MAX_NEIGHBOURS], bestn[MAX_NEIGHBOURS], bests[MAX_NEIGHBOURS], vec[max_size];
Marc Kupietza5f60042017-05-04 10:38:12 +0200575 long long old_words;
Marc Kupietz50485ba2016-03-23 09:13:14 +0100576 long besti[MAX_NEIGHBOURS], bestp[MAX_NEIGHBOURS], a, b, c, d, slice;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100577 knn *para_nbs[MAX_THREADS];
578 knn *syn_nbs[MAX_THREADS];
Marc Kupietz000ad862016-02-26 14:59:12 +0100579 knnpars pars[MAX_THREADS];
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100580 pthread_t *pt = (pthread_t *)malloc((num_threads+1) * sizeof(pthread_t));
Marc Kupietz48c29682016-03-19 11:30:43 +0100581 wordlist *wl;
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200582 int syn_threads = (M2? window * 2 : 0);
583 int para_threads = num_threads - syn_threads;
Marc Kupietz48c29682016-03-19 11:30:43 +0100584
Marc Kupietz000ad862016-02-26 14:59:12 +0100585 if(N>MAX_NEIGHBOURS) N=MAX_NEIGHBOURS;
586
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200587
Marc Kupietza2e64502016-04-27 09:53:51 +0200588 wl = getTargetWords(st1, search_backw);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100589 if(wl->length < 1)
590 goto end;
Marc Kupietz48c29682016-03-19 11:30:43 +0100591
Marc Kupietza5f60042017-05-04 10:38:12 +0200592 old_words = words;
593 if(merge_words > 0)
594 words = merge_words * 1.25; /* HACK */
595 slice = words / para_threads;
596
Marc Kupietzce3d4c62016-03-23 16:11:25 +0100597 a = posix_memalign((void **) &target_sums, 128, words * sizeof(float));
598 for(a = 0; a < words; a++)
599 target_sums[a] = 0;
600
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200601 printf("Starting %d threads\n", para_threads);
602 fflush(stdout);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100603 for(a=0; a < para_threads; a++) {
Marc Kupietz000ad862016-02-26 14:59:12 +0100604 pars[a].token = st1;
Marc Kupietz48c29682016-03-19 11:30:43 +0100605 pars[a].wl = wl;
Marc Kupietz000ad862016-02-26 14:59:12 +0100606 pars[a].N = N;
607 pars[a].from = a*slice;
608 pars[a].upto = ((a+1)*slice > words? words:(a+1)*slice);
609 pthread_create(&pt[a], NULL, _get_neighbours, (void *) &pars[a]);
610 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200611 if(M2) {
612 for(a=0; a < syn_threads; a++) {
613 pars[a + para_threads].target_sums = target_sums;
614 pars[a + para_threads].wl = wl;
615 pars[a + para_threads].N = N;
616 pars[a + para_threads].from = a;
617 pars[a + para_threads].upto = a+1;
618 pthread_create(&pt[a + para_threads], NULL, getCollocators, (void *) &pars[a + para_threads]);
619 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100620 }
621 printf("Waiting for para threads to join\n");
622 fflush(stdout);
623 for (a = 0; a < para_threads; a++) pthread_join(pt[a], &para_nbs[a]);
624 printf("Para threads joint\n");
625 fflush(stdout);
Marc Kupietz000ad862016-02-26 14:59:12 +0100626
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200627 /* if(!syn_nbs[0]) */
628 /* goto end; */
Marc Kupietz000ad862016-02-26 14:59:12 +0100629
630 for(b=0; b < N; b++) {
Marc Kupietz271e2a42016-03-22 11:37:43 +0100631 besti[b] = para_nbs[0]->index[b];
632 bestd[b] = para_nbs[0]->dist[b];
Marc Kupietz000ad862016-02-26 14:59:12 +0100633 }
634
Marc Kupietz271e2a42016-03-22 11:37:43 +0100635 for(a=1; a < para_threads; a++) {
636 for(b=0; b < para_nbs[a]->length && para_nbs[a]->index[b] >= 0; b++) {
Marc Kupietza5f60042017-05-04 10:38:12 +0200637 for(c=0; c < N * para_threads; c++) {
Marc Kupietz271e2a42016-03-22 11:37:43 +0100638 if(para_nbs[a]->dist[b] > bestd[c]) {
Marc Kupietz000ad862016-02-26 14:59:12 +0100639 for(d=N-1; d>c; d--) {
640 bestd[d] = bestd[d-1];
641 besti[d] = besti[d-1];
642 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100643 besti[c] = para_nbs[a]->index[b];
644 bestd[c] = para_nbs[a]->dist[b];
Marc Kupietz000ad862016-02-26 14:59:12 +0100645 break;
646 }
647 }
648 }
649 }
650
Marc Kupietz271e2a42016-03-22 11:37:43 +0100651 AV* array = newAV();
Marc Kupietza5f60042017-05-04 10:38:12 +0200652 int i;
653 int l1_words=0, l2_words=0;
654 for (a = 0, i = 0; i < N && a < 600; a++) {
655 long long c = besti[a];
656 if(merge_words > 0) {
657 if(c >= merge_words) {
658 if(l1_words > N / 2)
659 continue;
660 else
661 l1_words++;
662 } else {
663 if(l2_words > N / 2)
664 continue;
665 else
666 l2_words++;
667 }
668 }
669 fflush(stdout);
670 printf("%s l1:%d l2:%d i:%d a:%ld\n", &vocab[c * max_w], l1_words, l2_words, i, a);
671
Marc Kupietz271e2a42016-03-22 11:37:43 +0100672 HV* hash = newHV();
Marc Kupietza5f60042017-05-04 10:38:12 +0200673 SV* word = newSVpvf(&vocab[c * max_w], 0);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100674 if(latin_enc == 0) SvUTF8_on(word);
Marc Kupietza5f60042017-05-04 10:38:12 +0200675 fflush(stdout);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100676 hv_store(hash, "word", strlen("word"), word , 0);
677 hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
678 hv_store(hash, "rank", strlen("rank"), newSVuv(besti[a]), 0);
679 AV *vector = newAV();
680 for (b = 0; b < size; b++) {
681 av_push(vector, newSVnv(M[b + besti[a] * size]));
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100682 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100683 hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV*)vector), 0);
684 av_push(array, newRV_noinc((SV*)hash));
Marc Kupietza5f60042017-05-04 10:38:12 +0200685 i++;
Marc Kupietz271e2a42016-03-22 11:37:43 +0100686 }
687 hv_store(result, "paradigmatic", strlen("paradigmatic"), newRV_noinc((SV*)array), 0);
688
Marc Kupietz50485ba2016-03-23 09:13:14 +0100689 for(b=0; b < MAX_NEIGHBOURS; b++) {
690 besti[b] = -1L;
691 bestd[b] = 0;
692 bestn[b] = 0;
693 bestp[b] = 0;
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100694 bests[b] = 0;
Marc Kupietz50485ba2016-03-23 09:13:14 +0100695 }
696
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200697 if (M2) {
698 printf("Waiting for syn threads to join\n");
699 fflush(stdout);
700 for (a = 0; a < syn_threads; a++) pthread_join(pt[a+para_threads], &syn_nbs[a]);
701 printf("syn threads joint\n");
702 fflush(stdout);
Marc Kupietz50485ba2016-03-23 09:13:14 +0100703
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200704 for(b=0; b < syn_nbs[0]->length; b++) {
705 besti[b] = syn_nbs[0]->index[b];
706 bestd[b] = syn_nbs[0]->dist[b];
707 bestn[b] = syn_nbs[0]->norm[b];
708 bestp[b] = syn_nbs[0]->pos[b];
709 bests[b] = target_sums[syn_nbs[0]->index[b]];
710 }
711
712 if(sort_by != 1) { // sort by responsiveness
713 for(a=1; a < syn_threads; a++) {
714 for(b=0; b < syn_nbs[a]->length; b++) {
715 for(c=0; c < MAX_NEIGHBOURS; c++) {
716 if(syn_nbs[a]->dist[b] > bestd[c]) {
717 for(d=MAX_NEIGHBOURS-1; d>c; d--) {
718 bestd[d] = bestd[d-1];
719 besti[d] = besti[d-1];
720 bestn[d] = bestn[d-1];
721 bestp[d] = bestp[d-1];
722 }
723 besti[c] = syn_nbs[a]->index[b];
724 bestd[c] = syn_nbs[a]->dist[b];
725 bestn[c] = syn_nbs[a]->norm[b];
726 bestp[c] = syn_nbs[a]->pos[b];
727 break;
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100728 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200729 }
730 }
731 }
732 } else { // sort by mean p
733 for(a=1; a < syn_threads; a++) {
734 for(b=0; b < syn_nbs[a]->length; b++) {
735 for(c=0; c < MAX_NEIGHBOURS; c++) {
736 if(target_sums[syn_nbs[a]->index[b]] > bests[c]) {
737 for(d=MAX_NEIGHBOURS-1; d>c; d--) {
738 bestd[d] = bestd[d-1];
739 besti[d] = besti[d-1];
740 bestn[d] = bestn[d-1];
741 bestp[d] = bestp[d-1];
742 bests[d] = bests[d-1];
743 }
744 besti[c] = syn_nbs[a]->index[b];
745 bestd[c] = syn_nbs[a]->dist[b];
746 bestn[c] = syn_nbs[a]->norm[b];
747 bestp[c] = syn_nbs[a]->pos[b];
748 bests[c] = target_sums[syn_nbs[a]->index[b]];
749 break;
750 }
Marc Kupietz271e2a42016-03-22 11:37:43 +0100751 }
Marc Kupietz6d9a6782016-03-23 17:25:25 +0100752 }
753 }
754 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200755 array = newAV();
756 for (a = 0; a < MAX_NEIGHBOURS && besti[a] >= 0; a++) {
757 HV* hash = newHV();
758 SV* word = newSVpvf(&vocab[besti[a] * max_w], 0);
759 if(latin_enc == 0) SvUTF8_on(word);
760 hv_store(hash, "word", strlen("word"), word , 0);
761 hv_store(hash, "dist", strlen("dist"), newSVnv(bestd[a]), 0);
762 hv_store(hash, "norm", strlen("norm"), newSVnv(bestn[a]), 0);
763 hv_store(hash, "sum", strlen("sum"), newSVnv(target_sums[besti[a]]), 0);
764 hv_store(hash, "pos", strlen("pos"), newSVnv(bestp[a]), 0);
765 av_push(array, newRV_noinc((SV*)hash));
Marc Kupietz271e2a42016-03-22 11:37:43 +0100766 }
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200767 hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV*)array), 0);
Marc Kupietz271e2a42016-03-22 11:37:43 +0100768 }
Marc Kupietz000ad862016-02-26 14:59:12 +0100769end:
Marc Kupietza5f60042017-05-04 10:38:12 +0200770 words = old_words;
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100771 return newRV_noinc((SV*)result);
Marc Kupietz000ad862016-02-26 14:59:12 +0100772}
Marc Kupietz7bc85fd2016-02-24 11:42:41 +0100773
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200774int dump_vecs(char *fname) {
775 long i, j;
776 FILE *f;
777 /* if(words>200000) */
778 /* words=200000; */
Marc Kupietz6b2975c2016-03-18 21:59:33 +0100779
Marc Kupietz43ee87e2016-04-25 10:50:08 +0200780 if((f=fopen(fname, "w")) == NULL) {
781 fprintf(stderr, "cannot open %s for writing\n", fname);
782 return(-1);
783 }
784 fprintf(f, "%lld %lld\n", words, size);
785 for (i=0; i < words; i++) {
786 fprintf(f, "%s ", &vocab[i * max_w]);
787 for(j=0; j < size - 1; j++)
788 fprintf(f, "%f ", M[i*size + j]);
789 fprintf(f, "%f\n", M[i*size + j]);
790 }
791 fclose(f);
792 return(0);
793}
Marc Kupietzdc22b982015-10-09 09:19:34 +0200794