blob: 5b23b03e22955af40b80c5dec6248516a7d7f6cc [file] [log] [blame]
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001#include <collocatordb.h>
Marc Kupietz969cab92019-08-05 11:13:42 +02002#include <math.h>
3#include <pthread.h>
4#include <stdio.h>
Marc Kupietzc0d41872021-02-25 16:33:22 +01005#include <stdlib.h>
Marc Kupietz969cab92019-08-05 11:13:42 +02006#include <string.h>
7#include <sys/mman.h>
Marc Kupietzf11d20c2019-08-02 15:42:04 +02008
/* Compile-time limits and tuning constants. */
#define max_size       2000  /* length of fixed-size scratch buffers (query token, summed vector) */
#define max_w          50    /* bytes reserved per vocabulary entry in `vocab` */
#define MAX_NEIGHBOURS 1000  /* capacity of the arrays inside `wordlist` */
#define MAX_WORDS      -1    /* if > 0, init_net() caps the vocabulary at this many words */
#define MAX_THREADS    100   /* capacity of per-thread bookkeeping arrays */
#define MAX_CC         50    /* NOTE(review): not referenced in the visible part of the file */
#define EXP_TABLE_SIZE 1000  /* number of entries precomputed in `expTable` */
#define MAX_EXP        6     /* activations outside [-MAX_EXP, MAX_EXP] are skipped */
#define MIN_RESP       0.50  /* NOTE(review): not referenced in the visible part of the file */
18
/* Thread entry point; its definition is not in the visible part of this file. */
void *connection_handler(void *);
21
/*
 * One scored candidate word, used as an element of the "best N" arrays
 * maintained by the search routines in this file.
 */
typedef struct {
  long long wordi;        /* index of the word in `vocab` / `M` (-1 marks an empty slot) */
  long position;          /* getCollocators() stores `window - a` here (window position) */
  float activation;       /* score the best-list is ordered by */
  float average;          /* NOTE(review): not written in the visible code */
  float cprobability;     /* column wise probability: activation / sum over the window position */
  float cprobability_sum; /* NOTE(review): not written in the visible code */
  float probability;      /* initialised to 1 by getCollocators() */
  float activation_sum;   /* NOTE(review): not written in the visible code */
  float max_activation;   /* NOTE(review): not written in the visible code */
  float heat[16];         /* NOTE(review): not written in the visible code */
} collocator;
34
35typedef struct {
Marc Kupietz969cab92019-08-05 11:13:42 +020036 collocator *best;
37 int length;
Marc Kupietzf11d20c2019-08-02 15:42:04 +020038} knn;
Marc Kupietz969cab92019-08-05 11:13:42 +020039
Marc Kupietzf11d20c2019-08-02 15:42:04 +020040typedef struct {
41 long long wordi[MAX_NEIGHBOURS];
42 char sep[MAX_NEIGHBOURS];
43 int length;
44} wordlist;
45
46typedef struct {
47 long cutoff;
48 wordlist *wl;
Marc Kupietz969cab92019-08-05 11:13:42 +020049 char *token;
50 int N;
51 long from;
52 unsigned long upto;
Marc Kupietzf11d20c2019-08-02 15:42:04 +020053 collocator *best;
54 float *target_sums;
55 float *window_sums;
56 float threshold;
57} knnpars;
58
/* One (vocabulary index, value) pair inside a precomputed profile. */
typedef struct {
  uint32_t index; /* vocabulary index of the neighbour */
  float value;    /* value emitted as "v" by getSimilarProfiles() */
} sparse_t;
63
64typedef struct {
65 uint32_t len;
66 sparse_t nbr[100];
67} profile_t;
68
Marc Kupietz969cab92019-08-05 11:13:42 +020069float *M, *M2 = 0L, *syn1neg_window, *expTable;
Marc Kupietzf11d20c2019-08-02 15:42:04 +020070float *window_sums;
71char *vocab;
72char *garbage = NULL;
73COLLOCATORDB *cdb = NULL;
74profile_t *sprofiles = NULL;
75size_t sprofiles_qty = 0;
76
77long long words, size, merged_end;
78long long merge_words = 0;
Marc Kupietz969cab92019-08-05 11:13:42 +020079int num_threads = 20;
80int latin_enc = 0;
Marc Kupietzf11d20c2019-08-02 15:42:04 +020081int window;
82
83/* load collocation profiles if file exists */
84int load_sprofiles(char *vecsname) {
85 char *basename = strdup(vecsname);
86 char *pos = strstr(basename, ".vecs");
Marc Kupietz969cab92019-08-05 11:13:42 +020087 if (pos)
88 *pos = 0;
89
Marc Kupietzf11d20c2019-08-02 15:42:04 +020090 char binsprofiles_fname[256];
91 strcpy(binsprofiles_fname, basename);
Marc Kupietz969cab92019-08-05 11:13:42 +020092 strcat(binsprofiles_fname, ".sprofiles.bin");
Marc Kupietzf11d20c2019-08-02 15:42:04 +020093 FILE *fp = fopen(binsprofiles_fname, "rb");
94 if (fp == NULL) {
95 printf("Collocation profiles %s not found. No problem.\n", binsprofiles_fname);
96 return 0;
97 }
98 fseek(fp, 0L, SEEK_END);
99 size_t sz = ftell(fp);
100 fclose(fp);
101
102 int fd = open(binsprofiles_fname, O_RDONLY);
Marc Kupietz969cab92019-08-05 11:13:42 +0200103 sprofiles = mmap(0, sz, PROT_READ, MAP_SHARED, fd, 0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200104 if (sprofiles == MAP_FAILED) {
105 close(fd);
106 fprintf(stderr, "Cannot mmap %s\n", binsprofiles_fname);
107 sprofiles = NULL;
108 return 0;
Marc Kupietz969cab92019-08-05 11:13:42 +0200109 } else {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200110 sprofiles_qty = sz / sizeof(profile_t);
111 fprintf(stderr, "Successfully mmaped %s containing similar profiles for %ld word forms.\n", binsprofiles_fname, sprofiles_qty);
112 }
113 return 1;
114}
115
/*
 * Return a newly allocated copy of `myStr` cut off at its last '.'.
 * A string without '.' is copied unchanged. Returns NULL when `myStr`
 * is NULL or the allocation fails. The caller owns the result.
 */
char *removeExtension(char* myStr) {
  if (myStr == NULL)
    return NULL;

  size_t bytes = strlen(myStr) + 1;
  char *copy = malloc(bytes);
  if (copy == NULL)
    return NULL;
  memcpy(copy, myStr, bytes);

  char *dot = strrchr(copy, '.');
  if (dot != NULL)
    *dot = '\0';
  return copy;
}
127
Marc Kupietz0efe49b2020-04-06 18:30:22 +0200128int init_net(char *file_name, char *net_name, int latin, int do_open_cdb) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200129 FILE *f, *binvecs, *binwords;
Marc Kupietz969cab92019-08-05 11:13:42 +0200130 int binwords_fd, binvecs_fd, net_fd, i;
Marc Kupietz59865a92021-03-11 17:16:51 +0100131 long long a, b;
Marc Kupietz969cab92019-08-05 11:13:42 +0200132 float len;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200133 double val;
134
Marc Kupietzc0d41872021-02-25 16:33:22 +0100135 char binvecs_fname[1024], binwords_fname[1024];
136
137 if (strstr(file_name, ".txt")) {
138 strcpy(binwords_fname, removeExtension(file_name));
139 } else {
140 strcpy(binwords_fname, file_name);
141 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200142 strcat(binwords_fname, ".words");
143 strcpy(binvecs_fname, file_name);
144 strcat(binvecs_fname, ".vecs");
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200145
146 latin_enc = latin;
147 f = fopen(file_name, "rb");
148 if (f == NULL) {
149 printf("Input file %s not found\n", file_name);
150 return -1;
151 }
152 fscanf(f, "%lld", &words);
Marc Kupietz969cab92019-08-05 11:13:42 +0200153 if (MAX_WORDS > 0 && words > MAX_WORDS) words = MAX_WORDS;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200154 fscanf(f, "%lld", &size);
Marc Kupietz969cab92019-08-05 11:13:42 +0200155 if ((binvecs_fd = open(binvecs_fname, O_RDONLY)) < 0 || (binwords_fd = open(binwords_fname, O_RDONLY)) < 0) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200156 printf("Converting %s to memory mappable structures\n", file_name);
Marc Kupietz969cab92019-08-05 11:13:42 +0200157 vocab = (char *)malloc((long long)words * max_w * sizeof(char));
158 M = (float *)malloc((long long)words * (long long)size * sizeof(float));
159 if (M == NULL) {
160 printf("Cannot allocate memory: %lld MB %lld %lld\n", (long long)words * size * sizeof(float) / 1048576, words, size);
161 return -1;
162 }
163 if (strstr(file_name, ".txt")) {
Marc Kupietzc0d41872021-02-25 16:33:22 +0100164 printf("%lld words in ascii vector file with vector size %lld\n", words, size);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200165 for (b = 0; b < words; b++) {
166 a = 0;
167 while (1) {
168 vocab[b * max_w + a] = fgetc(f);
169 if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
170 if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
171 }
172 vocab[b * max_w + a] = 0;
173 len = 0;
174 for (a = 0; a < size; a++) {
175 fscanf(f, "%lf", &val);
176 M[a + b * size] = val;
177 len += val * val;
Marc Kupietz969cab92019-08-05 11:13:42 +0200178 }
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200179 len = sqrt(len);
180 for (a = 0; a < size; a++) M[a + b * size] /= len;
181 }
182 } else {
183 for (b = 0; b < words; b++) {
184 a = 0;
185 while (1) {
186 vocab[b * max_w + a] = fgetc(f);
187 if (feof(f) || (vocab[b * max_w + a] == ' ')) break;
188 if ((a < max_w) && (vocab[b * max_w + a] != '\n')) a++;
189 }
190 vocab[b * max_w + a] = 0;
191 fread(&M[b * size], sizeof(float), size, f);
192 len = 0;
193 for (a = 0; a < size; a++) len += M[a + b * size] * M[a + b * size];
194 len = sqrt(len);
195 for (a = 0; a < size; a++) M[a + b * size] /= len;
196 }
197 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200198 if ((binvecs = fopen(binvecs_fname, "wb")) != NULL && (binwords = fopen(binwords_fname, "wb")) != NULL) {
199 fwrite(M, sizeof(float), (long long)words * (long long)size, binvecs);
200 fclose(binvecs);
201 fwrite(vocab, sizeof(char), (long long)words * max_w, binwords);
202 fclose(binwords);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200203 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200204 }
205 if ((binvecs_fd = open(binvecs_fname, O_RDONLY)) >= 0 && (binwords_fd = open(binwords_fname, O_RDONLY)) >= 0) {
206 M = mmap(0, sizeof(float) * (long long)words * (long long)size, PROT_READ, MAP_SHARED, binvecs_fd, 0);
207 vocab = mmap(0, sizeof(char) * (long long)words * max_w, PROT_READ, MAP_SHARED, binwords_fd, 0);
208 if (M == MAP_FAILED || vocab == MAP_FAILED) {
209 close(binvecs_fd);
210 close(binwords_fd);
211 fprintf(stderr, "Cannot mmap %s or %s\n", binwords_fname, binvecs_fname);
212 exit(-1);
213 }
214 } else {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200215 fprintf(stderr, "Cannot open %s or %s\n", binwords_fname, binvecs_fname);
216 exit(-1);
Marc Kupietz969cab92019-08-05 11:13:42 +0200217 }
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200218 fclose(f);
219
Marc Kupietz969cab92019-08-05 11:13:42 +0200220 if (net_name && strlen(net_name) > 0) {
221 if ((net_fd = open(net_name, O_RDONLY)) >= 0) {
222 window = (lseek(net_fd, 0, SEEK_END) - sizeof(float) * words * size) / words / size / sizeof(float) / 2;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200223 // lseek(net_fd, sizeof(float) * words * size, SEEK_SET);
224 // munmap(M, sizeof(float) * words * size);
225 M2 = mmap(0, sizeof(float) * words * size + sizeof(float) * 2 * window * size * words, PROT_READ, MAP_SHARED, net_fd, 0);
226 if (M2 == MAP_FAILED) {
227 close(net_fd);
228 fprintf(stderr, "Cannot mmap %s\n", net_name);
229 exit(-1);
230 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200231 syn1neg_window = M2 + words * size;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200232 } else {
233 fprintf(stderr, "Cannot open %s\n", net_name);
234 exit(-1);
235 }
236 fprintf(stderr, "Successfully memmaped %s. Determined window size: %d\n", net_name, window);
237
Marc Kupietz0efe49b2020-04-06 18:30:22 +0200238 if (do_open_cdb) {
239 char collocatordb_name[2048];
240 strcpy(collocatordb_name, net_name);
241 char *ext = rindex(collocatordb_name, '.');
242 if (ext) {
243 strcpy(ext, ".rocksdb");
244 if (access(collocatordb_name, R_OK) == 0) {
245 *ext = 0;
246 fprintf(stderr, "Opening collocator DB %s\n", collocatordb_name);
247 cdb = open_collocatordb(collocatordb_name);
Marc Kupietzc0d41872021-02-25 16:33:22 +0100248 } else {
249 fprintf(stderr, "Cannot open collocator DB %s\n", collocatordb_name);
Marc Kupietz0efe49b2020-04-06 18:30:22 +0200250 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200251 }
252 }
253 }
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200254
Marc Kupietz969cab92019-08-05 11:13:42 +0200255 expTable = (float *)malloc((EXP_TABLE_SIZE + 1) * sizeof(float));
256 for (i = 0; i < EXP_TABLE_SIZE; i++) {
257 expTable[i] = exp((i / (float)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
258 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
259 }
260 window_sums = malloc(sizeof(float) * (window + 1) * 2);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200261
262 return 0;
263}
264
Marc Kupietz969cab92019-08-05 11:13:42 +0200265long mergeVectors(char *file_name) {
Marc Kupietz59865a92021-03-11 17:16:51 +0100266 FILE *f;
267 int binwords_fd, binvecs_fd;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200268 float *merge_vecs;
269 char *merge_vocab;
Marc Kupietz969cab92019-08-05 11:13:42 +0200270 /* long long merge_words, merge_size; */
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200271 long long merge_size;
272
Marc Kupietz969cab92019-08-05 11:13:42 +0200273 char binvecs_fname[256], binwords_fname[256];
Marc Kupietzc0d41872021-02-25 16:33:22 +0100274
275
Marc Kupietz969cab92019-08-05 11:13:42 +0200276 strcpy(binwords_fname, file_name);
277 strcat(binwords_fname, ".words");
278 strcpy(binvecs_fname, file_name);
279 strcat(binvecs_fname, ".vecs");
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200280
281 f = fopen(file_name, "rb");
282 if (f == NULL) {
283 printf("Input file %s not found\n", file_name);
Marc Kupietz59865a92021-03-11 17:16:51 +0100284 exit(-1);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200285 }
286 fscanf(f, "%lld", &merge_words);
287 fscanf(f, "%lld", &merge_size);
Marc Kupietz969cab92019-08-05 11:13:42 +0200288 if (merge_size != size) {
289 fprintf(stderr, "vectors must have the same length\n");
290 exit(-1);
291 }
292 if ((binvecs_fd = open(binvecs_fname, O_RDONLY)) >= 0 && (binwords_fd = open(binwords_fname, O_RDONLY)) >= 0) {
293 merge_vecs = malloc(sizeof(float) * (words + merge_words) * size);
294 merge_vocab = malloc(sizeof(char) * (words + merge_words) * max_w);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200295 if (merge_vecs == NULL || merge_vocab == NULL) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200296 close(binvecs_fd);
297 close(binwords_fd);
298 fprintf(stderr, "Cannot reserve memory for %s or %s\n", binwords_fname, binvecs_fname);
299 exit(-1);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200300 }
301 read(binvecs_fd, merge_vecs, merge_words * size * sizeof(float));
302 read(binwords_fd, merge_vocab, merge_words * max_w);
Marc Kupietz969cab92019-08-05 11:13:42 +0200303 } else {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200304 fprintf(stderr, "Cannot open %s or %s\n", binwords_fname, binvecs_fname);
305 exit(-1);
Marc Kupietz969cab92019-08-05 11:13:42 +0200306 }
307 printf("Successfully reallocated memory\nMerging...\n");
308 fflush(stdout);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200309 memcpy(merge_vecs + merge_words * size, M, words * size * sizeof(float));
310 memcpy(merge_vocab + merge_words * max_w, vocab, words * max_w);
311 munmap(M, words * size * sizeof(float));
312 munmap(vocab, words * max_w);
313 M = merge_vecs;
314 vocab = merge_vocab;
315 merged_end = merge_words;
316 words += merge_words;
317 fclose(f);
Marc Kupietz969cab92019-08-05 11:13:42 +0200318 printf("merged_end: %lld, words: %lld\n", merged_end, words);
319 //printBiggestMergedDifferences();
320 return ((long)merged_end);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200321}
322
323void filter_garbage() {
324 long i;
325 unsigned char *w, previous, c;
326 garbage = malloc(words);
327 memset(garbage, 0, words);
328 for (i = 0; i < words; i++) {
Marc Kupietz59865a92021-03-11 17:16:51 +0100329 w = (unsigned char *) vocab + i * max_w;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200330 previous = 0;
Marc Kupietz59865a92021-03-11 17:16:51 +0100331 if (strncmp("quot", (const char *)w, 4) == 0) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200332 garbage[i] = 1;
333 // printf("Gargabe: %s\n", vocab + i * max_w);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200334 } else {
Marc Kupietz969cab92019-08-05 11:13:42 +0200335 while ((c = *w++) && !garbage[i]) {
336 if (((c <= 90 && c >= 65) && (previous >= 97 && previous <= 122)) ||
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200337 (previous == '-' && (c & 32)) ||
Marc Kupietz969cab92019-08-05 11:13:42 +0200338 (previous == 0xc2 && (c == 0xa4 || c == 0xb6)) ||
339 (previous == 'q' && c == 'u' && *(w) == 'o' && *(w + 1) == 't') || /* quot */
340 c == '<') {
341 garbage[i] = 1;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200342 continue;
343 }
344 previous = c;
345 }
346 }
347 }
348 return;
349}
350
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200351knn *simpleGetCollocators(int word, int number, long cutoff, int *result) {
352 knnpars *pars = calloc(sizeof(knnpars), 1);
Marc Kupietz59865a92021-03-11 17:16:51 +0100353 float *target_sums = NULL;
354 float *my_window_sums = malloc(sizeof(float) * (window + 1) * 2);
Marc Kupietz969cab92019-08-05 11:13:42 +0200355 pars->cutoff = (cutoff ? cutoff : 300000);
Marc Kupietz59865a92021-03-11 17:16:51 +0100356 long a;
Marc Kupietz969cab92019-08-05 11:13:42 +0200357 for (a = 0; a < cutoff; a++)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200358 target_sums[a] = 0;
359 pars->target_sums = target_sums;
Marc Kupietz59865a92021-03-11 17:16:51 +0100360 pars->window_sums = my_window_sums;
Marc Kupietz969cab92019-08-05 11:13:42 +0200361 pars->N = (number ? number : 20);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200362 pars->from = 0;
Marc Kupietz969cab92019-08-05 11:13:42 +0200363 pars->upto = window * 2 - 1;
364 knn *syn_nbs = NULL; // = (knn*) getCollocators(pars);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200365 free(pars);
Marc Kupietz59865a92021-03-11 17:16:51 +0100366 free(my_window_sums);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200367 free(target_sums);
368 return syn_nbs;
369}
370
371void *getCollocators(void *args) {
372 knnpars *pars = args;
Marc Kupietz969cab92019-08-05 11:13:42 +0200373 int N = pars->N;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200374
375 int cc = pars->wl->wordi[0];
Marc Kupietz969cab92019-08-05 11:13:42 +0200376 knn *nbs = NULL;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200377 long window_layer_size = size * window * 2;
Marc Kupietz59865a92021-03-11 17:16:51 +0100378 long a, b, c, d, window_offset, target, max_target = 0, maxmax_target;
Marc Kupietz969cab92019-08-05 11:13:42 +0200379 float f, max_f, maxmax_f;
380 float *target_sums = NULL, worstbest, wpos_sum;
381 collocator *best;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200382
Marc Kupietz969cab92019-08-05 11:13:42 +0200383 if (M2 == NULL || cc == -1)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200384 return NULL;
385
Marc Kupietz969cab92019-08-05 11:13:42 +0200386 a = posix_memalign((void **)&target_sums, 128, pars->cutoff * sizeof(float));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200387 memset(target_sums, 0, pars->cutoff * sizeof(float));
Marc Kupietz969cab92019-08-05 11:13:42 +0200388 best = malloc((N > 200 ? N : 200) * sizeof(collocator));
389 memset(best, 0, (N > 200 ? N : 200) * sizeof(collocator));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200390 worstbest = pars->threshold;
391
392 for (b = 0; b < pars->cutoff; b++)
Marc Kupietz969cab92019-08-05 11:13:42 +0200393 target_sums[b] = 0;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200394 for (b = 0; b < N; b++) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200395 best[b].wordi = -1;
396 best[b].probability = 1;
397 best[b].activation = worstbest;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200398 }
399
400 d = cc;
401 maxmax_f = -1;
402 maxmax_target = 0;
403
404 for (a = pars->from; a < pars->upto; a++) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200405 if (a >= window)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200406 a++;
407 wpos_sum = 0;
408 printf("window pos: %ld\n", a);
409 if (a != window) {
410 max_f = -1;
411 window_offset = a * size;
412 if (a > window)
413 window_offset -= size;
Marc Kupietz969cab92019-08-05 11:13:42 +0200414 for (target = 0; target < pars->cutoff; target++) {
415 if (garbage && garbage[target]) continue;
416 if (target == d)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200417 continue;
418 f = 0;
419 for (c = 0; c < size; c++)
Marc Kupietz969cab92019-08-05 11:13:42 +0200420 f += M2[d * size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200421 if (f < -MAX_EXP)
422 continue;
423 else if (f > MAX_EXP)
424 continue;
425 else
Marc Kupietz969cab92019-08-05 11:13:42 +0200426 f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200427 wpos_sum += f;
428
429 target_sums[target] += f;
Marc Kupietz969cab92019-08-05 11:13:42 +0200430 if (f > worstbest) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200431 for (b = 0; b < N; b++) {
432 if (f > best[b].activation) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200433 memmove(best + b + 1, best + b, (N - b - 1) * sizeof(collocator));
434 best[b].activation = f;
435 best[b].wordi = target;
436 best[b].position = window - a;
437 break;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200438 }
439 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200440 if (b == N - 1)
441 worstbest = best[N - 1].activation;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200442 }
443 }
Marc Kupietz59865a92021-03-11 17:16:51 +0100444 printf("%ld %.2f\n", max_target, max_f);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200445 printf("%s (%.2f) ", &vocab[max_target * max_w], max_f);
Marc Kupietz969cab92019-08-05 11:13:42 +0200446 if (max_f > maxmax_f) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200447 maxmax_f = max_f;
448 maxmax_target = max_target;
449 }
450 for (b = 0; b < N; b++)
Marc Kupietz969cab92019-08-05 11:13:42 +0200451 if (best[b].position == window - a)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200452 best[b].cprobability = best[b].activation / wpos_sum;
453 } else {
Marc Kupietz969cab92019-08-05 11:13:42 +0200454 printf("\x1b[1m%s\x1b[0m ", &vocab[d * max_w]);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200455 }
456 pars->window_sums[a] = wpos_sum;
457 }
458 for (b = 0; b < pars->cutoff; b++)
Marc Kupietz969cab92019-08-05 11:13:42 +0200459 pars->target_sums[b] += target_sums[b]; //(target_sums[b] / wpos_sum ) / (window * 2);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200460
461 free(target_sums);
Marc Kupietz969cab92019-08-05 11:13:42 +0200462 for (b = 0; b < N && best[b].wordi >= 0; b++)
463 ;
Marc Kupietz59865a92021-03-11 17:16:51 +0100464 // THIS LOOP IS NEEDED (b...)
Marc Kupietz969cab92019-08-05 11:13:42 +0200465 // printf("%d: best syn: %s %.2f %.5f\n", b, &vocab[best[b].wordi*max_w], best[b].activation, best[b].probability);
466 // printf("\n");
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200467 nbs = malloc(sizeof(knn));
Marc Kupietz969cab92019-08-05 11:13:42 +0200468 nbs->best = best;
469 nbs->length = b - 1;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200470 pthread_exit(nbs);
471}
472
Marc Kupietz0efe49b2020-04-06 18:30:22 +0200473float getOutputWeight(int hidden, long target, int window_position) {
474 const long window_layer_size = size * window * 2;
475 int a;
476
477 if (window_position == 0 || window_position > window || window_position < -window) {
478 fprintf(stderr, "window_position: %d - assert: -%d <= window_position <= %d && window_position != 0 failed.\n", window_position, window, window);
479 exit(-1);
480 }
481
482 if (hidden >= size) {
Marc Kupietz59865a92021-03-11 17:16:51 +0100483 fprintf(stderr, "hidden: %d - assert: hidden < %lld failed.\n", hidden, size);
Marc Kupietz0efe49b2020-04-06 18:30:22 +0200484 exit(-1);
485 }
486
487 if (target >= words) {
Marc Kupietz59865a92021-03-11 17:16:51 +0100488 fprintf(stderr, "target: %ld - assert: target < %lld failed.\n", target, words);
Marc Kupietz0efe49b2020-04-06 18:30:22 +0200489 exit(-1);
490 }
491
492 a = window_position + window;
493 if (a > window) {
494 --a;
495 }
496 long window_offset = a * size;
497 return syn1neg_window[target * window_layer_size + window_offset + hidden];
498}
499
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200500AV *getVecs(AV *array) {
501 int i, b;
502 AV *result = newAV();
Marc Kupietz969cab92019-08-05 11:13:42 +0200503 for (i = 0; i <= av_len(array); i++) {
504 SV **elem = av_fetch(array, i, 0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200505 if (elem != NULL) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200506 long j = (long)SvNV(*elem);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200507 AV *vector = newAV();
508 for (b = 0; b < size; b++) {
509 av_push(vector, newSVnv(M[b + j * size]));
510 }
Marc Kupietzbdd779a2024-08-05 10:02:29 +0200511 av_push(result, newRV_noinc((SV *)vector));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200512 }
513 }
514 return result;
515}
516
517char *getSimilarProfiles(long node) {
518 int i;
519 char buffer[120000];
520 char pair_buffer[2048];
Marc Kupietz969cab92019-08-05 11:13:42 +0200521 buffer[0] = '[';
522 buffer[1] = 0;
523 if (node >= sprofiles_qty) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200524 printf("Not available in precomputed profile\n");
Marc Kupietz969cab92019-08-05 11:13:42 +0200525 return (strdup("[{\"w\":\"not available\", \"v\":0}]\n"));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200526 }
527
528 printf("******* %s ******\n", &vocab[max_w * node]);
Marc Kupietz969cab92019-08-05 11:13:42 +0200529
530 for (i = 0; i < 100 && i < sprofiles[node].len; i++) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200531 sprintf(pair_buffer, "{\"w\":\"%s\", \"v\":%f},", &vocab[max_w * (sprofiles[node].nbr[i].index)], sprofiles[node].nbr[i].value);
532 strcat(buffer, pair_buffer);
533 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200534 buffer[strlen(buffer) - 1] = ']';
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200535 strcat(buffer, "\n");
Marc Kupietz59865a92021-03-11 17:16:51 +0100536 printf("%s", buffer);
Marc Kupietz969cab92019-08-05 11:13:42 +0200537 return (strdup(buffer));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200538}
539
Marc Kupietzf6080012021-03-12 09:14:42 +0100540char *getCollocationScores(long node, long collocate) {
541 char *res = (cdb ? strdup(get_collocation_scores_as_json(cdb, node, collocate)) : "[]");
542 return res;
543}
544
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200545char *getClassicCollocators(long node) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200546 char *res = (cdb ? strdup(get_collocators_as_json(cdb, node)) : "[]");
547 return res;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200548}
549
550wordlist *getTargetWords(char *st1, int search_backw) {
551 wordlist *wl = malloc(sizeof(wordlist));
Marc Kupietz59865a92021-03-11 17:16:51 +0100552 char st[100][max_size];
Marc Kupietz969cab92019-08-05 11:13:42 +0200553 long a, b = 0, c = 0, cn = 0;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200554
555 while (1) {
556 st[cn][b] = st1[c];
557 b++;
558 c++;
559 st[cn][b] = 0;
560 if (st1[c] == 0) break;
Marc Kupietzc0d41872021-02-25 16:33:22 +0100561 if (st1[c] == ' ' /*|| st1[c] == '-'*/) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200562 b = 0;
563 c++;
564 }
565 }
566 cn++;
567 for (a = 0; a < cn; a++) {
568 if (search_backw) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200569 for (b = words - 1; b >= (merge_words ? merge_words : 0) && strcmp(&vocab[b * max_w], st[a]) != 0; b--)
570 ;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200571 } else {
Marc Kupietz969cab92019-08-05 11:13:42 +0200572 for (b = 0; b < (merge_words ? merge_words : words) && strcmp(&vocab[b * max_w], st[a]) != 0; b++)
573 ;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200574 }
575 if (b == words) b = -1;
576 wl->wordi[a] = b;
577 if (b == -1) {
578 fprintf(stderr, "Out of dictionary word!\n");
579 cn--;
580 } else {
Marc Kupietz969cab92019-08-05 11:13:42 +0200581 fprintf(stderr, "Word: \"%s\" Position in vocabulary: %lld\n", &vocab[wl->wordi[a] * max_w], wl->wordi[a]);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200582 }
583 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200584 wl->length = cn;
585 return (wl);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200586}
587
Marc Kupietzcb43e492019-12-03 10:07:53 +0100588long getWordNumber(char *word) {
589 wordlist *wl = getTargetWords(word, 0);
590 if(wl->length > 0)
591 return(wl->wordi[0]);
592 return(0);
593}
594
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200595float get_distance(long b, long c) {
596 long a;
597 float dist = 0;
598 for (a = 0; a < size; a++) dist += M[a + c * size] * M[a + b * size];
599 return dist;
600}
601
Marc Kupietz969cab92019-08-05 11:13:42 +0200602char *getBiggestMergedDifferences() {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200603 static char *result = NULL;
Marc Kupietz59865a92021-03-11 17:16:51 +0100604 float dist;
605 long long a, c;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200606 int N = 1000;
607
Marc Kupietz969cab92019-08-05 11:13:42 +0200608 if (merged_end == 0)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200609 result = "[]";
Marc Kupietz969cab92019-08-05 11:13:42 +0200610
611 if (result != NULL)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200612 return result;
613
614 printf("Looking for biggest distances between main and merged vectors ...\n");
615 collocator *best;
616 best = malloc(N * sizeof(collocator));
617 memset(best, 0, N * sizeof(collocator));
618
Marc Kupietz969cab92019-08-05 11:13:42 +0200619 float worstbest = 1000000;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200620
621 for (a = 0; a < N; a++) best[a].activation = worstbest;
622
623 for (c = 0; c < 500000; c++) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200624 if (garbage && garbage[c]) continue;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200625 dist = 0;
Marc Kupietz969cab92019-08-05 11:13:42 +0200626 for (a = 0; a < size; a++) dist += M[a + c * size] * M[a + (c + merged_end) * size];
627 if (dist < worstbest) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200628 for (a = 0; a < N; a++) {
629 if (dist < best[a].activation) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200630 memmove(best + a + 1, best + a, (N - a - 1) * sizeof(collocator));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200631 best[a].activation = dist;
632 best[a].wordi = c;
633 break;
634 }
635 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200636 worstbest = best[N - 1].activation;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200637 }
638 }
639
Marc Kupietz969cab92019-08-05 11:13:42 +0200640 result = malloc(N * max_w);
Marc Kupietzbdd779a2024-08-05 10:02:29 +0200641 char *p = (char *) result;
Marc Kupietz969cab92019-08-05 11:13:42 +0200642 *p++ = '[';
643 *p = 0;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200644 for (a = 0; a < N; a++) {
Marc Kupietz59865a92021-03-11 17:16:51 +0100645 p += sprintf(p, "{\"rank\":%lld,\"word\":\"%s\",\"dist\":%.3f},", a, &vocab[best[a].wordi * max_w], 1 - best[a].activation);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200646 }
647 *--p = ']';
Marc Kupietz969cab92019-08-05 11:13:42 +0200648 return (result);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200649}
650
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200651float cos_similarity(long b, long c) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200652 float dist = 0;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200653 long a;
Marc Kupietz969cab92019-08-05 11:13:42 +0200654 for (a = 0; a < size; a++) dist += M[b * size + a] * M[c * size + a];
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200655 return dist;
656}
657
658char *cos_similarity_as_json(char *w1, char *w2) {
659 wordlist *a, *b;
660 float res;
661 a = getTargetWords(w1, 0);
662 b = getTargetWords(w2, 0);
Marc Kupietz969cab92019-08-05 11:13:42 +0200663 if (a == NULL || b == NULL || a->length != 1 || b->length != 1)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200664 res = -1;
665 else
666 res = cos_similarity(a->wordi[0], b->wordi[0]);
667 fprintf(stderr, "a: %lld b: %lld res:%f\n", a->wordi[0], b->wordi[0], res);
668 char *json = malloc(16);
669 sprintf(json, "%.5f", res);
670 return json;
671}
672
673void *_get_neighbours(void *arg) {
674 knnpars *pars = arg;
Marc Kupietz969cab92019-08-05 11:13:42 +0200675 int N = pars->N;
676 long from = pars->from;
677 unsigned long upto = pars->upto;
Marc Kupietz59865a92021-03-11 17:16:51 +0100678 char *sep;
Marc Kupietz969cab92019-08-05 11:13:42 +0200679 float dist, len, vec[max_size];
Marc Kupietz59865a92021-03-11 17:16:51 +0100680 long long a, b, c, cn, *bi;
Marc Kupietz969cab92019-08-05 11:13:42 +0200681 knn *nbs = NULL;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200682 wordlist *wl = pars->wl;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200683
Marc Kupietz969cab92019-08-05 11:13:42 +0200684 collocator *best = pars->best;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200685
Marc Kupietz969cab92019-08-05 11:13:42 +0200686 float worstbest = -1;
687
688 for (a = 0; a < N; a++) best[a].activation = 0;
689 a = 0;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200690 bi = wl->wordi;
Marc Kupietz969cab92019-08-05 11:13:42 +0200691 cn = wl->length;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200692 sep = wl->sep;
Marc Kupietz969cab92019-08-05 11:13:42 +0200693 b = bi[0];
Marc Kupietz969cab92019-08-05 11:13:42 +0200694 if (b == -1) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200695 goto end;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200696 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200697 for (a = 0; a < size; a++) vec[a] = 0;
698 for (b = 0; b < cn; b++) {
699 if (bi[b] == -1) continue;
700 if (b > 0 && sep[b - 1] == '-')
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200701 for (a = 0; a < size; a++) vec[a] -= M[a + bi[b] * size];
702 else
703 for (a = 0; a < size; a++) vec[a] += M[a + bi[b] * size];
Marc Kupietz969cab92019-08-05 11:13:42 +0200704 }
705 len = 0;
706 for (a = 0; a < size; a++) len += vec[a] * vec[a];
707 len = sqrt(len);
708 for (a = 0; a < size; a++) vec[a] /= len;
709 for (a = 0; a < N; a++) best[a].activation = -1;
710 for (c = from; c < upto; c++) {
711 if (garbage && garbage[c]) continue;
712 a = 0;
713 // do not skip taget word
714 // for (b = 0; b < cn; b++) if (bi[b] == c) a = 1;
715 // if (a == 1) continue;
716 dist = 0;
717 for (a = 0; a < size; a++) dist += vec[a] * M[a + c * size];
718 if (dist > worstbest) {
719 for (a = 0; a < N; a++) {
720 if (dist > best[a].activation) {
721 memmove(best + a + 1, best + a, (N - a - 1) * sizeof(collocator));
722 best[a].activation = dist;
723 best[a].wordi = c;
724 break;
725 }
726 }
727 worstbest = best[N - 1].activation;
728 }
729 }
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200730
731end:
Marc Kupietz969cab92019-08-05 11:13:42 +0200732 pthread_exit(nbs);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200733}
734
Marc Kupietz969cab92019-08-05 11:13:42 +0200735int cmp_activation(const void *a, const void *b) {
736 float fb = ((collocator *)a)->activation;
737 float fa = ((collocator *)b)->activation;
738 return (fa > fb) - (fa < fb);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200739}
740
Marc Kupietz969cab92019-08-05 11:13:42 +0200741int cmp_probability(const void *a, const void *b) {
742 float fb = ((collocator *)a)->probability;
743 float fa = ((collocator *)b)->probability;
744 return (fa > fb) - (fa < fb);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200745}
746
Marc Kupietz969cab92019-08-05 11:13:42 +0200747char *getPosWiseW2VCollocatorsAsTsv(char *word, long maxPerPos, long cutoff, float threshold) {
748 HV *result = newHV();
Marc Kupietz59865a92021-03-11 17:16:51 +0100749 float *target_sums = NULL;
750 long a, b;
Marc Kupietz969cab92019-08-05 11:13:42 +0200751 knn *para_nbs[MAX_THREADS];
752 knn *syn_nbs[MAX_THREADS];
753 knnpars pars[MAX_THREADS];
754 pthread_t *pt = (pthread_t *)malloc((num_threads + 1) * sizeof(pthread_t));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200755 wordlist *wl;
Marc Kupietz969cab92019-08-05 11:13:42 +0200756 int syn_threads = (M2 ? window * 2 : 0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200757 int search_backw = 0;
758 collocator *best = NULL;
Marc Kupietz969cab92019-08-05 11:13:42 +0200759 posix_memalign((void **)&best, 128, 10 * (maxPerPos >= 200 ? maxPerPos : 200) * sizeof(collocator));
760 memset(best, 0, (maxPerPos >= 200 ? maxPerPos : 200) * sizeof(collocator));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200761
Marc Kupietz969cab92019-08-05 11:13:42 +0200762 if (cutoff < 1 || cutoff > words)
763 cutoff = words;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200764
765 wl = getTargetWords(word, search_backw);
Marc Kupietz969cab92019-08-05 11:13:42 +0200766 if (wl == NULL || wl->length < 1)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200767 return "";
768
Marc Kupietz969cab92019-08-05 11:13:42 +0200769 a = posix_memalign((void **)&target_sums, 128, cutoff * sizeof(float));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200770 memset(target_sums, 0, cutoff * sizeof(float));
771
772 printf("Starting %d threads\n", syn_threads);
773 fflush(stdout);
Marc Kupietz969cab92019-08-05 11:13:42 +0200774 for (a = 0; a < syn_threads; a++) {
775 pars[a].cutoff = cutoff;
776 pars[a].target_sums = target_sums;
777 pars[a].window_sums = window_sums;
778 pars[a].wl = wl;
779 pars[a].N = maxPerPos;
780 pars[a].threshold = threshold;
781 pars[a].from = a;
782 pars[a].upto = a + 1;
783 pthread_create(&pt[a], NULL, getCollocators, (void *)&pars[a]);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200784 }
785 printf("Waiting for syn threads to join\n");
786 fflush(stdout);
Marc Kupietz969cab92019-08-05 11:13:42 +0200787 for (a = 0; a < syn_threads; a++) pthread_join(pt[a], (void *)&syn_nbs[a]);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200788 printf("Syn threads joint\n");
789 fflush(stdout);
Marc Kupietz969cab92019-08-05 11:13:42 +0200790 result = malloc(maxPerPos * 80 * syn_threads);
Marc Kupietzbdd779a2024-08-05 10:02:29 +0200791 char *p = (char *) result;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200792 *p = 0;
Marc Kupietz969cab92019-08-05 11:13:42 +0200793 for (a = syn_threads - 1; a >= 0; a--) {
794 for (b = 0; b < syn_nbs[a]->length; b++) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200795 p += sprintf(p, "%ld\t%s\t%f\n", syn_nbs[a]->best[b].position, &vocab[syn_nbs[a]->best[b].wordi * max_w], syn_nbs[a]->best[b].activation);
796 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200797 }
Marc Kupietzbdd779a2024-08-05 10:02:29 +0200798 return ((char *)result);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200799}
800
801SV *get_neighbours(char *st1, int N, int sort_by, int search_backw, long cutoff, int dedupe, int no_similar_profiles) {
802 HV *result = newHV();
Marc Kupietz59865a92021-03-11 17:16:51 +0100803 float *target_sums = NULL;
Marc Kupietz969cab92019-08-05 11:13:42 +0200804 long a, b, c, d, slice;
805 knn *para_nbs[MAX_THREADS];
806 knn *syn_nbs[MAX_THREADS];
807 knnpars pars[MAX_THREADS];
808 pthread_t *pt = (pthread_t *)malloc((num_threads + 1) * sizeof(pthread_t));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200809 wordlist *wl;
Marc Kupietz969cab92019-08-05 11:13:42 +0200810 int syn_threads = (M2 ? window * 2 : 0);
811 int para_threads = (no_similar_profiles ? 0 : num_threads - syn_threads);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200812
813 collocator *best = NULL;
Marc Kupietz969cab92019-08-05 11:13:42 +0200814 posix_memalign((void **)&best, 128, 10 * (N >= 200 ? N : 200) * sizeof(collocator));
815 memset(best, 0, (N >= 200 ? N : 200) * sizeof(collocator));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200816
Marc Kupietz969cab92019-08-05 11:13:42 +0200817 if (N > MAX_NEIGHBOURS) N = MAX_NEIGHBOURS;
818
819 if (cutoff < 1 || cutoff > words)
820 cutoff = words;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200821
822 wl = getTargetWords(st1, search_backw);
Marc Kupietz969cab92019-08-05 11:13:42 +0200823 if (wl == NULL || wl->length < 1)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200824 goto end;
825
Marc Kupietz969cab92019-08-05 11:13:42 +0200826 slice = cutoff / para_threads;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200827
Marc Kupietz969cab92019-08-05 11:13:42 +0200828 a = posix_memalign((void **)&target_sums, 128, cutoff * sizeof(float));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200829 memset(target_sums, 0, cutoff * sizeof(float));
830
Marc Kupietzc0d41872021-02-25 16:33:22 +0100831 printf("Starting %d threads for paradigmatic search\n", para_threads);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200832 fflush(stdout);
Marc Kupietz969cab92019-08-05 11:13:42 +0200833 for (a = 0; a < para_threads; a++) {
834 pars[a].cutoff = cutoff;
835 pars[a].token = st1;
836 pars[a].wl = wl;
837 pars[a].N = N;
838 pars[a].best = &best[N * a];
839 if (merge_words == 0 || search_backw == 0) {
840 pars[a].from = a * slice;
841 pars[a].upto = ((a + 1) * slice > cutoff ? cutoff : (a + 1) * slice);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200842 } else {
843 pars[a].from = merge_words + a * slice;
Marc Kupietz969cab92019-08-05 11:13:42 +0200844 pars[a].upto = merge_words + ((a + 1) * slice > cutoff ? cutoff : (a + 1) * slice);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200845 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200846 printf("From: %ld, Upto: %ld\n", pars[a].from, pars[a].upto);
847 pthread_create(&pt[a], NULL, _get_neighbours, (void *)&pars[a]);
848 }
849 if (M2) {
850 for (a = 0; a < syn_threads; a++) {
851 pars[a + para_threads].cutoff = cutoff;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200852 pars[a + para_threads].target_sums = target_sums;
853 pars[a + para_threads].window_sums = window_sums;
854 pars[a + para_threads].wl = wl;
855 pars[a + para_threads].N = N;
856 pars[a + para_threads].threshold = MIN_RESP;
857 pars[a + para_threads].from = a;
Marc Kupietz969cab92019-08-05 11:13:42 +0200858 pars[a + para_threads].upto = a + 1;
859 pthread_create(&pt[a + para_threads], NULL, getCollocators, (void *)&pars[a + para_threads]);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200860 }
861 }
862 printf("Waiting for para threads to join\n");
863 fflush(stdout);
Marc Kupietz969cab92019-08-05 11:13:42 +0200864 for (a = 0; a < para_threads; a++) pthread_join(pt[a], (void *)&para_nbs[a]);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200865 printf("Para threads joint\n");
866 fflush(stdout);
867
Marc Kupietz969cab92019-08-05 11:13:42 +0200868 /* if(!syn_nbs[0]) */
869 /* goto end; */
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200870
Marc Kupietz969cab92019-08-05 11:13:42 +0200871 qsort(best, N * para_threads, sizeof(collocator), cmp_activation);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200872
873 long long chosen[MAX_NEIGHBOURS];
Marc Kupietz59865a92021-03-11 17:16:51 +0100874 printf("N: %d\n", N);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200875
Marc Kupietz969cab92019-08-05 11:13:42 +0200876 AV *array = newAV();
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200877 int i, j;
Marc Kupietz969cab92019-08-05 11:13:42 +0200878 int l1_words = 0, l2_words = 0;
879
880 for (a = 0, i = 0; i < N && a < N * para_threads; a++) {
881 int filtered = 0;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200882 long long c = best[a].wordi;
883 if ((merge_words && dedupe && i > 1) || (!merge_words && dedupe && i > 0)) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200884 for (j = 0; j < i && !filtered; j++)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200885 if (strcasestr(&vocab[c * max_w], &vocab[chosen[j] * max_w]) ||
886 strcasestr(&vocab[chosen[j] * max_w], &vocab[c * max_w])) {
Marc Kupietz969cab92019-08-05 11:13:42 +0200887 printf("filtering %s %s\n", &vocab[chosen[j] * max_w], &vocab[c * max_w]);
888 filtered = 1;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200889 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200890 if (filtered)
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200891 continue;
892 }
893
Marc Kupietz969cab92019-08-05 11:13:42 +0200894 if (0 && merge_words > 0) {
895 if (c >= merge_words) {
896 if (l1_words > N / 2)
897 continue;
898 else
899 l1_words++;
900 } else {
901 if (l2_words > N / 2)
902 continue;
903 else
904 l2_words++;
905 }
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200906 }
907
Marc Kupietz969cab92019-08-05 11:13:42 +0200908 // printf("%s l1:%d l2:%d i:%d a:%ld\n", &vocab[c * max_w], l1_words, l2_words, i, a);
909 // fflush(stdout);
910 HV *hash = newHV();
911 SV *word = newSVpvf(&vocab[c * max_w], 0);
912 chosen[i] = c;
913 if (latin_enc == 0) SvUTF8_on(word);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200914 fflush(stdout);
Marc Kupietz969cab92019-08-05 11:13:42 +0200915 hv_store(hash, "word", strlen("word"), word, 0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200916 hv_store(hash, "dist", strlen("dist"), newSVnv(best[a].activation), 0);
917 hv_store(hash, "rank", strlen("rank"), newSVuv(best[a].wordi), 0);
918 AV *vector = newAV();
919 for (b = 0; b < size; b++) {
920 av_push(vector, newSVnv(M[b + best[a].wordi * size]));
921 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200922 hv_store(hash, "vector", strlen("vector"), newRV_noinc((SV *)vector), 0);
923 av_push(array, newRV_noinc((SV *)hash));
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200924 i++;
925 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200926 hv_store(result, "paradigmatic", strlen("paradigmatic"), newRV_noinc((SV *)array), 0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200927
Marc Kupietz969cab92019-08-05 11:13:42 +0200928 for (b = 0; b < MAX_NEIGHBOURS; b++) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200929 best[b].wordi = -1L;
930 best[b].activation = 0;
931 best[b].probability = 0;
932 best[b].position = 0;
933 best[b].activation_sum = 0;
Marc Kupietz969cab92019-08-05 11:13:42 +0200934 memset(best[b].heat, 0, sizeof(float) * 16);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200935 }
936
Marc Kupietz969cab92019-08-05 11:13:42 +0200937 float total_activation = 0;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200938
939 if (M2) {
940 printf("Waiting for syn threads to join\n");
941 fflush(stdout);
Marc Kupietz969cab92019-08-05 11:13:42 +0200942 for (a = 0; a < syn_threads; a++) pthread_join(pt[a + para_threads], (void *)&syn_nbs[a]);
943 for (a = 0; a <= syn_threads; a++) {
944 if (a == window) continue;
945 total_activation += window_sums[a];
Marc Kupietz59865a92021-03-11 17:16:51 +0100946 printf("window pos: %ld, sum: %f\n", a, window_sums[a]);
Marc Kupietz969cab92019-08-05 11:13:42 +0200947 }
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200948 printf("syn threads joint\n");
949 fflush(stdout);
950
Marc Kupietz969cab92019-08-05 11:13:42 +0200951 for (b = 0; b < syn_nbs[0]->length; b++) {
952 memcpy(best + b, &syn_nbs[0]->best[b], sizeof(collocator));
953 best[b].position = -1; // syn_nbs[0]->pos[b];
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200954 best[b].activation_sum = target_sums[syn_nbs[0]->best[b].wordi];
Marc Kupietz969cab92019-08-05 11:13:42 +0200955 best[b].max_activation = 0.0;
956 best[b].average = 0.0;
957 best[b].probability = 0.0;
958 best[b].cprobability = syn_nbs[0]->best[b].cprobability;
959 memset(best[b].heat, 0, sizeof(float) * 16);
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200960 }
Marc Kupietz969cab92019-08-05 11:13:42 +0200961
962 float best_window_sum[MAX_NEIGHBOURS];
Marc Kupietz59865a92021-03-11 17:16:51 +0100963 int found_index = 0, i = 0, w;
Marc Kupietz969cab92019-08-05 11:13:42 +0200964 for (a = 0; a < syn_threads; a++) {
965 for (b = 0; b < syn_nbs[a]->length; b++) {
966 for (i = 0; i < found_index; i++)
967 if (best[i].wordi == syn_nbs[a]->best[b].wordi)
968 break;
969 if (i >= found_index) {
970 best[found_index].max_activation = 0.0;
971 best[found_index].average = 0.0;
972 best[found_index].probability = 0.0;
973 memset(best[found_index].heat, 0, sizeof(float) * 16);
974 best[found_index].cprobability = syn_nbs[a]->best[b].cprobability;
975 best[found_index].activation_sum = target_sums[syn_nbs[a]->best[b].wordi]; // syn_nbs[a]->best[b].activation_sum;
976 best[found_index++].wordi = syn_nbs[a]->best[b].wordi;
977 // printf("found: %s\n", &vocab[syn_nbs[a]->index[b] * max_w]);
978 }
979 }
980 }
981 sort_by = 0; // ALWAYS AUTO-FOCUS
982 if (sort_by != 1 && sort_by != 2) { // sort by auto focus mean
983 printf("window: %d - syn_threads: %d, %d\n", window, syn_threads, (1 << syn_threads) - 1);
984 int wpos;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200985 int bits_set = 0;
Marc Kupietz969cab92019-08-05 11:13:42 +0200986 for (i = 0; i < found_index; i++) {
987 best[i].activation = best[i].probability = best[i].average = best[i].cprobability_sum = 0;
988 for (w = 1; w < (1 << syn_threads); w++) { // loop through all possible windows
989 float word_window_sum = 0, word_window_average = 0, word_cprobability_sum = 0, word_activation_sum = 0, total_window_sum = 0;
Marc Kupietzf11d20c2019-08-02 15:42:04 +0200990 bits_set = 0;
Marc Kupietz969cab92019-08-05 11:13:42 +0200991 for (a = 0; a < syn_threads; a++) {
992 if ((1 << a) & w) {
993 wpos = (a >= window ? a + 1 : a);
994 total_window_sum += window_sums[wpos];
995 }
996 }
997 // printf("%d window-sum %f\n", w, total_window_sum);
998 for (a = 0; a < syn_threads; a++) {
999 if ((1 << a) & w) {
1000 wpos = (a >= window ? a + 1 : a);
1001 bits_set++;
1002 for (b = 0; b < syn_nbs[a]->length; b++)
1003 if (best[i].wordi == syn_nbs[a]->best[b].wordi) {
1004 // float acti = syn_nbs[a]->best[b].activation / total_window_sum;
1005 // word_window_sum += syn_nbs[a]->dist[b] * syn_nbs[a]->norm[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
1006 // word_window_sum += syn_nbs[a]->norm[b]; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
1007 // word_window_sum = (word_window_sum + syn_nbs[a]->norm[b]) - (word_window_sum * syn_nbs[a]->norm[b]); // syn_nbs[a]->norm[b];
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001008
Marc Kupietz969cab92019-08-05 11:13:42 +02001009 word_window_sum += syn_nbs[a]->best[b].activation; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
1010 // word_window_sum += acti - (word_window_sum * acti); syn_nbs[a]->best[b].activation; // / window_sums[wpos]; // syn_nbs[a]->norm[b];
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001011
Marc Kupietz969cab92019-08-05 11:13:42 +02001012 word_window_average += syn_nbs[a]->best[b].activation; // - word_window_average * syn_nbs[a]->best[b].activation; // conormalied activation sum
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001013 word_cprobability_sum += syn_nbs[a]->best[b].cprobability - word_cprobability_sum * syn_nbs[a]->best[b].cprobability; // conormalied column probability sum
Marc Kupietz969cab92019-08-05 11:13:42 +02001014 word_activation_sum += syn_nbs[a]->best[b].activation;
1015 if (syn_nbs[a]->best[b].activation > best[i].max_activation)
1016 best[i].max_activation = syn_nbs[a]->best[b].activation;
1017 if (syn_nbs[a]->best[b].activation > best[i].heat[wpos])
1018 best[i].heat[wpos] = syn_nbs[a]->best[b].activation;
1019 }
1020 }
1021 }
1022 if (bits_set) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001023 word_window_average /= bits_set;
Marc Kupietz969cab92019-08-05 11:13:42 +02001024 // word_activation_sum /= bits_set;
1025 // word_window_sum /= bits_set;
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001026 }
1027
Marc Kupietz969cab92019-08-05 11:13:42 +02001028 word_window_sum /= total_window_sum;
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001029
Marc Kupietz969cab92019-08-05 11:13:42 +02001030 if (word_window_sum > best[i].probability) {
1031 // best[i].position = w;
1032 best[i].probability = word_window_sum;
1033 }
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001034
Marc Kupietz969cab92019-08-05 11:13:42 +02001035 if (word_cprobability_sum > best[i].cprobability_sum) {
1036 best[i].position = w;
1037 best[i].cprobability_sum = word_cprobability_sum;
1038 }
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001039
Marc Kupietz969cab92019-08-05 11:13:42 +02001040 best[i].average = word_window_average;
1041 // best[i].activation = word_activation_sum;
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001042 }
Marc Kupietz969cab92019-08-05 11:13:42 +02001043 }
1044 qsort(best, found_index, sizeof(collocator), cmp_probability);
1045 // for(i=0; i < found_index; i++) {
1046 // printf("found: %s - sum: %f - window: %d\n", &vocab[best[i].wordi * max_w], best[i].activation, best[i].position);
1047 // }
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001048
Marc Kupietz969cab92019-08-05 11:13:42 +02001049 } else if (sort_by == 1) { // responsiveness any window position
1050 int wpos;
1051 for (i = 0; i < found_index; i++) {
1052 float word_window_sum = 0, word_activation_sum = 0, total_window_sum = 0;
1053 for (a = 0; a < syn_threads; a++) {
1054 wpos = (a >= window ? a + 1 : a);
1055 for (b = 0; b < syn_nbs[a]->length; b++)
1056 if (best[i].wordi == syn_nbs[a]->best[b].wordi) {
1057 best[i].probability += syn_nbs[a]->best[b].probability;
1058 if (syn_nbs[a]->best[b].activation > 0.25)
1059 best[i].position |= 1 << wpos;
1060 if (syn_nbs[a]->best[b].activation > best[i].activation) {
1061 best[i].activation = syn_nbs[a]->best[b].activation;
1062 }
1063 }
1064 }
1065 }
1066 qsort(best, found_index, sizeof(collocator), cmp_activation);
1067 } else if (sort_by == 2) { // single window position
1068 for (a = 1; a < syn_threads; a++) {
1069 for (b = 0; b < syn_nbs[a]->length; b++) {
1070 for (c = 0; c < MAX_NEIGHBOURS; c++) {
1071 if (syn_nbs[a]->best[b].activation > best[c].activation) {
1072 for (d = MAX_NEIGHBOURS - 1; d > c; d--) {
1073 memmove(best + d, best + d - 1, sizeof(collocator));
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001074 }
1075 memcpy(best + c, &syn_nbs[a]->best[b], sizeof(collocator));
Marc Kupietz969cab92019-08-05 11:13:42 +02001076 best[c].position = 1 << (-syn_nbs[a]->best[b].position + window - (syn_nbs[a]->best[b].position < 0 ? 1 : 0));
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001077 break;
1078 }
1079 }
1080 }
1081 }
Marc Kupietz969cab92019-08-05 11:13:42 +02001082 } else { // sort by mean p
1083 for (a = 1; a < syn_threads; a++) {
1084 for (b = 0; b < syn_nbs[a]->length; b++) {
1085 for (c = 0; c < MAX_NEIGHBOURS; c++) {
1086 if (target_sums[syn_nbs[a]->best[b].wordi] > best[c].activation_sum) {
1087 for (d = MAX_NEIGHBOURS - 1; d > c; d--) {
1088 memmove(best + d, best + d - 1, sizeof(collocator));
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001089 }
1090 memcpy(best + c, &syn_nbs[a]->best[b], sizeof(collocator));
Marc Kupietz969cab92019-08-05 11:13:42 +02001091 best[c].position = (1 << 2 * window) - 1; // syn_nbs[a]->pos[b];
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001092 best[c].activation_sum = target_sums[syn_nbs[a]->best[b].wordi];
1093 break;
1094 }
1095 }
1096 }
1097 }
1098 }
1099 array = newAV();
Marc Kupietz969cab92019-08-05 11:13:42 +02001100 for (a = 0, i = 0; a < MAX_NEIGHBOURS && best[a].wordi >= 0; a++) {
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001101 long long c = best[a].wordi;
Marc Kupietz969cab92019-08-05 11:13:42 +02001102 /*
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001103 if (dedupe) {
1104 int filtered=0;
1105 for (j=0; j<i; j++)
1106 if (strcasestr(&vocab[c * max_w], chosen[j]) ||
1107 strcasestr(chosen[j], &vocab[c * max_w])) {
1108 printf("filtering %s %s\n", chosen[j], &vocab[c * max_w]);
1109 filtered = 1;
1110 }
1111 if(filtered)
1112 continue;
1113 }
1114*/
Marc Kupietz969cab92019-08-05 11:13:42 +02001115 chosen[i++] = c;
1116 HV *hash = newHV();
1117 SV *word = newSVpvf(&vocab[best[a].wordi * max_w], 0);
1118 AV *heat = newAV();
1119 if (latin_enc == 0) SvUTF8_on(word);
1120 hv_store(hash, "word", strlen("word"), word, 0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001121 hv_store(hash, "rank", strlen("rank"), newSVuv(best[a].wordi), 0);
1122 hv_store(hash, "average", strlen("average"), newSVnv(best[a].average), 0);
1123 hv_store(hash, "prob", strlen("prob"), newSVnv(best[a].probability), 0);
1124 hv_store(hash, "cprob", strlen("cprob"), newSVnv(best[a].cprobability_sum), 0);
Marc Kupietz969cab92019-08-05 11:13:42 +02001125 hv_store(hash, "max", strlen("max"), newSVnv(best[a].max_activation), 0); // newSVnv(target_sums[best[a].wordi]), 0);
1126 hv_store(hash, "overall", strlen("overall"), newSVnv(best[a].activation_sum / total_activation), 0); // newSVnv(target_sums[best[a].wordi]), 0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001127 hv_store(hash, "pos", strlen("pos"), newSVnv(best[a].position), 0);
Marc Kupietz969cab92019-08-05 11:13:42 +02001128 best[a].heat[5] = 0;
1129 for (i = 10; i >= 0; i--) av_push(heat, newSVnv(best[a].heat[i]));
1130 hv_store(hash, "heat", strlen("heat"), newRV_noinc((SV *)heat), 0);
1131 av_push(array, newRV_noinc((SV *)hash));
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001132 }
Marc Kupietz969cab92019-08-05 11:13:42 +02001133 hv_store(result, "syntagmatic", strlen("syntagmatic"), newRV_noinc((SV *)array), 0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001134 }
1135end:
Marc Kupietz969cab92019-08-05 11:13:42 +02001136 free(best);
1137 return newRV_noinc((SV *)result);
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001138}
1139
1140int dump_vecs(char *fname) {
Marc Kupietz969cab92019-08-05 11:13:42 +02001141 long i, j;
1142 FILE *f;
1143 /* if(words>100000)
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001144 words=100000;
1145*/
Marc Kupietz969cab92019-08-05 11:13:42 +02001146 if ((f = fopen(fname, "w")) == NULL) {
1147 fprintf(stderr, "cannot open %s for writing\n", fname);
1148 return (-1);
1149 }
1150 fprintf(f, "%lld %lld\n", words, size);
1151 for (i = 0; i < words; i++) {
1152 fprintf(f, "%s ", &vocab[i * max_w]);
1153 for (j = 0; j < size - 1; j++)
1154 fprintf(f, "%f ", M[i * size + j]);
1155 fprintf(f, "%f\n", M[i * size + j]);
1156 }
1157 fclose(f);
1158 return (0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001159}
1160
1161int dump_for_numpy(char *fname) {
Marc Kupietz969cab92019-08-05 11:13:42 +02001162 long i, j;
1163 FILE *f;
Marc Kupietzc0d41872021-02-25 16:33:22 +01001164 int max = words; // 300000;
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001165
Marc Kupietz969cab92019-08-05 11:13:42 +02001166 if ((f = fopen(fname, "w")) == NULL) {
1167 fprintf(stderr, "cannot open %s for writing\n", fname);
1168 return (-1);
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001169 }
Marc Kupietz969cab92019-08-05 11:13:42 +02001170 for (i = 0; i < max; i++) {
1171 for (j = 0; j < size - 1; j++)
1172 fprintf(f, "%f\t", M[i * size + j]);
1173 fprintf(f, "%f\n", M[i * size + j]);
1174 printf("%s\r\n", &vocab[i * max_w]);
1175 }
1176 if (merged_end > 0) {
1177 for (i = 0; i < max; i++) {
1178 for (j = 0; j < size - 1; j++)
1179 fprintf(f, "%f\t", M[(merged_end + i) * size + j]);
1180 fprintf(f, "%f\n", M[(merged_end + i) * size + j]);
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001181 printf("_%s\r\n", &vocab[i * max_w]);
1182 }
Marc Kupietz969cab92019-08-05 11:13:42 +02001183 }
1184 fclose(f);
1185 return (0);
Marc Kupietzf11d20c2019-08-02 15:42:04 +02001186}
Marc Kupietz043db152023-11-05 17:47:53 +01001187
1188unsigned long getVocabSize() {
1189 return (unsigned long) words;
1190}