Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
21#include <pthread.h>
Marc Kupietz613edbf2018-01-11 21:38:03 +010022#include <collocatordb.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010023
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010028#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010029#define MAX_CODE_LENGTH 40
30
31const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
32
33typedef float real; // Precision of float numbers
34
35struct vocab_word {
36 long long cn;
37 int *point;
38 char *word, *code, codelen;
39};
40
41char train_file[MAX_STRING], output_file[MAX_STRING];
42char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
43char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
Marc Kupietze423f732017-12-22 17:57:03 +010044char magic_stop_file[MAX_STRING];
45
Marc Kupietzd6f9c712016-03-16 11:50:56 +010046struct vocab_word *vocab;
47int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietz879333c2023-12-20 11:41:09 +010048 num_threads = 12, min_reduce = 1, metadata_categories = 0, expected_metadata_categories = 0;
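// Model type selected with -type (see the branches in TrainModelThread below):
//   0 = CBOW, 1 = skip-gram (default), 2 = continuous window (cwindow),
//   3 = structured skip-gram, 4 = senna-style window + hidden layer,
//   5 = no training, only collocation counting into the collocator database.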
Marc Kupietzd6f9c712016-03-16 11:50:56 +010049int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020050long long *threadPos;
51int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010052long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
53long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
54 classes = 0;
55real alpha = 0.025, starting_alpha, sample = 1e-3;
56real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020057real avgWordLength=0;
Marc Kupietzb366bcd2018-01-11 21:29:41 +010058clock_t start, start_clock;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010059
60real *syn1_window, *syn1neg_window, *syn1nce_window;
61int w_offset, window_layer_size;
62
63int window_hidden_size = 500;
64real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
65 *syn_hidden_word_nce;
66
67int hs = 0, negative = 5;
68const int table_size = 1e8;
69int *table;
70
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010071long cc = 0;
72
Marc Kupietzd6f9c712016-03-16 11:50:56 +010073// contrastive negative sampling
74char negative_classes_file[MAX_STRING];
75int *word_to_group;
76int *group_to_table; //group_size*table_size
77int class_number;
78
79//nce
80real* noise_distribution;
81int nce = 0;
82
83//param caps
84real CAP_VALUE = 50;
85int cap = 0;
86
Marc Kupietz613edbf2018-01-11 21:38:03 +010087COLLOCATORDB *cdb = NULL;
88
Marc Kupietzd6f9c712016-03-16 11:50:56 +010089void capParam(real* array, int index) {
90 if (array[index] > CAP_VALUE)
91 array[index] = CAP_VALUE;
92 else if (array[index] < -CAP_VALUE)
93 array[index] = -CAP_VALUE;
94}
95
96real hardTanh(real x) {
97 if (x >= 1) {
98 return 1;
99 } else if (x <= -1) {
100 return -1;
101 } else {
102 return x;
103 }
104}
105
106real dHardTanh(real x, real g) {
107 if (x > 1 && g > 0) {
108 return 0;
109 }
110 if (x < -1 && g < 0) {
111 return 0;
112 }
113 return 1;
114}
115
116void InitUnigramTable() {
117 int a, i;
118 long long train_words_pow = 0;
119 real d1, power = 0.75;
120 table = (int *) malloc(table_size * sizeof(int));
121 for (a = 0; a < vocab_size; a++)
122 train_words_pow += pow(vocab[a].cn, power);
123 i = 0;
124 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
125 for (a = 0; a < table_size; a++) {
126 table[a] = i;
127 if (a / (real) table_size > d1) {
128 i++;
129 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
130 }
131 if (i >= vocab_size)
132 i = vocab_size - 1;
133 }
134
135 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
136 for (a = 0; a < vocab_size; a++)
137 noise_distribution[a] = pow(vocab[a].cn, power)
138 / (real) train_words_pow;
139}
140
141// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
142void ReadWord(char *word, FILE *fin) {
143 int a = 0, ch;
144 while (!feof(fin)) {
145 ch = fgetc(fin);
146 if (ch == 13)
147 continue;
148 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
Marc Kupietz879333c2023-12-20 11:41:09 +0100149 if (ch == '\t' && expected_metadata_categories > 0) {
150 a = 0;
151 expected_metadata_categories--;
152 } else {
153 if (a > 0) {
154 if (ch == '\n') {
155 expected_metadata_categories = metadata_categories;
156 ungetc(ch, fin);
157 }
158 break;
159 }
160 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100161 if (ch == '\n') {
162 strcpy(word, (char *) "</s>");
Marc Kupietz879333c2023-12-20 11:41:09 +0100163 expected_metadata_categories = metadata_categories;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100164 return;
165 } else
166 continue;
167 }
168 word[a] = ch;
169 a++;
170 if (a >= MAX_STRING - 1)
171 a--; // Truncate too long words
172 }
173 word[a] = 0;
174}
175
176// Returns hash value of a word
177int GetWordHash(char *word) {
178 unsigned long long a, hash = 0;
179 for (a = 0; a < strlen(word); a++)
180 hash = hash * 257 + word[a];
181 hash = hash % vocab_hash_size;
182 return hash;
183}
184
185// Returns position of a word in the vocabulary; if the word is not found, returns -1
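// The vocabulary hash uses open addressing with linear probing: GetWordHash gives
// the start slot and the search advances one slot at a time until the word or an
// empty slot (-1) is found.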
186int SearchVocab(char *word) {
187 unsigned int hash = GetWordHash(word);
188 while (1) {
189 if (vocab_hash[hash] == -1)
190 return -1;
191 if (!strcmp(word, vocab[vocab_hash[hash]].word))
192 return vocab_hash[hash];
193 hash = (hash + 1) % vocab_hash_size;
194 }
195 return -1;
196}
197
198// Reads a word and returns its index in the vocabulary
199int ReadWordIndex(FILE *fin) {
200 char word[MAX_STRING];
201 ReadWord(word, fin);
202 if (feof(fin))
203 return -1;
204 return SearchVocab(word);
205}
206
207// Adds a word to the vocabulary
208int AddWordToVocab(char *word) {
209 unsigned int hash, length = strlen(word) + 1;
210 if (length > MAX_STRING)
211 length = MAX_STRING;
212 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
213 strcpy(vocab[vocab_size].word, word);
214 vocab[vocab_size].cn = 0;
215 vocab_size++;
216 // Reallocate memory if needed
217 if (vocab_size + 2 >= vocab_max_size) {
218 vocab_max_size += 1000;
219 vocab = (struct vocab_word *) realloc(vocab,
220 vocab_max_size * sizeof(struct vocab_word));
221 }
222 hash = GetWordHash(word);
223 while (vocab_hash[hash] != -1)
224 hash = (hash + 1) % vocab_hash_size;
225 vocab_hash[hash] = vocab_size - 1;
226 return vocab_size - 1;
227}
228
229// Used later for sorting by word counts
230int VocabCompare(const void *a, const void *b) {
231 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
232}
233
234// Sorts the vocabulary by frequency using word counts
235void SortVocab() {
236 int a, size;
237 unsigned int hash;
238 // Sort the vocabulary and keep </s> at the first position
239 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
240 for (a = 0; a < vocab_hash_size; a++)
241 vocab_hash[a] = -1;
242 size = vocab_size;
243 train_words = 0;
244 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200245 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100246 // Words occurring less than min_count times will be discarded from the vocab
247 if ((vocab[a].cn < min_count) && (a != 0)) {
248 vocab_size--;
249 free(vocab[a].word);
250 } else {
251 // Hash will be re-computed, as it is no longer valid after the sorting
252 hash = GetWordHash(vocab[a].word);
253 while (vocab_hash[hash] != -1)
254 hash = (hash + 1) % vocab_hash_size;
255 vocab_hash[hash] = a;
256 train_words += vocab[a].cn;
257 }
258 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200259 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100260 vocab = (struct vocab_word *) realloc(vocab,
261 (vocab_size + 1) * sizeof(struct vocab_word));
262 // Allocate memory for the binary tree construction
263 for (a = 0; a < vocab_size; a++) {
264 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
265 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
266 }
267}
268
269// Reduces the vocabulary by removing infrequent tokens
270void ReduceVocab() {
271 int a, b = 0;
272 unsigned int hash;
273 for (a = 0; a < vocab_size; a++)
274 if (vocab[a].cn > min_reduce) {
275 vocab[b].cn = vocab[a].cn;
276 vocab[b].word = vocab[a].word;
277 b++;
278 } else
279 free(vocab[a].word);
280 vocab_size = b;
281 for (a = 0; a < vocab_hash_size; a++)
282 vocab_hash[a] = -1;
283 for (a = 0; a < vocab_size; a++) {
284 // Hash will be re-computed, as it is no longer valid
285 hash = GetWordHash(vocab[a].word);
286 while (vocab_hash[hash] != -1)
287 hash = (hash + 1) % vocab_hash_size;
288 vocab_hash[hash] = a;
289 }
290 fflush(stdout);
291 min_reduce++;
292}
293
294// Create binary Huffman tree using the word counts
295// Frequent words will have short unique binary codes
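// For each word, code[] stores the binary decisions on the path from the root and
// point[] the indices of the inner nodes; both are consumed by the
// hierarchical-softmax branches in TrainModelThread.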
296void CreateBinaryTree() {
297 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
298 char code[MAX_CODE_LENGTH];
299 long long *count = (long long *) calloc(vocab_size * 2 + 1,
300 sizeof(long long));
301 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
302 sizeof(long long));
303 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
304 sizeof(long long));
305 for (a = 0; a < vocab_size; a++)
306 count[a] = vocab[a].cn;
307 for (a = vocab_size; a < vocab_size * 2; a++)
308 count[a] = 1e15;
309 pos1 = vocab_size - 1;
310 pos2 = vocab_size;
311 // The following algorithm constructs the Huffman tree by adding one node at a time
312 for (a = 0; a < vocab_size - 1; a++) {
313 // First, find two smallest nodes 'min1, min2'
314 if (pos1 >= 0) {
315 if (count[pos1] < count[pos2]) {
316 min1i = pos1;
317 pos1--;
318 } else {
319 min1i = pos2;
320 pos2++;
321 }
322 } else {
323 min1i = pos2;
324 pos2++;
325 }
326 if (pos1 >= 0) {
327 if (count[pos1] < count[pos2]) {
328 min2i = pos1;
329 pos1--;
330 } else {
331 min2i = pos2;
332 pos2++;
333 }
334 } else {
335 min2i = pos2;
336 pos2++;
337 }
338 count[vocab_size + a] = count[min1i] + count[min2i];
339 parent_node[min1i] = vocab_size + a;
340 parent_node[min2i] = vocab_size + a;
341 binary[min2i] = 1;
342 }
343 // Now assign binary code to each vocabulary word
344 for (a = 0; a < vocab_size; a++) {
345 b = a;
346 i = 0;
347 while (1) {
348 code[i] = binary[b];
349 point[i] = b;
350 i++;
351 b = parent_node[b];
352 if (b == vocab_size * 2 - 2)
353 break;
354 }
355 vocab[a].codelen = i;
356 vocab[a].point[0] = vocab_size - 2;
357 for (b = 0; b < i; b++) {
358 vocab[a].code[i - b - 1] = code[b];
359 vocab[a].point[i - b] = point[b] - vocab_size;
360 }
361 }
362 free(count);
363 free(binary);
364 free(parent_node);
365}
366
367void LearnVocabFromTrainFile() {
368 char word[MAX_STRING];
369 FILE *fin;
370 long long a, i;
371 for (a = 0; a < vocab_hash_size; a++)
372 vocab_hash[a] = -1;
373 fin = fopen(train_file, "rb");
374 if (fin == NULL) {
375 printf("ERROR: training data file not found!\n");
376 exit(1);
377 }
378 vocab_size = 0;
379 AddWordToVocab((char *) "</s>");
Marc Kupietz879333c2023-12-20 11:41:09 +0100380 for (int j=0; j < metadata_categories; j++) {
381 ReadWord(word, fin);
382 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100383 while (1) {
384 ReadWord(word, fin);
385 if (feof(fin))
386 break;
387 train_words++;
388 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
389 printf("%lldK%c", train_words / 1000, 13);
390 fflush(stdout);
391 }
392 i = SearchVocab(word);
393 if (i == -1) {
394 a = AddWordToVocab(word);
395 vocab[a].cn = 1;
396 } else
397 vocab[i].cn++;
398 if (vocab_size > vocab_hash_size * 0.7)
399 ReduceVocab();
400 }
401 SortVocab();
402 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200403 printf("Vocab size: %'lld\n", vocab_size);
404 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100405 }
406 file_size = ftell(fin);
407 fclose(fin);
408}
409
410void SaveVocab() {
411 long long i;
412 FILE *fo = fopen(save_vocab_file, "wb");
413 for (i = 0; i < vocab_size; i++)
414 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
415 fclose(fo);
416}
417
418void ReadVocab() {
419 long long a, i = 0;
420 char c;
421 char word[MAX_STRING];
422 FILE *fin = fopen(read_vocab_file, "rb");
423 if (fin == NULL) {
424 printf("Vocabulary file not found\n");
425 exit(1);
426 }
427 for (a = 0; a < vocab_hash_size; a++)
428 vocab_hash[a] = -1;
429 vocab_size = 0;
430 while (1) {
431 ReadWord(word, fin);
432 if (feof(fin))
433 break;
434 a = AddWordToVocab(word);
435 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
436 i++;
437 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200438 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100439 fin = fopen(train_file, "rb");
440 if (fin == NULL) {
441 printf("ERROR: training data file not found!\n");
442 exit(1);
443 }
444 fseek(fin, 0, SEEK_END);
445 file_size = ftell(fin);
446 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200447 SortVocab();
448 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200449 printf("Vocab size: %'lld\n", vocab_size);
450 printf("Words in vocab's train file: %'lld\n", train_words);
451 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200452 }
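	// The training file is not re-scanned when a vocabulary is read from disk;
	// instead the number of training words is estimated from the file size and the
	// count-weighted average word length computed in SortVocab.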
Marc Kupietze23c5402016-07-14 11:10:09 +0200453 train_words = file_size / avgWordLength;
454 if(debug_mode > 0)
455 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100456}
457
458void InitClassUnigramTable() {
459 long long a, c;
460 printf("loading class unigrams \n");
461 FILE *fin = fopen(negative_classes_file, "rb");
462 if (fin == NULL) {
463 printf("ERROR: class file not found!\n");
464 exit(1);
465 }
466 word_to_group = (int *) malloc(vocab_size * sizeof(int));
467 for (a = 0; a < vocab_size; a++)
468 word_to_group[a] = -1;
469 char class[MAX_STRING];
470 char prev_class[MAX_STRING];
471 prev_class[0] = 0;
472 char word[MAX_STRING];
473 class_number = -1;
474 while (1) {
475 if (feof(fin))
476 break;
477 ReadWord(class, fin);
478 ReadWord(word, fin);
479 int word_index = SearchVocab(word);
480 if (word_index != -1) {
481 if (strcmp(class, prev_class) != 0) {
482 class_number++;
483 strcpy(prev_class, class);
484 }
485 word_to_group[word_index] = class_number;
486 }
487 ReadWord(word, fin);
488 }
489 class_number++;
490 fclose(fin);
491
492 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
493 long long train_words_pow = 0;
494 real d1, power = 0.75;
495
496 for (c = 0; c < class_number; c++) {
497 long long offset = c * table_size;
498 train_words_pow = 0;
499 for (a = 0; a < vocab_size; a++)
500 if (word_to_group[a] == c)
501 train_words_pow += pow(vocab[a].cn, power);
502 int i = 0;
503 while (word_to_group[i] != c && i < vocab_size)
504 i++;
505 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
506 for (a = 0; a < table_size; a++) {
507 //printf("index %lld , word %d\n", a, i);
508 group_to_table[offset + a] = i;
509 if (a / (real) table_size > d1) {
510 i++;
511 while (word_to_group[i] != c && i < vocab_size)
512 i++;
513 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
514 }
515 if (i >= vocab_size)
516 while (word_to_group[i] != c && i >= 0)
517 i--;
518 }
519 }
520}
521
Marc Kupietz61485ad2023-12-22 16:16:59 +0100522void SaveArgs(unsigned int argc, char **argv) {
Marc Kupietz210b9d52016-04-02 21:48:13 +0200523 unsigned int i;
Marc Kupietz44136742017-12-22 17:52:56 +0100524 char args_file[MAX_STRING];
525 strcpy(args_file, output_file);
Marc Kupietz210b9d52016-04-02 21:48:13 +0200526 strcat(args_file, ".args");
527 FILE *fargs = fopen(args_file, "w");
528 if (fargs == NULL) {
529 printf("Cannot save args to %s.\n", args_file);
530 return;
531 }
532
Marc Kupietz44136742017-12-22 17:52:56 +0100533 for(i=1; i<argc; i++)
534 fprintf(fargs, "%s ", argv[i]);
535
536 fprintf(fargs, "\n");
Marc Kupietz210b9d52016-04-02 21:48:13 +0200537 fclose(fargs);
Marc Kupietz44136742017-12-22 17:52:56 +0100538
Marc Kupietz210b9d52016-04-02 21:48:13 +0200539 return;
540}
541
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100542void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100543 if(type != 3 || negative <= 0) {
544 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
545 return;
546 }
547
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100548 FILE *fnet = fopen(save_net_file, "wb");
549 if (fnet == NULL) {
550 printf("Net parameter file not found\n");
551 exit(1);
552 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100553 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100554 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100555 fclose(fnet);
556}
557
558void InitNet() {
559 long long a, b;
560 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100561 long long read;
562
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100563 window_layer_size = layer1_size * window * 2;
564 a = posix_memalign((void **) &syn0, 128,
565 (long long) vocab_size * layer1_size * sizeof(real));
566 if (syn0 == NULL) {
567 printf("Memory allocation failed\n");
568 exit(1);
569 }
570
571 if (hs) {
572 a = posix_memalign((void **) &syn1, 128,
573 (long long) vocab_size * layer1_size * sizeof(real));
574 if (syn1 == NULL) {
575 printf("Memory allocation failed\n");
576 exit(1);
577 }
578 a = posix_memalign((void **) &syn1_window, 128,
579 (long long) vocab_size * window_layer_size * sizeof(real));
580 if (syn1_window == NULL) {
581 printf("Memory allocation failed\n");
582 exit(1);
583 }
584 a = posix_memalign((void **) &syn_hidden_word, 128,
585 (long long) vocab_size * window_hidden_size * sizeof(real));
586 if (syn_hidden_word == NULL) {
587 printf("Memory allocation failed\n");
588 exit(1);
589 }
590
591 for (a = 0; a < vocab_size; a++)
592 for (b = 0; b < layer1_size; b++)
593 syn1[a * layer1_size + b] = 0;
594 for (a = 0; a < vocab_size; a++)
595 for (b = 0; b < window_layer_size; b++)
596 syn1_window[a * window_layer_size + b] = 0;
597 for (a = 0; a < vocab_size; a++)
598 for (b = 0; b < window_hidden_size; b++)
599 syn_hidden_word[a * window_hidden_size + b] = 0;
600 }
601 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100602 if(type == 0) {
603 a = posix_memalign((void **) &syn1neg, 128,
604 (long long) vocab_size * layer1_size * sizeof(real));
605 if (syn1neg == NULL) {
606 printf("Memory allocation failed\n");
607 exit(1);
608 }
609 for (a = 0; a < vocab_size; a++)
610 for (b = 0; b < layer1_size; b++)
611 syn1neg[a * layer1_size + b] = 0;
612 } else if (type == 3) {
613 a = posix_memalign((void **) &syn1neg_window, 128,
614 (long long) vocab_size * window_layer_size * sizeof(real));
615 if (syn1neg_window == NULL) {
616 printf("Memory allocation failed\n");
617 exit(1);
618 }
619 for (a = 0; a < vocab_size; a++)
620 for (b = 0; b < window_layer_size; b++)
621 syn1neg_window[a * window_layer_size + b] = 0;
622 } else if (type == 4) {
623 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
624 (long long) vocab_size * window_hidden_size * sizeof(real));
625 if (syn_hidden_word_neg == NULL) {
626 printf("Memory allocation failed\n");
627 exit(1);
628 }
629 for (a = 0; a < vocab_size; a++)
630 for (b = 0; b < window_hidden_size; b++)
631 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100632 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100633 }
634 if (nce > 0) {
635 a = posix_memalign((void **) &syn1nce, 128,
636 (long long) vocab_size * layer1_size * sizeof(real));
637 if (syn1nce == NULL) {
638 printf("Memory allocation failed\n");
639 exit(1);
640 }
641 a = posix_memalign((void **) &syn1nce_window, 128,
642 (long long) vocab_size * window_layer_size * sizeof(real));
643 if (syn1nce_window == NULL) {
644 printf("Memory allocation failed\n");
645 exit(1);
646 }
647 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
648 (long long) vocab_size * window_hidden_size * sizeof(real));
649 if (syn_hidden_word_nce == NULL) {
650 printf("Memory allocation failed\n");
651 exit(1);
652 }
653
654 for (a = 0; a < vocab_size; a++)
655 for (b = 0; b < layer1_size; b++)
656 syn1nce[a * layer1_size + b] = 0;
657 for (a = 0; a < vocab_size; a++)
658 for (b = 0; b < window_layer_size; b++)
659 syn1nce_window[a * window_layer_size + b] = 0;
660 for (a = 0; a < vocab_size; a++)
661 for (b = 0; b < window_hidden_size; b++)
662 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
663 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100664
Marc Kupietz1006a272016-03-16 15:50:20 +0100665 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100666 a = posix_memalign((void **) &syn_window_hidden, 128,
667 window_hidden_size * window_layer_size * sizeof(real));
668 if (syn_window_hidden == NULL) {
669 printf("Memory allocation failed\n");
670 exit(1);
671 }
672 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
673 next_random = next_random * (unsigned long long) 25214903917 + 11;
674 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
675 - 0.5) / (window_hidden_size * window_layer_size);
676 }
677 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100678
679 if (read_net_file[0] == 0) {
680 for (a = 0; a < vocab_size; a++)
681 for (b = 0; b < layer1_size; b++) {
682 next_random = next_random * (unsigned long long) 25214903917
683 + 11;
684 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
685 / (real) 65536) - 0.5) / layer1_size;
686 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100687 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100688 FILE *fnet = fopen(read_net_file, "rb");
689 if (fnet == NULL) {
690 printf("Net parameter file not found\n");
691 exit(1);
692 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100693 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
694 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
695 if(read != vocab_size * layer1_size) {
696 fprintf(stderr, "read-net failed %lld\n", read);
697 exit(-1);
698 }
699 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
700 if(read != (long long) vocab_size * window_layer_size) {
701 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
702 (long long) vocab_size * window_layer_size);
703 exit(-1);
704 }
705 fgetc(fnet);
706 if(!feof(fnet)) {
707 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
708 exit(-1);
709 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100710 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100711 } else {
712 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
713 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100714 }
715
716 CreateBinaryTree();
717}
718
Marc Kupietz202723e2016-07-14 09:12:00 +0200719char *currentDateTime(char *buf, real offset) {
720 time_t t;
721 time(&t);
722 t += (long) offset;
723 struct tm tstruct;
724 tstruct = *localtime(&t);
725 strftime(buf, 80, "%c", &tstruct);
726 return buf;
727}
728
729void *MonitorThread(void *id) {
730 char *timebuf = malloc(80);
731 int i, n=num_threads;
732 long long sum;
733 sleep(1);
734 while(n > 0) {
735 sleep(1);
736 sum = n = 0;
737 for(i=0; i < num_threads; i++) {
738 if(threadPos[i] >= 0) {
739 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
740 n++;
741 } else {
742 sum += iter * file_size / num_threads;
743 }
744 }
745 if(n == 0)
746 break;
747 real finished_portion = (real) sum / (float) (file_size * iter);
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100748 long long now = time(NULL);
749 long long elapsed = (now - start);
750 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
Marc Kupietz202723e2016-07-14 09:12:00 +0200751
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100752 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
Marc Kupietz202723e2016-07-14 09:12:00 +0200753 alpha,
754 finished_portion * 100,
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100755 (float) sum / elapsed / 1000,
Marc Kupietz202723e2016-07-14 09:12:00 +0200756 elapsed,
757 ttg,
758 currentDateTime(timebuf, ttg)
759 );
760 fflush(stdout);
761 }
762 pthread_exit(NULL);
763}
764
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100765void *TrainModelThread(void *id) {
766 long long a, b, d, cw, word, last_word, sentence_length = 0,
767 sentence_position = 0;
768 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
769 long long l1, l2, c, target, label, local_iter = iter;
770 unsigned long long next_random = (long long) id;
771 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100772 int input_len_1 = layer1_size;
773 int window_offset = -1;
774 if (type == 2 || type == 4) {
775 input_len_1 = window_layer_size;
776 }
777 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
778 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200779 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100780
781 int input_len_2 = 0;
782 if (type == 4) {
783 input_len_2 = window_hidden_size;
784 }
785 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
786 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
787
788 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200789 long long start_pos = file_size / (long long) num_threads * (long long) id;
790 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
791 long long current_pos = start_pos;
792 long long last_pos = start_pos;
793 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100794 while (1) {
Marc Kupietz202723e2016-07-14 09:12:00 +0200795 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100796 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200797 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100798 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100799 alpha = starting_alpha
800 * (1 - word_count_actual / (real) (iter * train_words + 1));
801 if (alpha < starting_alpha * 0.0001)
802 alpha = starting_alpha * 0.0001;
803 }
804 if (sentence_length == 0) {
805 while (1) {
806 word = ReadWordIndex(fi);
807 if (feof(fi))
808 break;
809 if (word == -1)
810 continue;
811 word_count++;
812 if (word == 0)
813 break;
814 // The subsampling randomly discards frequent words while keeping the ranking the same
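				// A word with count cn is kept with probability
				//   (sqrt(cn / (sample * train_words)) + 1) * (sample * train_words) / cn,
				// so words far above the sample threshold are discarded most often.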
815 if (sample > 0) {
816 real ran = (sqrt(vocab[word].cn / (sample * train_words))
817 + 1) * (sample * train_words) / vocab[word].cn;
818 next_random = next_random * (unsigned long long) 25214903917
819 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100820 if (ran < (next_random & 0xFFFF) / (real) 65536) {
821 if(type == 3) // in structured skipgrams
822 word = -2; // keep the window position correct
823 else
824 continue;
825 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100826 }
827 sen[sentence_length] = word;
828 sentence_length++;
829 if (sentence_length >= MAX_SENTENCE_LENGTH)
830 break;
831 }
832 sentence_position = 0;
833 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200834 current_pos = threadPos[(long) id] = ftell(fi);
835 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100836 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200837 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100838 local_iter--;
839 if (local_iter == 0)
840 break;
Marc Kupietze423f732017-12-22 17:57:03 +0100841 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
842 printf("Magic stop file %s found. Stopping training ...\n", magic_stop_file);
843 break;
844 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100845 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200846 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100847 last_word_count = 0;
848 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200849 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100850 continue;
851 }
852 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200853 while (word == -2 && sentence_position<sentence_length)
854 word = sen[++sentence_position];
855 if (sentence_position>=sentence_length) {
856 sentence_length=0;
857 continue;
858 }
859 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100860 continue;
861 for (c = 0; c < input_len_1; c++)
862 neu1[c] = 0;
863 for (c = 0; c < input_len_1; c++)
864 neu1e[c] = 0;
865 for (c = 0; c < input_len_2; c++)
866 neu2[c] = 0;
867 for (c = 0; c < input_len_2; c++)
868 neu2e[c] = 0;
869 next_random = next_random * (unsigned long long) 25214903917 + 11;
870 b = next_random % window;
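		// b randomly shrinks the context window for the cbow, skip-gram and type-5
		// branches: only positions b .. 2*window-b are used, so the effective radius
		// is drawn uniformly from 1..window; the cwindow, structured skip-gram and
		// senna branches always use the full window.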
871 if (type == 0) { //train the cbow architecture
872 // in -> hidden
873 cw = 0;
874 for (a = b; a < window * 2 + 1 - b; a++)
875 if (a != window) {
876 c = sentence_position - window + a;
877 if (c < 0)
878 continue;
879 if (c >= sentence_length)
880 continue;
881 last_word = sen[c];
882 if (last_word == -1)
883 continue;
884 for (c = 0; c < layer1_size; c++)
885 neu1[c] += syn0[c + last_word * layer1_size];
886 cw++;
887 }
888 if (cw) {
889 for (c = 0; c < layer1_size; c++)
890 neu1[c] /= cw;
891 if (hs)
892 for (d = 0; d < vocab[word].codelen; d++) {
893 f = 0;
894 l2 = vocab[word].point[d] * layer1_size;
895 // Propagate hidden -> output
896 for (c = 0; c < layer1_size; c++)
897 f += neu1[c] * syn1[c + l2];
898 if (f <= -MAX_EXP)
899 continue;
900 else if (f >= MAX_EXP)
901 continue;
902 else
903 f = expTable[(int) ((f + MAX_EXP)
904 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
905 // 'g' is the gradient multiplied by the learning rate
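					// expTable turns f into sigmoid(f); with label = 1 - code[d],
					// (label - sigmoid(f)) * alpha is the gradient of this tree node's
					// log-likelihood scaled by the learning rate.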
906 g = (1 - vocab[word].code[d] - f) * alpha;
907 // Propagate errors output -> hidden
908 for (c = 0; c < layer1_size; c++)
909 neu1e[c] += g * syn1[c + l2];
910 // Learn weights hidden -> output
911 for (c = 0; c < layer1_size; c++)
912 syn1[c + l2] += g * neu1[c];
913 if (cap == 1)
914 for (c = 0; c < layer1_size; c++)
915 capParam(syn1, c + l2);
916 }
917 // NEGATIVE SAMPLING
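				// One positive example (d == 0, label 1) plus `negative` samples drawn
				// from the unigram table (or from the per-class table if a negative-classes
				// file was given), each with label 0; index 0 (</s>) is remapped to a
				// random word.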
918 if (negative > 0)
919 for (d = 0; d < negative + 1; d++) {
920 if (d == 0) {
921 target = word;
922 label = 1;
923 } else {
924 next_random = next_random
925 * (unsigned long long) 25214903917 + 11;
926 if (word_to_group != NULL
927 && word_to_group[word] != -1) {
928 target = word;
929 while (target == word) {
930 target = group_to_table[word_to_group[word]
931 * table_size
932 + (next_random >> 16) % table_size];
933 next_random = next_random
934 * (unsigned long long) 25214903917
935 + 11;
936 }
937 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
938 } else {
939 target =
940 table[(next_random >> 16) % table_size];
941 }
942 if (target == 0)
943 target = next_random % (vocab_size - 1) + 1;
944 if (target == word)
945 continue;
946 label = 0;
947 }
948 l2 = target * layer1_size;
949 f = 0;
950 for (c = 0; c < layer1_size; c++)
951 f += neu1[c] * syn1neg[c + l2];
952 if (f > MAX_EXP)
953 g = (label - 1) * alpha;
954 else if (f < -MAX_EXP)
955 g = (label - 0) * alpha;
956 else
957 g = (label
958 - expTable[(int) ((f + MAX_EXP)
959 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
960 * alpha;
961 for (c = 0; c < layer1_size; c++)
962 neu1e[c] += g * syn1neg[c + l2];
963 for (c = 0; c < layer1_size; c++)
964 syn1neg[c + l2] += g * neu1[c];
965 if (cap == 1)
966 for (c = 0; c < layer1_size; c++)
967 capParam(syn1neg, c + l2);
968 }
969 // Noise Contrastive Estimation
970 if (nce > 0)
971 for (d = 0; d < nce + 1; d++) {
972 if (d == 0) {
973 target = word;
974 label = 1;
975 } else {
976 next_random = next_random
977 * (unsigned long long) 25214903917 + 11;
978 if (word_to_group != NULL
979 && word_to_group[word] != -1) {
980 target = word;
981 while (target == word) {
982 target = group_to_table[word_to_group[word]
983 * table_size
984 + (next_random >> 16) % table_size];
985 next_random = next_random
986 * (unsigned long long) 25214903917
987 + 11;
988 }
989 } else {
990 target =
991 table[(next_random >> 16) % table_size];
992 }
993 if (target == 0)
994 target = next_random % (vocab_size - 1) + 1;
995 if (target == word)
996 continue;
997 label = 0;
998 }
999 l2 = target * layer1_size;
1000 f = 0;
1001
1002 for (c = 0; c < layer1_size; c++)
1003 f += neu1[c] * syn1nce[c + l2];
1004 if (f > MAX_EXP)
1005 g = (label - 1) * alpha;
1006 else if (f < -MAX_EXP)
1007 g = (label - 0) * alpha;
1008 else {
1009 f = exp(f);
1010 g =
1011 (label
1012 - f
1013 / (noise_distribution[target]
1014 * nce + f)) * alpha;
1015 }
1016 for (c = 0; c < layer1_size; c++)
1017 neu1e[c] += g * syn1nce[c + l2];
1018 for (c = 0; c < layer1_size; c++)
1019 syn1nce[c + l2] += g * neu1[c];
1020 if (cap == 1)
1021 for (c = 0; c < layer1_size; c++)
1022 capParam(syn1nce, c + l2);
1023 }
1024 // hidden -> in
1025 for (a = b; a < window * 2 + 1 - b; a++)
1026 if (a != window) {
1027 c = sentence_position - window + a;
1028 if (c < 0)
1029 continue;
1030 if (c >= sentence_length)
1031 continue;
1032 last_word = sen[c];
1033 if (last_word == -1)
1034 continue;
1035 for (c = 0; c < layer1_size; c++)
1036 syn0[c + last_word * layer1_size] += neu1e[c];
1037 }
1038 }
1039 } else if (type == 1) { //train skip-gram
1040 for (a = b; a < window * 2 + 1 - b; a++)
1041 if (a != window) {
1042 c = sentence_position - window + a;
1043 if (c < 0)
1044 continue;
1045 if (c >= sentence_length)
1046 continue;
1047 last_word = sen[c];
1048 if (last_word == -1)
1049 continue;
1050 l1 = last_word * layer1_size;
1051 for (c = 0; c < layer1_size; c++)
1052 neu1e[c] = 0;
1053 // HIERARCHICAL SOFTMAX
1054 if (hs)
1055 for (d = 0; d < vocab[word].codelen; d++) {
1056 f = 0;
1057 l2 = vocab[word].point[d] * layer1_size;
1058 // Propagate hidden -> output
1059 for (c = 0; c < layer1_size; c++)
1060 f += syn0[c + l1] * syn1[c + l2];
1061 if (f <= -MAX_EXP)
1062 continue;
1063 else if (f >= MAX_EXP)
1064 continue;
1065 else
1066 f = expTable[(int) ((f + MAX_EXP)
1067 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1068 // 'g' is the gradient multiplied by the learning rate
1069 g = (1 - vocab[word].code[d] - f) * alpha;
1070 // Propagate errors output -> hidden
1071 for (c = 0; c < layer1_size; c++)
1072 neu1e[c] += g * syn1[c + l2];
1073 // Learn weights hidden -> output
1074 for (c = 0; c < layer1_size; c++)
1075 syn1[c + l2] += g * syn0[c + l1];
1076 if (cap == 1)
1077 for (c = 0; c < layer1_size; c++)
1078 capParam(syn1, c + l2);
1079 }
1080 // NEGATIVE SAMPLING
1081 if (negative > 0)
1082 for (d = 0; d < negative + 1; d++) {
1083 if (d == 0) {
1084 target = word;
1085 label = 1;
1086 } else {
1087 next_random = next_random
1088 * (unsigned long long) 25214903917 + 11;
1089 if (word_to_group != NULL
1090 && word_to_group[word] != -1) {
1091 target = word;
1092 while (target == word) {
1093 target =
1094 group_to_table[word_to_group[word]
1095 * table_size
1096 + (next_random >> 16)
1097 % table_size];
1098 next_random =
1099 next_random
1100 * (unsigned long long) 25214903917
1101 + 11;
1102 }
1103 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1104 } else {
1105 target = table[(next_random >> 16)
1106 % table_size];
1107 }
1108 if (target == 0)
1109 target = next_random % (vocab_size - 1) + 1;
1110 if (target == word)
1111 continue;
1112 label = 0;
1113 }
1114 l2 = target * layer1_size;
1115 f = 0;
1116 for (c = 0; c < layer1_size; c++)
1117 f += syn0[c + l1] * syn1neg[c + l2];
1118 if (f > MAX_EXP)
1119 g = (label - 1) * alpha;
1120 else if (f < -MAX_EXP)
1121 g = (label - 0) * alpha;
1122 else
1123 g =
1124 (label
1125 - expTable[(int) ((f + MAX_EXP)
1126 * (EXP_TABLE_SIZE
1127 / MAX_EXP / 2))])
1128 * alpha;
1129 for (c = 0; c < layer1_size; c++)
1130 neu1e[c] += g * syn1neg[c + l2];
1131 for (c = 0; c < layer1_size; c++)
1132 syn1neg[c + l2] += g * syn0[c + l1];
1133 if (cap == 1)
1134 for (c = 0; c < layer1_size; c++)
1135 capParam(syn1neg, c + l2);
1136 }
1137 //Noise Contrastive Estimation
1138 if (nce > 0)
1139 for (d = 0; d < nce + 1; d++) {
1140 if (d == 0) {
1141 target = word;
1142 label = 1;
1143 } else {
1144 next_random = next_random
1145 * (unsigned long long) 25214903917 + 11;
1146 if (word_to_group != NULL
1147 && word_to_group[word] != -1) {
1148 target = word;
1149 while (target == word) {
1150 target =
1151 group_to_table[word_to_group[word]
1152 * table_size
1153 + (next_random >> 16)
1154 % table_size];
1155 next_random =
1156 next_random
1157 * (unsigned long long) 25214903917
1158 + 11;
1159 }
1160 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1161 } else {
1162 target = table[(next_random >> 16)
1163 % table_size];
1164 }
1165 if (target == 0)
1166 target = next_random % (vocab_size - 1) + 1;
1167 if (target == word)
1168 continue;
1169 label = 0;
1170 }
1171 l2 = target * layer1_size;
1172 f = 0;
1173 for (c = 0; c < layer1_size; c++)
1174 f += syn0[c + l1] * syn1nce[c + l2];
1175 if (f > MAX_EXP)
1176 g = (label - 1) * alpha;
1177 else if (f < -MAX_EXP)
1178 g = (label - 0) * alpha;
1179 else {
1180 f = exp(f);
1181 g = (label
1182 - f
1183 / (noise_distribution[target]
1184 * nce + f)) * alpha;
1185 }
1186 for (c = 0; c < layer1_size; c++)
1187 neu1e[c] += g * syn1nce[c + l2];
1188 for (c = 0; c < layer1_size; c++)
1189 syn1nce[c + l2] += g * syn0[c + l1];
1190 if (cap == 1)
1191 for (c = 0; c < layer1_size; c++)
1192 capParam(syn1nce, c + l2);
1193 }
1194 // Learn weights input -> hidden
1195 for (c = 0; c < layer1_size; c++)
1196 syn0[c + l1] += neu1e[c];
1197 }
1198 } else if (type == 2) { //train the cwindow architecture
1199 // in -> hidden
1200 cw = 0;
1201 for (a = 0; a < window * 2 + 1; a++)
1202 if (a != window) {
1203 c = sentence_position - window + a;
1204 if (c < 0)
1205 continue;
1206 if (c >= sentence_length)
1207 continue;
1208 last_word = sen[c];
1209 if (last_word == -1)
1210 continue;
1211 window_offset = a * layer1_size;
1212 if (a > window)
1213 window_offset -= layer1_size;
1214 for (c = 0; c < layer1_size; c++)
1215 neu1[c + window_offset] += syn0[c
1216 + last_word * layer1_size];
1217 cw++;
1218 }
1219 if (cw) {
1220 if (hs)
1221 for (d = 0; d < vocab[word].codelen; d++) {
1222 f = 0;
1223 l2 = vocab[word].point[d] * window_layer_size;
1224 // Propagate hidden -> output
1225 for (c = 0; c < window_layer_size; c++)
1226 f += neu1[c] * syn1_window[c + l2];
1227 if (f <= -MAX_EXP)
1228 continue;
1229 else if (f >= MAX_EXP)
1230 continue;
1231 else
1232 f = expTable[(int) ((f + MAX_EXP)
1233 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1234 // 'g' is the gradient multiplied by the learning rate
1235 g = (1 - vocab[word].code[d] - f) * alpha;
1236 // Propagate errors output -> hidden
1237 for (c = 0; c < window_layer_size; c++)
1238 neu1e[c] += g * syn1_window[c + l2];
1239 // Learn weights hidden -> output
1240 for (c = 0; c < window_layer_size; c++)
1241 syn1_window[c + l2] += g * neu1[c];
1242 if (cap == 1)
1243 for (c = 0; c < window_layer_size; c++)
1244 capParam(syn1_window, c + l2);
1245 }
1246 // NEGATIVE SAMPLING
1247 if (negative > 0)
1248 for (d = 0; d < negative + 1; d++) {
1249 if (d == 0) {
1250 target = word;
1251 label = 1;
1252 } else {
1253 next_random = next_random
1254 * (unsigned long long) 25214903917 + 11;
1255 if (word_to_group != NULL
1256 && word_to_group[word] != -1) {
1257 target = word;
1258 while (target == word) {
1259 target = group_to_table[word_to_group[word]
1260 * table_size
1261 + (next_random >> 16) % table_size];
1262 next_random = next_random
1263 * (unsigned long long) 25214903917
1264 + 11;
1265 }
1266 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1267 } else {
1268 target =
1269 table[(next_random >> 16) % table_size];
1270 }
1271 if (target == 0)
1272 target = next_random % (vocab_size - 1) + 1;
1273 if (target == word)
1274 continue;
1275 label = 0;
1276 }
1277 l2 = target * window_layer_size;
1278 f = 0;
1279 for (c = 0; c < window_layer_size; c++)
1280 f += neu1[c] * syn1neg_window[c + l2];
1281 if (f > MAX_EXP)
1282 g = (label - 1) * alpha;
1283 else if (f < -MAX_EXP)
1284 g = (label - 0) * alpha;
1285 else
1286 g = (label
1287 - expTable[(int) ((f + MAX_EXP)
1288 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1289 * alpha;
1290 for (c = 0; c < window_layer_size; c++)
1291 neu1e[c] += g * syn1neg_window[c + l2];
1292 for (c = 0; c < window_layer_size; c++)
1293 syn1neg_window[c + l2] += g * neu1[c];
1294 if (cap == 1)
1295 for (c = 0; c < window_layer_size; c++)
1296 capParam(syn1neg_window, c + l2);
1297 }
1298 // Noise Contrastive Estimation
1299 if (nce > 0)
1300 for (d = 0; d < nce + 1; d++) {
1301 if (d == 0) {
1302 target = word;
1303 label = 1;
1304 } else {
1305 next_random = next_random
1306 * (unsigned long long) 25214903917 + 11;
1307 if (word_to_group != NULL
1308 && word_to_group[word] != -1) {
1309 target = word;
1310 while (target == word) {
1311 target = group_to_table[word_to_group[word]
1312 * table_size
1313 + (next_random >> 16) % table_size];
1314 next_random = next_random
1315 * (unsigned long long) 25214903917
1316 + 11;
1317 }
1318 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1319 } else {
1320 target =
1321 table[(next_random >> 16) % table_size];
1322 }
1323 if (target == 0)
1324 target = next_random % (vocab_size - 1) + 1;
1325 if (target == word)
1326 continue;
1327 label = 0;
1328 }
1329 l2 = target * window_layer_size;
1330 f = 0;
1331 for (c = 0; c < window_layer_size; c++)
1332 f += neu1[c] * syn1nce_window[c + l2];
1333 if (f > MAX_EXP)
1334 g = (label - 1) * alpha;
1335 else if (f < -MAX_EXP)
1336 g = (label - 0) * alpha;
1337 else {
1338 f = exp(f);
1339 g =
1340 (label
1341 - f
1342 / (noise_distribution[target]
1343 * nce + f)) * alpha;
1344 }
1345 for (c = 0; c < window_layer_size; c++)
1346 neu1e[c] += g * syn1nce_window[c + l2];
1347 for (c = 0; c < window_layer_size; c++)
1348 syn1nce_window[c + l2] += g * neu1[c];
1349 if (cap == 1)
1350 for (c = 0; c < window_layer_size; c++)
1351 capParam(syn1nce_window, c + l2);
1352 }
1353 // hidden -> in
1354 for (a = 0; a < window * 2 + 1; a++)
1355 if (a != window) {
1356 c = sentence_position - window + a;
1357 if (c < 0)
1358 continue;
1359 if (c >= sentence_length)
1360 continue;
1361 last_word = sen[c];
1362 if (last_word == -1)
1363 continue;
1364 window_offset = a * layer1_size;
1365 if (a > window)
1366 window_offset -= layer1_size;
1367 for (c = 0; c < layer1_size; c++)
1368 syn0[c + last_word * layer1_size] += neu1e[c
1369 + window_offset];
1370 }
1371 }
1372 } else if (type == 3) { //train structured skip-gram
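			// Structured skip-gram: the output vectors are window_layer_size wide and
			// window_offset selects the layer1_size-sized block belonging to the current
			// window position, so each relative position gets its own context weights.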
1373 for (a = 0; a < window * 2 + 1; a++)
1374 if (a != window) {
1375 c = sentence_position - window + a;
1376 if (c < 0)
1377 continue;
1378 if (c >= sentence_length)
1379 continue;
1380 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001381 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001382 continue;
1383 l1 = last_word * layer1_size;
1384 window_offset = a * layer1_size;
1385 if (a > window)
1386 window_offset -= layer1_size;
1387 for (c = 0; c < layer1_size; c++)
1388 neu1e[c] = 0;
1389 // HIERARCHICAL SOFTMAX
1390 if (hs)
1391 for (d = 0; d < vocab[word].codelen; d++) {
1392 f = 0;
1393 l2 = vocab[word].point[d] * window_layer_size;
1394 // Propagate hidden -> output
1395 for (c = 0; c < layer1_size; c++)
1396 f += syn0[c + l1]
1397 * syn1_window[c + l2 + window_offset];
1398 if (f <= -MAX_EXP)
1399 continue;
1400 else if (f >= MAX_EXP)
1401 continue;
1402 else
1403 f = expTable[(int) ((f + MAX_EXP)
1404 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1405 // 'g' is the gradient multiplied by the learning rate
1406 g = (1 - vocab[word].code[d] - f) * alpha;
1407 // Propagate errors output -> hidden
1408 for (c = 0; c < layer1_size; c++)
1409 neu1e[c] += g
1410 * syn1_window[c + l2 + window_offset];
1411 // Learn weights hidden -> output
1412 for (c = 0; c < layer1_size; c++)
1413 syn1_window[c + l2 + window_offset] += g
1414 * syn0[c + l1];
1415 if (cap == 1)
1416 for (c = 0; c < layer1_size; c++)
1417 capParam(syn1_window, c + l2 + window_offset);
1418 }
1419 // NEGATIVE SAMPLING
1420 if (negative > 0)
1421 for (d = 0; d < negative + 1; d++) {
1422 if (d == 0) {
1423 target = word;
1424 label = 1;
1425 } else {
1426 next_random = next_random
1427 * (unsigned long long) 25214903917 + 11;
1428 if (word_to_group != NULL
1429 && word_to_group[word] != -1) {
1430 target = word;
1431 while (target == word) {
1432 target =
1433 group_to_table[word_to_group[word]
1434 * table_size
1435 + (next_random >> 16)
1436 % table_size];
1437 next_random =
1438 next_random
1439 * (unsigned long long) 25214903917
1440 + 11;
1441 }
1442 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1443 } else {
1444 target = table[(next_random >> 16)
1445 % table_size];
1446 }
1447 if (target == 0)
1448 target = next_random % (vocab_size - 1) + 1;
1449 if (target == word)
1450 continue;
1451 label = 0;
1452 }
1453 l2 = target * window_layer_size;
1454 f = 0;
1455 for (c = 0; c < layer1_size; c++)
1456 f +=
1457 syn0[c + l1]
1458 * syn1neg_window[c + l2
1459 + window_offset];
1460 if (f > MAX_EXP)
1461 g = (label - 1) * alpha;
1462 else if (f < -MAX_EXP)
1463 g = (label - 0) * alpha;
1464 else
1465 g =
1466 (label
1467 - expTable[(int) ((f + MAX_EXP)
1468 * (EXP_TABLE_SIZE
1469 / MAX_EXP / 2))])
1470 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001471 if(debug_mode > 2 && ((long long) id) == 0) {
1472 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1473 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1474 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001475 for (c = 0; c < layer1_size; c++)
1476 neu1e[c] +=
1477 g
1478 * syn1neg_window[c + l2
1479 + window_offset];
1480 for (c = 0; c < layer1_size; c++)
1481 syn1neg_window[c + l2 + window_offset] += g
1482 * syn0[c + l1];
1483 if (cap == 1)
1484 for (c = 0; c < layer1_size; c++)
1485 capParam(syn1neg_window,
1486 c + l2 + window_offset);
1487 }
1488 // Noise Contrastive Estimation
1489 if (nce > 0)
1490 for (d = 0; d < nce + 1; d++) {
1491 if (d == 0) {
1492 target = word;
1493 label = 1;
1494 } else {
1495 next_random = next_random
1496 * (unsigned long long) 25214903917 + 11;
1497 if (word_to_group != NULL
1498 && word_to_group[word] != -1) {
1499 target = word;
1500 while (target == word) {
1501 target =
1502 group_to_table[word_to_group[word]
1503 * table_size
1504 + (next_random >> 16)
1505 % table_size];
1506 next_random =
1507 next_random
1508 * (unsigned long long) 25214903917
1509 + 11;
1510 }
1511 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1512 } else {
1513 target = table[(next_random >> 16)
1514 % table_size];
1515 }
1516 if (target == 0)
1517 target = next_random % (vocab_size - 1) + 1;
1518 if (target == word)
1519 continue;
1520 label = 0;
1521 }
1522 l2 = target * window_layer_size;
1523 f = 0;
1524 for (c = 0; c < layer1_size; c++)
1525 f +=
1526 syn0[c + l1]
1527 * syn1nce_window[c + l2
1528 + window_offset];
1529 if (f > MAX_EXP)
1530 g = (label - 1) * alpha;
1531 else if (f < -MAX_EXP)
1532 g = (label - 0) * alpha;
1533 else {
1534 f = exp(f);
1535 g = (label
1536 - f
1537 / (noise_distribution[target]
1538 * nce + f)) * alpha;
1539 }
1540 for (c = 0; c < layer1_size; c++)
1541 neu1e[c] +=
1542 g
1543 * syn1nce_window[c + l2
1544 + window_offset];
1545 for (c = 0; c < layer1_size; c++)
1546 syn1nce_window[c + l2 + window_offset] += g
1547 * syn0[c + l1];
1548 if (cap == 1)
1549 for (c = 0; c < layer1_size; c++)
1550 capParam(syn1nce_window,
1551 c + l2 + window_offset);
1552 }
1553 // Learn weights input -> hidden
1554 for (c = 0; c < layer1_size; c++) {
1555 syn0[c + l1] += neu1e[c];
1556 if (syn0[c + l1] > 50)
1557 syn0[c + l1] = 50;
1558 if (syn0[c + l1] < -50)
1559 syn0[c + l1] = -50;
1560 }
1561 }
1562 } else if (type == 4) { //training senna
1563 // in -> hidden
1564 cw = 0;
1565 for (a = 0; a < window * 2 + 1; a++)
1566 if (a != window) {
1567 c = sentence_position - window + a;
1568 if (c < 0)
1569 continue;
1570 if (c >= sentence_length)
1571 continue;
1572 last_word = sen[c];
1573 if (last_word == -1)
1574 continue;
1575 window_offset = a * layer1_size;
1576 if (a > window)
1577 window_offset -= layer1_size;
1578 for (c = 0; c < layer1_size; c++)
1579 neu1[c + window_offset] += syn0[c
1580 + last_word * layer1_size];
1581 cw++;
1582 }
1583 if (cw) {
1584 for (a = 0; a < window_hidden_size; a++) {
1585 c = a * window_layer_size;
1586 for (b = 0; b < window_layer_size; b++) {
1587 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1588 }
1589 }
1590 if (hs)
1591 for (d = 0; d < vocab[word].codelen; d++) {
1592 f = 0;
1593 l2 = vocab[word].point[d] * window_hidden_size;
1594 // Propagate hidden -> output
1595 for (c = 0; c < window_hidden_size; c++)
1596 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1597 if (f <= -MAX_EXP)
1598 continue;
1599 else if (f >= MAX_EXP)
1600 continue;
1601 else
1602 f = expTable[(int) ((f + MAX_EXP)
1603 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1604 // 'g' is the gradient multiplied by the learning rate
1605 g = (1 - vocab[word].code[d] - f) * alpha;
1606 // Propagate errors output -> hidden
1607 for (c = 0; c < window_hidden_size; c++)
1608 neu2e[c] += dHardTanh(neu2[c], g) * g
1609 * syn_hidden_word[c + l2];
1610 // Learn weights hidden -> output
1611 for (c = 0; c < window_hidden_size; c++)
1612 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1613 * neu2[c];
1614 }
1615 // NEGATIVE SAMPLING
1616 if (negative > 0)
1617 for (d = 0; d < negative + 1; d++) {
1618 if (d == 0) {
1619 target = word;
1620 label = 1;
1621 } else {
1622 next_random = next_random
1623 * (unsigned long long) 25214903917 + 11;
1624 if (word_to_group != NULL
1625 && word_to_group[word] != -1) {
1626 target = word;
1627 while (target == word) {
1628 target = group_to_table[word_to_group[word]
1629 * table_size
1630 + (next_random >> 16) % table_size];
1631 next_random = next_random
1632 * (unsigned long long) 25214903917
1633 + 11;
1634 }
1635 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1636 } else {
1637 target =
1638 table[(next_random >> 16) % table_size];
1639 }
1640 if (target == 0)
1641 target = next_random % (vocab_size - 1) + 1;
1642 if (target == word)
1643 continue;
1644 label = 0;
1645 }
1646 l2 = target * window_hidden_size;
1647 f = 0;
1648 for (c = 0; c < window_hidden_size; c++)
1649 f += hardTanh(neu2[c])
1650 * syn_hidden_word_neg[c + l2];
1651 if (f > MAX_EXP)
1652 g = (label - 1) * alpha / negative;
1653 else if (f < -MAX_EXP)
1654 g = (label - 0) * alpha / negative;
1655 else
1656 g = (label
1657 - expTable[(int) ((f + MAX_EXP)
1658 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1659 * alpha / negative;
1660 for (c = 0; c < window_hidden_size; c++)
1661 neu2e[c] += dHardTanh(neu2[c], g) * g
1662 * syn_hidden_word_neg[c + l2];
1663 for (c = 0; c < window_hidden_size; c++)
1664 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1665 * g * neu2[c];
1666 }
1667 for (a = 0; a < window_hidden_size; a++)
1668 for (b = 0; b < window_layer_size; b++)
1669 neu1e[b] += neu2e[a]
1670 * syn_window_hidden[a * window_layer_size + b];
1671 for (a = 0; a < window_hidden_size; a++)
1672 for (b = 0; b < window_layer_size; b++)
1673 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1674 * neu1[b];
1675 // hidden -> in
1676 for (a = 0; a < window * 2 + 1; a++)
1677 if (a != window) {
1678 c = sentence_position - window + a;
1679 if (c < 0)
1680 continue;
1681 if (c >= sentence_length)
1682 continue;
1683 last_word = sen[c];
1684 if (last_word == -1)
1685 continue;
1686 window_offset = a * layer1_size;
1687 if (a > window)
1688 window_offset -= layer1_size;
1689 for (c = 0; c < layer1_size; c++)
1690 syn0[c + last_word * layer1_size] += neu1e[c
1691 + window_offset];
1692 }
1693 }
Marc Kupietz613edbf2018-01-11 21:38:03 +01001694 } else if(type == 5) {
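		// type 5 trains nothing: it only counts word/collocate pairs per window
		// position into the collocator database cdb via inc_collocator()
		// (see collocatordb.h).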
1695 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1696 c = sentence_position - window + a;
1697 if (c < 0) continue;
1698 if (c >= sentence_length) continue;
1699 last_word = sen[c];
1700 if (last_word == -1) continue;
1701 inc_collocator(cdb, word, last_word, a - window);
1702 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1703 // cw++;
1704 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001705 } else {
1706 printf("unknown type %i", type);
1707 exit(0);
1708 }
1709 sentence_position++;
1710 if (sentence_position >= sentence_length) {
1711 sentence_length = 0;
1712 continue;
1713 }
1714 }
1715 fclose(fi);
1716 free(neu1);
1717 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001718 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001719 pthread_exit(NULL);
1720}
1721
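// Triggered by cc > 0: for each vocabulary word from index cc on, compute the
// response of every candidate collocate at every window position from syn0 and
// syn1neg_window, print the strongest collocate per position, the best summed
// collocate, and the N (= 10) best (word, activation, position) triples.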
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001722void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001723 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001724 real f, max_f, maxmax_f;
Marc Kupietzf00e7b02023-12-22 11:11:56 +01001725 real *target_sums=0L, bestf[MAX_CC], worstbest;
Marc Kupietz71996e72016-03-18 13:40:24 +01001726 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001727 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001728 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1729
1730 for (d = cc; d < vocab_size; d++) {
1731 for (b = 0; b < vocab_size; b++)
1732 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001733 for (b = 0; b < N; b++)
1734 bestf[b]=-1;
1735 worstbest = -1;
1736
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001737 maxmax_f = -1;
1738 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001739 for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001740 if (a != window) {
1741 max_f = -1;
1742 window_offset = a * layer1_size;
1743 if (a > window)
1744 window_offset -= layer1_size;
1745 for(target = 0; target < vocab_size; target ++) {
1746 if(target == d)
1747 continue;
1748 f = 0;
1749 for (c = 0; c < layer1_size; c++)
1750 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1751 if (f < -MAX_EXP)
1752 continue;
1753 else if (f > MAX_EXP)
1754 continue;
1755 else
1756 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1757 if(f > max_f) {
1758 max_f = f;
1759 max_target = target;
1760 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001761 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001762 if(f > worstbest) {
1763 for (b = 0; b < N; b++) {
1764 if (f > bestf[b]) {
1765 for (e = N - 1; e > b; e--) {
1766 bestf[e] = bestf[e - 1];
1767 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001768 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001769 }
1770 bestf[b] = f;
1771 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001772 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001773 break;
1774 }
1775 }
1776 worstbest = bestf[N-1];
1777 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001778 }
1779 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1780 if(max_f > maxmax_f) {
1781 maxmax_f = max_f;
1782 maxmax_target = max_target;
1783 }
1784 } else {
1785 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1786 }
1787 }
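// Find the collocator with the highest accumulated (noisy-or style) score over all positions.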
1788 max_f = -1;
1789 for (b = 0; b < vocab_size; b++) {
1790 if(target_sums[b] > max_f) {
1791 max_f = target_sums[b];
1792 max_target = b;
1793 }
1794 }
1795 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001796 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001797 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001798 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001799 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001800 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001801 }
1802}
1803
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001804void TrainModel() {
1805 long a, b, c, d;
1806 FILE *fo;
1807 pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // +1 slot for the monitor thread
Marc Kupietz202723e2016-07-14 09:12:00 +02001808 threadPos = malloc(num_threads * sizeof(long long));
1809 threadIters = malloc(num_threads * sizeof(int));
1810 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001811 printf("Starting training using file %s\n", train_file);
1812 starting_alpha = alpha;
1813 if (read_vocab_file[0] != 0)
1814 ReadVocab();
1815 else
1816 LearnVocabFromTrainFile();
1817 if (save_vocab_file[0] != 0)
1818 SaveVocab();
1819 if (output_file[0] == 0)
1820 return;
1821 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001822 if(cc > 0)
1823 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001824 if (negative > 0 || nce > 0)
1825 InitUnigramTable();
1826 if (negative_classes_file[0] != 0)
1827 InitClassUnigramTable();
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001828 start = time(NULL);
1829 start_clock = clock();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001830 for (a = 0; a < num_threads; a++)
1831 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
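// With debug output enabled, start an additional monitor thread (slot num_threads of pt) that reports progress.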
Marc Kupietz202723e2016-07-14 09:12:00 +02001832 if(debug_mode > 1)
1833 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001834 for (a = 0; a < num_threads; a++)
1835 pthread_join(pt[a], NULL);
Marc Kupietz202723e2016-07-14 09:12:00 +02001836 if(debug_mode > 1) {
1837 pthread_join(pt[num_threads], NULL);
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001838 clock_t now = time(NULL);
1839 clock_t now_clock = clock();
1840 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
Marc Kupietz613edbf2018-01-11 21:38:03 +01001841 if(type == 5) // don't save vectors for classic collocators
1842 return;
Marc Kupietz202723e2016-07-14 09:12:00 +02001843 printf("Saving vectors to %s ...", output_file);
1844 fflush(stdout);
1845 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001846 fo = fopen(output_file, "wb");
1847 if (classes == 0) {
1848 // Save the word vectors
1849 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1850 for (a = 0; a < vocab_size; a++) {
1851 fprintf(fo, "%s ", vocab[a].word);
1852 if (binary)
1853 for (b = 0; b < layer1_size; b++)
1854 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1855 else
1856 for (b = 0; b < layer1_size; b++)
1857 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1858 fprintf(fo, "\n");
1859 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001860 if(debug_mode > 1)
1861 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001862 } else {
1863 // Run K-means on the word vectors
1864 int clcn = classes, iter = 10, closeid;
1865 int *centcn = (int *) malloc(classes * sizeof(int));
1866 int *cl = (int *) calloc(vocab_size, sizeof(int));
1867 real closev, x;
1868 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1869 for (a = 0; a < vocab_size; a++)
1870 cl[a] = a % clcn;
1871 for (a = 0; a < iter; a++) {
1872 for (b = 0; b < clcn * layer1_size; b++)
1873 cent[b] = 0;
1874 for (b = 0; b < clcn; b++)
1875 centcn[b] = 1;
1876 for (c = 0; c < vocab_size; c++) {
1877 for (d = 0; d < layer1_size; d++)
1878 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1879 centcn[cl[c]]++;
1880 }
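// Average the accumulated vectors and normalize each centroid to unit length.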
1881 for (b = 0; b < clcn; b++) {
1882 closev = 0;
1883 for (c = 0; c < layer1_size; c++) {
1884 cent[layer1_size * b + c] /= centcn[b];
1885 closev += cent[layer1_size * b + c]
1886 * cent[layer1_size * b + c];
1887 }
1888 closev = sqrt(closev);
1889 for (c = 0; c < layer1_size; c++)
1890 cent[layer1_size * b + c] /= closev;
1891 }
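// Reassign each word to the centroid with the largest dot product (cosine similarity, since centroids are unit length).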
1892 for (c = 0; c < vocab_size; c++) {
1893 closev = -10;
1894 closeid = 0;
1895 for (d = 0; d < clcn; d++) {
1896 x = 0;
1897 for (b = 0; b < layer1_size; b++)
1898 x += cent[layer1_size * d + b]
1899 * syn0[c * layer1_size + b];
1900 if (x > closev) {
1901 closev = x;
1902 closeid = d;
1903 }
1904 }
1905 cl[c] = closeid;
1906 }
1907 }
1908 // Save the K-means classes
1909 for (a = 0; a < vocab_size; a++)
1910 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1911 free(centcn);
1912 free(cent);
1913 free(cl);
1914 }
1915 fclose(fo);
1916 if (save_net_file[0] != 0)
1917 SaveNet();
1918}
1919
1920int ArgPos(char *str, int argc, char **argv) {
1921 int a;
1922 for (a = 1; a < argc; a++)
1923 if (!strcmp(str, argv[a])) {
1924 if (a == argc - 1) {
1925 printf("Argument missing for %s\n", str);
1926 exit(1);
1927 }
1928 return a;
1929 }
1930 return -1;
1931}
1932
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001933void print_help() {
Marc Kupietz83a67d42021-03-22 17:29:36 +01001934 printf("WORD VECTOR estimation toolkit v 0.9.0\n\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001935 printf("Options:\n");
1936 printf("Parameters for training:\n");
1937 printf("\t-train <file>\n");
1938 printf("\t\tUse text data from <file> to train the model\n");
1939 printf("\t-output <file>\n");
1940 printf(
1941 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1942 printf("\t-size <int>\n");
1943 printf("\t\tSet size of word vectors; default is 100\n");
1944 printf("\t-window <int>\n");
1945 printf("\t\tSet max skip length between words; default is 5\n");
1946 printf("\t-sample <float>\n");
1947 printf(
1948 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1949 printf(
1950 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1951 printf("\t-hs <int>\n");
1952 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1953 printf("\t-negative <int>\n");
1954 printf(
1955 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1956 printf("\t-negative-classes <file>\n");
1957 printf("\t\tNegative classes to sample from\n");
1958 printf("\t-nce <int>\n");
1959 printf(
1960 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1961 printf("\t-threads <int>\n");
1962 printf("\t\tUse <int> threads (default 12)\n");
1963 printf("\t-iter <int>\n");
1964 printf("\t\tRun more training iterations (default 5)\n");
1965 printf("\t-min-count <int>\n");
1966 printf(
1967 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1968 printf("\t-alpha <float>\n");
1969 printf(
1970 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1971 printf("\t-classes <int>\n");
1972 printf(
1973 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1974 printf("\t-debug <int>\n");
1975 printf(
1976 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1977 printf("\t-binary <int>\n");
1978 printf(
1979 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1980 printf("\t-save-vocab <file>\n");
1981 printf("\t\tThe vocabulary will be saved to <file>\n");
1982 printf("\t-read-vocab <file>\n");
1983 printf(
1984 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1985 printf("\t-read-net <file>\n");
1986 printf(
1987 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1988 printf("\t-save-net <file>\n");
1989 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietze423f732017-12-22 17:57:03 +01001990 printf("\t-magic-stop-file <file>\n");
1991 printf("\t\tIf the magic file <file> exists training will stop after the current cycle.\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001992 printf("\t-show-cc <int>\n");
1993 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001994 printf("\t-type <int>\n");
1995 printf(
Marc Kupietz613edbf2018-01-11 21:38:03 +01001996 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type, 5 for store positional bigramms)\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001997 printf("\t-cap <int>\n");
1998 printf(
1999 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
2000 printf("\nExamples:\n");
2001 printf(
Marc Kupietz83a67d42021-03-22 17:29:36 +01002002 "./dereko2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
Marc Kupietzc7f773b2017-12-02 12:04:03 +01002003}
2004
2005int main(int argc, char **argv) {
2006 int i;
2007 setlocale(LC_ALL, "");
2008 if (argc == 1) {
2009 print_help();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002010 return 0;
2011 }
2012 output_file[0] = 0;
2013 save_vocab_file[0] = 0;
2014 read_vocab_file[0] = 0;
2015 save_net_file[0] = 0;
2016 read_net_file[0] = 0;
2017 negative_classes_file[0] = 0;
Marc Kupietzc7f773b2017-12-02 12:04:03 +01002018 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2019 print_help();
2020 return(0);
2021 }
2022 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2023 print_help();
2024 return(0);
2025 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002026 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2027 layer1_size = atoi(argv[i + 1]);
2028 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2029 strcpy(train_file, argv[i + 1]);
2030 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2031 strcpy(save_vocab_file, argv[i + 1]);
2032 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2033 strcpy(read_vocab_file, argv[i + 1]);
2034 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2035 strcpy(save_net_file, argv[i + 1]);
2036 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2037 strcpy(read_net_file, argv[i + 1]);
Marc Kupietze423f732017-12-22 17:57:03 +01002038 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2039 strcpy(magic_stop_file, argv[i + 1]);
2040 if (access(magic_stop_file, F_OK ) != -1) {
2041 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2042 exit(1);
2043 }
2044 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002045 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2046 debug_mode = atoi(argv[i + 1]);
2047 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2048 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002049 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2050 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002051 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2052 type = atoi(argv[i + 1]);
2053 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2054 strcpy(output_file, argv[i + 1]);
2055 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2056 window = atoi(argv[i + 1]);
2057 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2058 sample = atof(argv[i + 1]);
2059 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2060 hs = atoi(argv[i + 1]);
2061 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2062 negative = atoi(argv[i + 1]);
2063 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2064 strcpy(negative_classes_file, argv[i + 1]);
2065 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2066 nce = atoi(argv[i + 1]);
2067 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2068 num_threads = atoi(argv[i + 1]);
2069 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2070 iter = atoi(argv[i + 1]);
2071 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2072 min_count = atoi(argv[i + 1]);
2073 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2074 classes = atoi(argv[i + 1]);
Marc Kupietz879333c2023-12-20 11:41:09 +01002075 if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0)
2076 metadata_categories = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002077 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2078 cap = atoi(argv[i + 1]);
2079 if (type == 0 || type == 2 || type == 4)
2080 alpha = 0.05;
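// Type 5 (positional bigrams) stores raw co-occurrence counts in a collocator database
// instead of embedding vectors, so subsampling is disabled and the DB is opened at the -output path.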
Marc Kupietz613edbf2018-01-11 21:38:03 +01002081 if (type==5) {
2082 sample = 0;
2083 cdb = open_collocatordb_for_write(output_file);
2084 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002085 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2086 alpha = atof(argv[i + 1]);
2087 vocab = (struct vocab_word *) calloc(vocab_max_size,
2088 sizeof(struct vocab_word));
2089 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2090 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2091 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2092 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2093 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
2094 }
Marc Kupietz210b9d52016-04-02 21:48:13 +02002095 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002096 TrainModel();
2097 return 0;
2098}
2099