Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
21#include <pthread.h>
Marc Kupietz613edbf2018-01-11 21:38:03 +010022#include <collocatordb.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010023
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010028#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010029#define MAX_CODE_LENGTH 40
30
31const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
32
33typedef float real; // Precision of float numbers
34
35struct vocab_word {
36 long long cn;
37 int *point;
38 char *word, *code, codelen;
39};
40
41char train_file[MAX_STRING], output_file[MAX_STRING];
42char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
43char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
Marc Kupietze423f732017-12-22 17:57:03 +010044char magic_stop_file[MAX_STRING];
45
Marc Kupietzd6f9c712016-03-16 11:50:56 +010046struct vocab_word *vocab;
47int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietzc2731b22016-07-14 08:56:14 +020048 num_threads = 12, min_reduce = 1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010049int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020050long long *threadPos;
51int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010052long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
53long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
54 classes = 0;
55real alpha = 0.025, starting_alpha, sample = 1e-3;
56real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020057real avgWordLength=0;
Marc Kupietzb366bcd2018-01-11 21:29:41 +010058clock_t start, start_clock;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010059
60real *syn1_window, *syn1neg_window, *syn1nce_window;
61int w_offset, window_layer_size;
62
63int window_hidden_size = 500;
64real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
65 *syn_hidden_word_nce;
66
67int hs = 0, negative = 5;
68const int table_size = 1e8;
69int *table;
70
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010071long cc = 0;
72
Marc Kupietzd6f9c712016-03-16 11:50:56 +010073//contrastive negative sampling
74char negative_classes_file[MAX_STRING];
75int *word_to_group;
76int *group_to_table; //group_size*table_size
77int class_number;
78
79//nce
80real* noise_distribution;
81int nce = 0;
82
83//param caps
84real CAP_VALUE = 50;
85int cap = 0;
86
Marc Kupietz613edbf2018-01-11 21:38:03 +010087COLLOCATORDB *cdb = NULL;
88
Marc Kupietzd6f9c712016-03-16 11:50:56 +010089void capParam(real* array, int index) {
90 if (array[index] > CAP_VALUE)
91 array[index] = CAP_VALUE;
92 else if (array[index] < -CAP_VALUE)
93 array[index] = -CAP_VALUE;
94}
95
96real hardTanh(real x) {
97 if (x >= 1) {
98 return 1;
99 } else if (x <= -1) {
100 return -1;
101 } else {
102 return x;
103 }
104}
105
106real dHardTanh(real x, real g) {
107 if (x > 1 && g > 0) {
108 return 0;
109 }
110 if (x < -1 && g < 0) {
111 return 0;
112 }
113 return 1;
114}
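// Note: hardTanh/dHardTanh are the clipped activation and its gradient gate used by
// the senna-style architecture (type 4): hardTanh clamps activations to [-1, 1], and
// dHardTanh zeroes the gradient only when the unit is saturated AND the gradient would
// push it further out of range. capParam above is the analogous clamp for single
// weights (to +/-CAP_VALUE) when cap == 1.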
115
116void InitUnigramTable() {
117 int a, i;
118 long long train_words_pow = 0;
119 real d1, power = 0.75;
120 table = (int *) malloc(table_size * sizeof(int));
121 for (a = 0; a < vocab_size; a++)
122 train_words_pow += pow(vocab[a].cn, power);
123 i = 0;
124 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
125 for (a = 0; a < table_size; a++) {
126 table[a] = i;
127 if (a / (real) table_size > d1) {
128 i++;
129 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
130 }
131 if (i >= vocab_size)
132 i = vocab_size - 1;
133 }
134
135 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
136 for (a = 0; a < vocab_size; a++)
137 noise_distribution[a] = pow(vocab[a].cn, power)
138 / (real) train_words_pow;
139}
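// Note: the unigram table drives negative sampling. A word w occupies a share of the
// table_size (1e8) slots proportional to cn(w)^0.75, i.e. sampling probability
// P(w) ~ cn(w)^0.75 / sum_v cn(v)^0.75. The 0.75 exponent flattens the distribution:
// for two words with counts 9 and 1, the raw frequencies 0.9/0.1 become roughly
// 0.84/0.16. noise_distribution stores the same probabilities for use by NCE.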
140
141// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
142void ReadWord(char *word, FILE *fin) {
143 int a = 0, ch;
144 while (!feof(fin)) {
145 ch = fgetc(fin);
146 if (ch == 13)
147 continue;
148 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
149 if (a > 0) {
150 if (ch == '\n')
151 ungetc(ch, fin);
152 break;
153 }
154 if (ch == '\n') {
155 strcpy(word, (char *) "</s>");
156 return;
157 } else
158 continue;
159 }
160 word[a] = ch;
161 a++;
162 if (a >= MAX_STRING - 1)
163 a--; // Truncate words that are too long
164 }
165 word[a] = 0;
166}
167
168// Returns hash value of a word
169int GetWordHash(char *word) {
170 unsigned long long a, hash = 0;
171 for (a = 0; a < strlen(word); a++)
172 hash = hash * 257 + word[a];
173 hash = hash % vocab_hash_size;
174 return hash;
175}
176
177// Returns position of a word in the vocabulary; if the word is not found, returns -1
178int SearchVocab(char *word) {
179 unsigned int hash = GetWordHash(word);
180 while (1) {
181 if (vocab_hash[hash] == -1)
182 return -1;
183 if (!strcmp(word, vocab[vocab_hash[hash]].word))
184 return vocab_hash[hash];
185 hash = (hash + 1) % vocab_hash_size;
186 }
187 return -1;
188}
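// Note: vocabulary lookup uses a simple polynomial rolling hash (base 257, modulo
// vocab_hash_size) plus open addressing: on a collision, SearchVocab and
// AddWordToVocab probe linearly (hash + 1) until they hit the word or an empty
// slot (-1). This is why vocab_hash_size (30M) is kept well above the intended
// maximum of ~21M distinct words (a 0.7 load factor).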
189
190// Reads a word and returns its index in the vocabulary
191int ReadWordIndex(FILE *fin) {
192 char word[MAX_STRING];
193 ReadWord(word, fin);
194 if (feof(fin))
195 return -1;
196 return SearchVocab(word);
197}
198
199// Adds a word to the vocabulary
200int AddWordToVocab(char *word) {
201 unsigned int hash, length = strlen(word) + 1;
202 if (length > MAX_STRING)
203 length = MAX_STRING;
204 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
205 strcpy(vocab[vocab_size].word, word);
206 vocab[vocab_size].cn = 0;
207 vocab_size++;
208 // Reallocate memory if needed
209 if (vocab_size + 2 >= vocab_max_size) {
210 vocab_max_size += 1000;
211 vocab = (struct vocab_word *) realloc(vocab,
212 vocab_max_size * sizeof(struct vocab_word));
213 }
214 hash = GetWordHash(word);
215 while (vocab_hash[hash] != -1)
216 hash = (hash + 1) % vocab_hash_size;
217 vocab_hash[hash] = vocab_size - 1;
218 return vocab_size - 1;
219}
220
221// Used later for sorting by word counts
222int VocabCompare(const void *a, const void *b) {
„feldmueller“7f1fc332024-10-21 18:05:57 +0200223 long long freq1 = ((struct vocab_word *) a)->cn;
224 long long freq2 = ((struct vocab_word *) b)->cn;
225 if (freq1 < freq2) return 1;
226 else if (freq1 > freq2) return -1;
227 else return 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100228}
229
230// Sorts the vocabulary by frequency using word counts
231void SortVocab() {
232 int a, size;
233 unsigned int hash;
234 // Sort the vocabulary and keep </s> at the first position
235 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
236 for (a = 0; a < vocab_hash_size; a++)
237 vocab_hash[a] = -1;
238 size = vocab_size;
239 train_words = 0;
240 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200241 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100242 // Words occurring fewer than min_count times will be discarded from the vocab
243 if ((vocab[a].cn < min_count) && (a != 0)) {
244 vocab_size--;
245 free(vocab[a].word);
246 } else {
247 // Hash will be re-computed, as it is no longer valid after sorting
248 hash = GetWordHash(vocab[a].word);
249 while (vocab_hash[hash] != -1)
250 hash = (hash + 1) % vocab_hash_size;
251 vocab_hash[hash] = a;
252 train_words += vocab[a].cn;
253 }
254 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200255 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100256 vocab = (struct vocab_word *) realloc(vocab,
257 (vocab_size + 1) * sizeof(struct vocab_word));
258 // Allocate memory for the binary tree construction
259 for (a = 0; a < vocab_size; a++) {
260 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
261 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
262 }
263}
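// Note: SortVocab orders words by descending count (keeping </s> at index 0), drops
// words with cn < min_count, rebuilds the hash table, and accumulates avgWordLength
// as the count-weighted average of strlen(word)+1 (the +1 accounts for the separating
// whitespace). ReadVocab later divides the training file size by this average to
// estimate the number of tokens.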
264
265// Reduces the vocabulary by removing infrequent tokens
266void ReduceVocab() {
267 int a, b = 0;
268 unsigned int hash;
269 for (a = 0; a < vocab_size; a++)
270 if (vocab[a].cn > min_reduce) {
271 vocab[b].cn = vocab[a].cn;
272 vocab[b].word = vocab[a].word;
273 b++;
274 } else
275 free(vocab[a].word);
276 vocab_size = b;
277 for (a = 0; a < vocab_hash_size; a++)
278 vocab_hash[a] = -1;
279 for (a = 0; a < vocab_size; a++) {
280 // Hash will be re-computed, as it is no longer valid
281 hash = GetWordHash(vocab[a].word);
282 while (vocab_hash[hash] != -1)
283 hash = (hash + 1) % vocab_hash_size;
284 vocab_hash[hash] = a;
285 }
286 fflush(stdout);
287 min_reduce++;
288}
289
290// Create binary Huffman tree using the word counts
291// Frequent words will have short unique binary codes
292void CreateBinaryTree() {
293 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
294 char code[MAX_CODE_LENGTH];
295 long long *count = (long long *) calloc(vocab_size * 2 + 1,
296 sizeof(long long));
297 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
298 sizeof(long long));
299 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
300 sizeof(long long));
301 for (a = 0; a < vocab_size; a++)
302 count[a] = vocab[a].cn;
303 for (a = vocab_size; a < vocab_size * 2; a++)
304 count[a] = 1e15;
305 pos1 = vocab_size - 1;
306 pos2 = vocab_size;
307 // The following algorithm constructs the Huffman tree by adding one node at a time
308 for (a = 0; a < vocab_size - 1; a++) {
309 // First, find two smallest nodes 'min1, min2'
310 if (pos1 >= 0) {
311 if (count[pos1] < count[pos2]) {
312 min1i = pos1;
313 pos1--;
314 } else {
315 min1i = pos2;
316 pos2++;
317 }
318 } else {
319 min1i = pos2;
320 pos2++;
321 }
322 if (pos1 >= 0) {
323 if (count[pos1] < count[pos2]) {
324 min2i = pos1;
325 pos1--;
326 } else {
327 min2i = pos2;
328 pos2++;
329 }
330 } else {
331 min2i = pos2;
332 pos2++;
333 }
334 count[vocab_size + a] = count[min1i] + count[min2i];
335 parent_node[min1i] = vocab_size + a;
336 parent_node[min2i] = vocab_size + a;
337 binary[min2i] = 1;
338 }
339 // Now assign binary code to each vocabulary word
340 for (a = 0; a < vocab_size; a++) {
341 b = a;
342 i = 0;
343 while (1) {
344 code[i] = binary[b];
345 point[i] = b;
346 i++;
347 b = parent_node[b];
348 if (b == vocab_size * 2 - 2)
349 break;
350 }
351 vocab[a].codelen = i;
352 vocab[a].point[0] = vocab_size - 2;
353 for (b = 0; b < i; b++) {
354 vocab[a].code[i - b - 1] = code[b];
355 vocab[a].point[i - b] = point[b] - vocab_size;
356 }
357 }
358 free(count);
359 free(binary);
360 free(parent_node);
361}
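// Note: this is the standard word2vec Huffman construction. count[] holds the leaf
// counts followed by inner-node counts, and the two cheapest remaining nodes are
// merged vocab_size-1 times. Afterwards vocab[a].code is the bit path from the root
// (one 0/1 per inner node) and vocab[a].point lists the inner-node indices on that
// path, so frequent words get codes of only a few bits while rare words may use up
// to MAX_CODE_LENGTH (40) bits.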
362
363void LearnVocabFromTrainFile() {
364 char word[MAX_STRING];
365 FILE *fin;
366 long long a, i;
367 for (a = 0; a < vocab_hash_size; a++)
368 vocab_hash[a] = -1;
369 fin = fopen(train_file, "rb");
370 if (fin == NULL) {
371 printf("ERROR: training data file not found!\n");
372 exit(1);
373 }
374 vocab_size = 0;
375 AddWordToVocab((char *) "</s>");
376 while (1) {
377 ReadWord(word, fin);
378 if (feof(fin))
379 break;
380 train_words++;
381 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
382 printf("%lldK%c", train_words / 1000, 13);
383 fflush(stdout);
384 }
385 i = SearchVocab(word);
386 if (i == -1) {
387 a = AddWordToVocab(word);
388 vocab[a].cn = 1;
389 } else
390 vocab[i].cn++;
391 if (vocab_size > vocab_hash_size * 0.7)
392 ReduceVocab();
393 }
394 SortVocab();
395 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200396 printf("Vocab size: %'lld\n", vocab_size);
397 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100398 }
399 file_size = ftell(fin);
400 fclose(fin);
401}
402
403void SaveVocab() {
404 long long i;
405 FILE *fo = fopen(save_vocab_file, "wb");
406 for (i = 0; i < vocab_size; i++)
407 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
408 fclose(fo);
409}
410
411void ReadVocab() {
412 long long a, i = 0;
413 char c;
414 char word[MAX_STRING];
415 FILE *fin = fopen(read_vocab_file, "rb");
416 if (fin == NULL) {
417 printf("Vocabulary file not found\n");
418 exit(1);
419 }
420 for (a = 0; a < vocab_hash_size; a++)
421 vocab_hash[a] = -1;
422 vocab_size = 0;
423 while (1) {
424 ReadWord(word, fin);
425 if (feof(fin))
426 break;
427 a = AddWordToVocab(word);
428 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
429 i++;
430 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200431 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100432 fin = fopen(train_file, "rb");
433 if (fin == NULL) {
434 printf("ERROR: training data file not found!\n");
435 exit(1);
436 }
437 fseek(fin, 0, SEEK_END);
438 file_size = ftell(fin);
439 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200440 SortVocab();
441 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200442 printf("Vocab size: %'lld\n", vocab_size);
443 printf("Words in vocab's train file: %'lld\n", train_words);
444 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200445 }
Marc Kupietze23c5402016-07-14 11:10:09 +0200446 train_words = file_size / avgWordLength;
447 if(debug_mode > 0)
448 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100449}
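// Note: when the vocabulary is read from a file rather than learned, the exact token
// count of the training corpus is unknown, so it is estimated as
// file_size / avgWordLength. The estimate only steers the learning-rate schedule and
// the progress display; it does not need to be exact.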
450
451void InitClassUnigramTable() {
452 long long a, c;
453 printf("loading class unigrams \n");
454 FILE *fin = fopen(negative_classes_file, "rb");
455 if (fin == NULL) {
456 printf("ERROR: class file not found!\n");
457 exit(1);
458 }
459 word_to_group = (int *) malloc(vocab_size * sizeof(int));
460 for (a = 0; a < vocab_size; a++)
461 word_to_group[a] = -1;
462 char class[MAX_STRING];
463 char prev_class[MAX_STRING];
464 prev_class[0] = 0;
465 char word[MAX_STRING];
466 class_number = -1;
467 while (1) {
468 if (feof(fin))
469 break;
470 ReadWord(class, fin);
471 ReadWord(word, fin);
472 int word_index = SearchVocab(word);
473 if (word_index != -1) {
474 if (strcmp(class, prev_class) != 0) {
475 class_number++;
476 strcpy(prev_class, class);
477 }
478 word_to_group[word_index] = class_number;
479 }
480 ReadWord(word, fin);
481 }
482 class_number++;
483 fclose(fin);
484
485 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
486 long long train_words_pow = 0;
487 real d1, power = 0.75;
488
489 for (c = 0; c < class_number; c++) {
490 long long offset = c * table_size;
491 train_words_pow = 0;
492 for (a = 0; a < vocab_size; a++)
493 if (word_to_group[a] == c)
494 train_words_pow += pow(vocab[a].cn, power);
495 int i = 0;
496 while (word_to_group[i] != c && i < vocab_size)
497 i++;
498 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
499 for (a = 0; a < table_size; a++) {
500 //printf("index %lld , word %d\n", a, i);
501 group_to_table[offset + a] = i;
502 if (a / (real) table_size > d1) {
503 i++;
504 while (word_to_group[i] != c && i < vocab_size)
505 i++;
506 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
507 }
508 if (i >= vocab_size)
509 while (word_to_group[i] != c && i >= 0)
510 i--;
511 }
512 }
513}
514
Marc Kupietz61485ad2023-12-22 16:16:59 +0100515void SaveArgs(unsigned int argc, char **argv) {
Marc Kupietz210b9d52016-04-02 21:48:13 +0200516 unsigned int i;
Marc Kupietz44136742017-12-22 17:52:56 +0100517 char args_file[MAX_STRING];
518 strcpy(args_file, output_file);
Marc Kupietz210b9d52016-04-02 21:48:13 +0200519 strcat(args_file, ".args");
520 FILE *fargs = fopen(args_file, "w");
521 if (fargs == NULL) {
522 printf("Cannot save args to %s.\n", args_file);
523 return;
524 }
525
Marc Kupietz44136742017-12-22 17:52:56 +0100526 for(i=1; i<argc; i++)
527 fprintf(fargs, "%s ", argv[i]);
528
529 fprintf(fargs, "\n");
Marc Kupietz210b9d52016-04-02 21:48:13 +0200530 fclose(fargs);
Marc Kupietz44136742017-12-22 17:52:56 +0100531
Marc Kupietz210b9d52016-04-02 21:48:13 +0200532 return;
533}
534
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100535void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100536 if(type != 3 || negative <= 0) {
537 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
538 return;
539 }
540
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100541 FILE *fnet = fopen(save_net_file, "wb");
542 if (fnet == NULL) {
543 printf("Net parameter file not found\n");
544 exit(1);
545 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100546 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100547 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100548 fclose(fnet);
549}
550
551void InitNet() {
552 long long a, b;
553 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100554 long long read;
555
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100556 window_layer_size = layer1_size * window * 2;
557 a = posix_memalign((void **) &syn0, 128,
558 (long long) vocab_size * layer1_size * sizeof(real));
559 if (syn0 == NULL) {
560 printf("Memory allocation failed\n");
561 exit(1);
562 }
563
564 if (hs) {
565 a = posix_memalign((void **) &syn1, 128,
566 (long long) vocab_size * layer1_size * sizeof(real));
567 if (syn1 == NULL) {
568 printf("Memory allocation failed\n");
569 exit(1);
570 }
571 a = posix_memalign((void **) &syn1_window, 128,
572 (long long) vocab_size * window_layer_size * sizeof(real));
573 if (syn1_window == NULL) {
574 printf("Memory allocation failed\n");
575 exit(1);
576 }
577 a = posix_memalign((void **) &syn_hidden_word, 128,
578 (long long) vocab_size * window_hidden_size * sizeof(real));
579 if (syn_hidden_word == NULL) {
580 printf("Memory allocation failed\n");
581 exit(1);
582 }
583
584 for (a = 0; a < vocab_size; a++)
585 for (b = 0; b < layer1_size; b++)
586 syn1[a * layer1_size + b] = 0;
587 for (a = 0; a < vocab_size; a++)
588 for (b = 0; b < window_layer_size; b++)
589 syn1_window[a * window_layer_size + b] = 0;
590 for (a = 0; a < vocab_size; a++)
591 for (b = 0; b < window_hidden_size; b++)
592 syn_hidden_word[a * window_hidden_size + b] = 0;
593 }
594 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100595 if(type == 0) {
596 a = posix_memalign((void **) &syn1neg, 128,
597 (long long) vocab_size * layer1_size * sizeof(real));
598 if (syn1neg == NULL) {
599 printf("Memory allocation failed\n");
600 exit(1);
601 }
602 for (a = 0; a < vocab_size; a++)
603 for (b = 0; b < layer1_size; b++)
604 syn1neg[a * layer1_size + b] = 0;
605 } else if (type == 3) {
606 a = posix_memalign((void **) &syn1neg_window, 128,
607 (long long) vocab_size * window_layer_size * sizeof(real));
608 if (syn1neg_window == NULL) {
609 printf("Memory allocation failed\n");
610 exit(1);
611 }
612 for (a = 0; a < vocab_size; a++)
613 for (b = 0; b < window_layer_size; b++)
614 syn1neg_window[a * window_layer_size + b] = 0;
615 } else if (type == 4) {
616 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
617 (long long) vocab_size * window_hidden_size * sizeof(real));
618 if (syn_hidden_word_neg == NULL) {
619 printf("Memory allocation failed\n");
620 exit(1);
621 }
622 for (a = 0; a < vocab_size; a++)
623 for (b = 0; b < window_hidden_size; b++)
624 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100625 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100626 }
627 if (nce > 0) {
628 a = posix_memalign((void **) &syn1nce, 128,
629 (long long) vocab_size * layer1_size * sizeof(real));
630 if (syn1nce == NULL) {
631 printf("Memory allocation failed\n");
632 exit(1);
633 }
634 a = posix_memalign((void **) &syn1nce_window, 128,
635 (long long) vocab_size * window_layer_size * sizeof(real));
636 if (syn1nce_window == NULL) {
637 printf("Memory allocation failed\n");
638 exit(1);
639 }
640 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
641 (long long) vocab_size * window_hidden_size * sizeof(real));
642 if (syn_hidden_word_nce == NULL) {
643 printf("Memory allocation failed\n");
644 exit(1);
645 }
646
647 for (a = 0; a < vocab_size; a++)
648 for (b = 0; b < layer1_size; b++)
649 syn1nce[a * layer1_size + b] = 0;
650 for (a = 0; a < vocab_size; a++)
651 for (b = 0; b < window_layer_size; b++)
652 syn1nce_window[a * window_layer_size + b] = 0;
653 for (a = 0; a < vocab_size; a++)
654 for (b = 0; b < window_hidden_size; b++)
655 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
656 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100657
Marc Kupietz1006a272016-03-16 15:50:20 +0100658 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100659 a = posix_memalign((void **) &syn_window_hidden, 128,
660 window_hidden_size * window_layer_size * sizeof(real));
661 if (syn_window_hidden == NULL) {
662 printf("Memory allocation failed\n");
663 exit(1);
664 }
665 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
666 next_random = next_random * (unsigned long long) 25214903917 + 11;
667 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
668 - 0.5) / (window_hidden_size * window_layer_size);
669 }
670 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100671
672 if (read_net_file[0] == 0) {
673 for (a = 0; a < vocab_size; a++)
674 for (b = 0; b < layer1_size; b++) {
675 next_random = next_random * (unsigned long long) 25214903917
676 + 11;
677 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
678 / (real) 65536) - 0.5) / layer1_size;
679 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100680 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100681 FILE *fnet = fopen(read_net_file, "rb");
682 if (fnet == NULL) {
683 printf("Net parameter file not found\n");
684 exit(1);
685 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100686 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
687 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
688 if(read != vocab_size * layer1_size) {
689 fprintf(stderr, "read-net failed %lld\n", read);
690 exit(-1);
691 }
692 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
693 if(read != (long long) vocab_size * window_layer_size) {
694 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
695 (long long) sizeof(real) * vocab_size * window_layer_size);
696 exit(-1);
697 }
698 fgetc(fnet);
699 if(!feof(fnet)) {
700 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
701 exit(-1);
702 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100703 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100704 } else {
705 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
706 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100707 }
708
709 CreateBinaryTree();
710}
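// Note: syn0 (the input embeddings) is initialised uniformly in
// (-0.5/layer1_size, 0.5/layer1_size] using the same LCG as the original word2vec
// (next_random = next_random * 25214903917 + 11), while all output matrices
// (syn1*, syn_hidden_*) start at zero. For the window architectures the output side
// has window_layer_size = 2 * window * layer1_size columns per word, one
// layer1_size-wide block per relative position. read-net/save-net bypass this
// initialisation, but only for type 3 with negative sampling.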
711
Marc Kupietz202723e2016-07-14 09:12:00 +0200712char *currentDateTime(char *buf, real offset) {
713 time_t t;
714 time(&t);
715 t += (long) offset;
716 struct tm tstruct;
717 tstruct = *localtime(&t);
718 strftime(buf, 80, "%c", &tstruct);
719 return buf;
720}
721
722void *MonitorThread(void *id) {
723 char *timebuf = malloc(80);;
724 int i, n=num_threads;
725 long long sum;
726 sleep(1);
727 while(n > 0) {
728 sleep(1);
729 sum = n = 0;
730 for(i=0; i < num_threads; i++) {
731 if(threadPos[i] >= 0) {
732 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
733 n++;
734 } else {
735 sum += iter * file_size / num_threads;
736 }
737 }
738 if(n == 0)
739 break;
740 real finished_portion = (real) sum / (float) (file_size * iter);
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100741 long long now = time(NULL);
742 long long elapsed = (now - start);
743 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
Marc Kupietz202723e2016-07-14 09:12:00 +0200744
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100745 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
Marc Kupietz202723e2016-07-14 09:12:00 +0200746 alpha,
747 finished_portion * 100,
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100748 (float) sum / elapsed / 1000,
Marc Kupietz202723e2016-07-14 09:12:00 +0200749 elapsed,
750 ttg,
751 currentDateTime(timebuf, ttg)
752 );
753 fflush(stdout);
754 }
755 pthread_exit(NULL);
756}
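// Note: the monitor thread estimates overall progress from threadPos/threadIters.
// For each worker it adds the bytes of already finished iterations
// ((iter - threadIters[i]) * file_size / num_threads) plus the offset inside the
// current slice, and divides the sum by file_size * iter. TE (time elapsed),
// TTG (time to go) and ETA are then a simple linear extrapolation of wall-clock time.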
757
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100758void *TrainModelThread(void *id) {
759 long long a, b, d, cw, word, last_word, sentence_length = 0,
760 sentence_position = 0;
761 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
762 long long l1, l2, c, target, label, local_iter = iter;
763 unsigned long long next_random = (long long) id;
764 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100765 int input_len_1 = layer1_size;
766 int window_offset = -1;
767 if (type == 2 || type == 4) {
768 input_len_1 = window_layer_size;
769 }
770 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
771 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200772 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100773
774 int input_len_2 = 0;
775 if (type == 4) {
776 input_len_2 = window_hidden_size;
777 }
778 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
779 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
780
781 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200782 long long start_pos = file_size / (long long) num_threads * (long long) id;
783 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
784 long long current_pos = start_pos;
785 long long last_pos = start_pos;
786 fseek(fi, start_pos, SEEK_SET);
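	// Each thread works on its own byte slice [start_pos, end_pos) of the training
	// file. Note: fseek() will usually land in the middle of a word; as in the
	// original word2vec this is tolerated, since at most one token per slice
	// boundary is read incorrectly.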
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100787 while (1) {
Marc Kupietz202723e2016-07-14 09:12:00 +0200788 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100789 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200790 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100791 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100792 alpha = starting_alpha
793 * (1 - word_count_actual / (real) (iter * train_words + 1));
794 if (alpha < starting_alpha * 0.0001)
795 alpha = starting_alpha * 0.0001;
796 }
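		// Note: the learning rate decays linearly with progress, as in the original
		// word2vec:
		//   alpha = starting_alpha * (1 - word_count_actual / (iter * train_words + 1)),
		// floored at 0.0001 * starting_alpha.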
797 if (sentence_length == 0) {
798 while (1) {
799 word = ReadWordIndex(fi);
800 if (feof(fi))
801 break;
802 if (word == -1)
803 continue;
804 word_count++;
805 if (word == 0)
806 break;
807 // The subsampling randomly discards frequent words while keeping the ranking same
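				// Note: with t = sample and f = cn/train_words (the word's relative
				// frequency), the formula below keeps an occurrence with probability
				//   p_keep = sqrt(t/f) + t/f   (capped at 1).
				// Example: t = 1e-3 and a word covering 1% of the corpus gives
				// sqrt(0.1) + 0.1 ~ 0.42, so roughly 58% of its occurrences are skipped.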
808 if (sample > 0) {
809 real ran = (sqrt(vocab[word].cn / (sample * train_words))
810 + 1) * (sample * train_words) / vocab[word].cn;
811 next_random = next_random * (unsigned long long) 25214903917
812 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100813 if (ran < (next_random & 0xFFFF) / (real) 65536) {
814 if(type == 3) // in structured skipgrams
815 word = -2; // keep the window position correct
816 else
817 continue;
818 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100819 }
820 sen[sentence_length] = word;
821 sentence_length++;
822 if (sentence_length >= MAX_SENTENCE_LENGTH)
823 break;
824 }
825 sentence_position = 0;
826 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200827 current_pos = threadPos[(long) id] = ftell(fi);
828 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100829 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200830 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100831 local_iter--;
832 if (local_iter == 0)
833 break;
Marc Kupietze423f732017-12-22 17:57:03 +0100834 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
835 printf("Magic stop file %s found. Stopping traing ...\n", magic_stop_file);
836 break;
837 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100838 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200839 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100840 last_word_count = 0;
841 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200842 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100843 continue;
844 }
845 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200846 while (word == -2 && sentence_position<sentence_length)
847 word = sen[++sentence_position];
848 if (sentence_position>=sentence_length) {
849 sentence_length=0;
850 continue;
851 }
852 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100853 continue;
854 for (c = 0; c < input_len_1; c++)
855 neu1[c] = 0;
856 for (c = 0; c < input_len_1; c++)
857 neu1e[c] = 0;
858 for (c = 0; c < input_len_2; c++)
859 neu2[c] = 0;
860 for (c = 0; c < input_len_2; c++)
861 neu2e[c] = 0;
862 next_random = next_random * (unsigned long long) 25214903917 + 11;
863 b = next_random % window;
864 if (type == 0) { //train the cbow architecture
865 // in -> hidden
866 cw = 0;
867 for (a = b; a < window * 2 + 1 - b; a++)
868 if (a != window) {
869 c = sentence_position - window + a;
870 if (c < 0)
871 continue;
872 if (c >= sentence_length)
873 continue;
874 last_word = sen[c];
875 if (last_word == -1)
876 continue;
877 for (c = 0; c < layer1_size; c++)
878 neu1[c] += syn0[c + last_word * layer1_size];
879 cw++;
880 }
881 if (cw) {
882 for (c = 0; c < layer1_size; c++)
883 neu1[c] /= cw;
884 if (hs)
885 for (d = 0; d < vocab[word].codelen; d++) {
886 f = 0;
887 l2 = vocab[word].point[d] * layer1_size;
888 // Propagate hidden -> output
889 for (c = 0; c < layer1_size; c++)
890 f += neu1[c] * syn1[c + l2];
891 if (f <= -MAX_EXP)
892 continue;
893 else if (f >= MAX_EXP)
894 continue;
895 else
896 f = expTable[(int) ((f + MAX_EXP)
897 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
898 // 'g' is the gradient multiplied by the learning rate
899 g = (1 - vocab[word].code[d] - f) * alpha;
900 // Propagate errors output -> hidden
901 for (c = 0; c < layer1_size; c++)
902 neu1e[c] += g * syn1[c + l2];
903 // Learn weights hidden -> output
904 for (c = 0; c < layer1_size; c++)
905 syn1[c + l2] += g * neu1[c];
906 if (cap == 1)
907 for (c = 0; c < layer1_size; c++)
908 capParam(syn1, c + l2);
909 }
910 // NEGATIVE SAMPLING
911 if (negative > 0)
912 for (d = 0; d < negative + 1; d++) {
913 if (d == 0) {
914 target = word;
915 label = 1;
916 } else {
917 next_random = next_random
918 * (unsigned long long) 25214903917 + 11;
919 if (word_to_group != NULL
920 && word_to_group[word] != -1) {
921 target = word;
922 while (target == word) {
923 target = group_to_table[word_to_group[word]
924 * table_size
925 + (next_random >> 16) % table_size];
926 next_random = next_random
927 * (unsigned long long) 25214903917
928 + 11;
929 }
930 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
931 } else {
932 target =
933 table[(next_random >> 16) % table_size];
934 }
935 if (target == 0)
936 target = next_random % (vocab_size - 1) + 1;
937 if (target == word)
938 continue;
939 label = 0;
940 }
941 l2 = target * layer1_size;
942 f = 0;
943 for (c = 0; c < layer1_size; c++)
944 f += neu1[c] * syn1neg[c + l2];
945 if (f > MAX_EXP)
946 g = (label - 1) * alpha;
947 else if (f < -MAX_EXP)
948 g = (label - 0) * alpha;
949 else
950 g = (label
951 - expTable[(int) ((f + MAX_EXP)
952 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
953 * alpha;
954 for (c = 0; c < layer1_size; c++)
955 neu1e[c] += g * syn1neg[c + l2];
956 for (c = 0; c < layer1_size; c++)
957 syn1neg[c + l2] += g * neu1[c];
958 if (cap == 1)
959 for (c = 0; c < layer1_size; c++)
960 capParam(syn1neg, c + l2);
961 }
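				// Note: for every (target, label) pair the update is plain logistic
				// regression: g = (label - sigmoid(f)) * alpha. sigmoid(f) is looked up
				// in expTable, which is filled in main() in the original word2vec (not
				// part of this excerpt) over (-MAX_EXP, MAX_EXP) in EXP_TABLE_SIZE steps;
				// values outside that range are clamped to 0 or 1, which is why the
				// f > MAX_EXP and f < -MAX_EXP branches exist.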
962 // Noise Contrastive Estimation
963 if (nce > 0)
964 for (d = 0; d < nce + 1; d++) {
965 if (d == 0) {
966 target = word;
967 label = 1;
968 } else {
969 next_random = next_random
970 * (unsigned long long) 25214903917 + 11;
971 if (word_to_group != NULL
972 && word_to_group[word] != -1) {
973 target = word;
974 while (target == word) {
975 target = group_to_table[word_to_group[word]
976 * table_size
977 + (next_random >> 16) % table_size];
978 next_random = next_random
979 * (unsigned long long) 25214903917
980 + 11;
981 }
982 } else {
983 target =
984 table[(next_random >> 16) % table_size];
985 }
986 if (target == 0)
987 target = next_random % (vocab_size - 1) + 1;
988 if (target == word)
989 continue;
990 label = 0;
991 }
992 l2 = target * layer1_size;
993 f = 0;
994
995 for (c = 0; c < layer1_size; c++)
996 f += neu1[c] * syn1nce[c + l2];
997 if (f > MAX_EXP)
998 g = (label - 1) * alpha;
999 else if (f < -MAX_EXP)
1000 g = (label - 0) * alpha;
1001 else {
1002 f = exp(f);
1003 g =
1004 (label
1005 - f
1006 / (noise_distribution[target]
1007 * nce + f)) * alpha;
1008 }
1009 for (c = 0; c < layer1_size; c++)
1010 neu1e[c] += g * syn1nce[c + l2];
1011 for (c = 0; c < layer1_size; c++)
1012 syn1nce[c + l2] += g * neu1[c];
1013 if (cap == 1)
1014 for (c = 0; c < layer1_size; c++)
1015 capParam(syn1nce, c + l2);
1016 }
1017 // hidden -> in
1018 for (a = b; a < window * 2 + 1 - b; a++)
1019 if (a != window) {
1020 c = sentence_position - window + a;
1021 if (c < 0)
1022 continue;
1023 if (c >= sentence_length)
1024 continue;
1025 last_word = sen[c];
1026 if (last_word == -1)
1027 continue;
1028 for (c = 0; c < layer1_size; c++)
1029 syn0[c + last_word * layer1_size] += neu1e[c];
1030 }
1031 }
1032 } else if (type == 1) { //train skip-gram
1033 for (a = b; a < window * 2 + 1 - b; a++)
1034 if (a != window) {
1035 c = sentence_position - window + a;
1036 if (c < 0)
1037 continue;
1038 if (c >= sentence_length)
1039 continue;
1040 last_word = sen[c];
1041 if (last_word == -1)
1042 continue;
1043 l1 = last_word * layer1_size;
1044 for (c = 0; c < layer1_size; c++)
1045 neu1e[c] = 0;
1046 // HIERARCHICAL SOFTMAX
1047 if (hs)
1048 for (d = 0; d < vocab[word].codelen; d++) {
1049 f = 0;
1050 l2 = vocab[word].point[d] * layer1_size;
1051 // Propagate hidden -> output
1052 for (c = 0; c < layer1_size; c++)
1053 f += syn0[c + l1] * syn1[c + l2];
1054 if (f <= -MAX_EXP)
1055 continue;
1056 else if (f >= MAX_EXP)
1057 continue;
1058 else
1059 f = expTable[(int) ((f + MAX_EXP)
1060 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1061 // 'g' is the gradient multiplied by the learning rate
1062 g = (1 - vocab[word].code[d] - f) * alpha;
1063 // Propagate errors output -> hidden
1064 for (c = 0; c < layer1_size; c++)
1065 neu1e[c] += g * syn1[c + l2];
1066 // Learn weights hidden -> output
1067 for (c = 0; c < layer1_size; c++)
1068 syn1[c + l2] += g * syn0[c + l1];
1069 if (cap == 1)
1070 for (c = 0; c < layer1_size; c++)
1071 capParam(syn1, c + l2);
1072 }
1073 // NEGATIVE SAMPLING
1074 if (negative > 0)
1075 for (d = 0; d < negative + 1; d++) {
1076 if (d == 0) {
1077 target = word;
1078 label = 1;
1079 } else {
1080 next_random = next_random
1081 * (unsigned long long) 25214903917 + 11;
1082 if (word_to_group != NULL
1083 && word_to_group[word] != -1) {
1084 target = word;
1085 while (target == word) {
1086 target =
1087 group_to_table[word_to_group[word]
1088 * table_size
1089 + (next_random >> 16)
1090 % table_size];
1091 next_random =
1092 next_random
1093 * (unsigned long long) 25214903917
1094 + 11;
1095 }
1096 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1097 } else {
1098 target = table[(next_random >> 16)
1099 % table_size];
1100 }
1101 if (target == 0)
1102 target = next_random % (vocab_size - 1) + 1;
1103 if (target == word)
1104 continue;
1105 label = 0;
1106 }
1107 l2 = target * layer1_size;
1108 f = 0;
1109 for (c = 0; c < layer1_size; c++)
1110 f += syn0[c + l1] * syn1neg[c + l2];
1111 if (f > MAX_EXP)
1112 g = (label - 1) * alpha;
1113 else if (f < -MAX_EXP)
1114 g = (label - 0) * alpha;
1115 else
1116 g =
1117 (label
1118 - expTable[(int) ((f + MAX_EXP)
1119 * (EXP_TABLE_SIZE
1120 / MAX_EXP / 2))])
1121 * alpha;
1122 for (c = 0; c < layer1_size; c++)
1123 neu1e[c] += g * syn1neg[c + l2];
1124 for (c = 0; c < layer1_size; c++)
1125 syn1neg[c + l2] += g * syn0[c + l1];
1126 if (cap == 1)
1127 for (c = 0; c < layer1_size; c++)
1128 capParam(syn1neg, c + l2);
1129 }
1130 //Noise Contrastive Estimation
1131 if (nce > 0)
1132 for (d = 0; d < nce + 1; d++) {
1133 if (d == 0) {
1134 target = word;
1135 label = 1;
1136 } else {
1137 next_random = next_random
1138 * (unsigned long long) 25214903917 + 11;
1139 if (word_to_group != NULL
1140 && word_to_group[word] != -1) {
1141 target = word;
1142 while (target == word) {
1143 target =
1144 group_to_table[word_to_group[word]
1145 * table_size
1146 + (next_random >> 16)
1147 % table_size];
1148 next_random =
1149 next_random
1150 * (unsigned long long) 25214903917
1151 + 11;
1152 }
1153 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1154 } else {
1155 target = table[(next_random >> 16)
1156 % table_size];
1157 }
1158 if (target == 0)
1159 target = next_random % (vocab_size - 1) + 1;
1160 if (target == word)
1161 continue;
1162 label = 0;
1163 }
1164 l2 = target * layer1_size;
1165 f = 0;
1166 for (c = 0; c < layer1_size; c++)
1167 f += syn0[c + l1] * syn1nce[c + l2];
1168 if (f > MAX_EXP)
1169 g = (label - 1) * alpha;
1170 else if (f < -MAX_EXP)
1171 g = (label - 0) * alpha;
1172 else {
1173 f = exp(f);
1174 g = (label
1175 - f
1176 / (noise_distribution[target]
1177 * nce + f)) * alpha;
1178 }
1179 for (c = 0; c < layer1_size; c++)
1180 neu1e[c] += g * syn1nce[c + l2];
1181 for (c = 0; c < layer1_size; c++)
1182 syn1nce[c + l2] += g * syn0[c + l1];
1183 if (cap == 1)
1184 for (c = 0; c < layer1_size; c++)
1185 capParam(syn1nce, c + l2);
1186 }
1187 // Learn weights input -> hidden
1188 for (c = 0; c < layer1_size; c++)
1189 syn0[c + l1] += neu1e[c];
1190 }
1191 } else if (type == 2) { //train the cwindow architecture
1192 // in -> hidden
1193 cw = 0;
1194 for (a = 0; a < window * 2 + 1; a++)
1195 if (a != window) {
1196 c = sentence_position - window + a;
1197 if (c < 0)
1198 continue;
1199 if (c >= sentence_length)
1200 continue;
1201 last_word = sen[c];
1202 if (last_word == -1)
1203 continue;
1204 window_offset = a * layer1_size;
1205 if (a > window)
1206 window_offset -= layer1_size;
1207 for (c = 0; c < layer1_size; c++)
1208 neu1[c + window_offset] += syn0[c
1209 + last_word * layer1_size];
1210 cw++;
1211 }
1212 if (cw) {
1213 if (hs)
1214 for (d = 0; d < vocab[word].codelen; d++) {
1215 f = 0;
1216 l2 = vocab[word].point[d] * window_layer_size;
1217 // Propagate hidden -> output
1218 for (c = 0; c < window_layer_size; c++)
1219 f += neu1[c] * syn1_window[c + l2];
1220 if (f <= -MAX_EXP)
1221 continue;
1222 else if (f >= MAX_EXP)
1223 continue;
1224 else
1225 f = expTable[(int) ((f + MAX_EXP)
1226 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1227 // 'g' is the gradient multiplied by the learning rate
1228 g = (1 - vocab[word].code[d] - f) * alpha;
1229 // Propagate errors output -> hidden
1230 for (c = 0; c < window_layer_size; c++)
1231 neu1e[c] += g * syn1_window[c + l2];
1232 // Learn weights hidden -> output
1233 for (c = 0; c < window_layer_size; c++)
1234 syn1_window[c + l2] += g * neu1[c];
1235 if (cap == 1)
1236 for (c = 0; c < window_layer_size; c++)
1237 capParam(syn1_window, c + l2);
1238 }
1239 // NEGATIVE SAMPLING
1240 if (negative > 0)
1241 for (d = 0; d < negative + 1; d++) {
1242 if (d == 0) {
1243 target = word;
1244 label = 1;
1245 } else {
1246 next_random = next_random
1247 * (unsigned long long) 25214903917 + 11;
1248 if (word_to_group != NULL
1249 && word_to_group[word] != -1) {
1250 target = word;
1251 while (target == word) {
1252 target = group_to_table[word_to_group[word]
1253 * table_size
1254 + (next_random >> 16) % table_size];
1255 next_random = next_random
1256 * (unsigned long long) 25214903917
1257 + 11;
1258 }
1259 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1260 } else {
1261 target =
1262 table[(next_random >> 16) % table_size];
1263 }
1264 if (target == 0)
1265 target = next_random % (vocab_size - 1) + 1;
1266 if (target == word)
1267 continue;
1268 label = 0;
1269 }
1270 l2 = target * window_layer_size;
1271 f = 0;
1272 for (c = 0; c < window_layer_size; c++)
1273 f += neu1[c] * syn1neg_window[c + l2];
1274 if (f > MAX_EXP)
1275 g = (label - 1) * alpha;
1276 else if (f < -MAX_EXP)
1277 g = (label - 0) * alpha;
1278 else
1279 g = (label
1280 - expTable[(int) ((f + MAX_EXP)
1281 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1282 * alpha;
1283 for (c = 0; c < window_layer_size; c++)
1284 neu1e[c] += g * syn1neg_window[c + l2];
1285 for (c = 0; c < window_layer_size; c++)
1286 syn1neg_window[c + l2] += g * neu1[c];
1287 if (cap == 1)
1288 for (c = 0; c < window_layer_size; c++)
1289 capParam(syn1neg_window, c + l2);
1290 }
1291 // Noise Contrastive Estimation
1292 if (nce > 0)
1293 for (d = 0; d < nce + 1; d++) {
1294 if (d == 0) {
1295 target = word;
1296 label = 1;
1297 } else {
1298 next_random = next_random
1299 * (unsigned long long) 25214903917 + 11;
1300 if (word_to_group != NULL
1301 && word_to_group[word] != -1) {
1302 target = word;
1303 while (target == word) {
1304 target = group_to_table[word_to_group[word]
1305 * table_size
1306 + (next_random >> 16) % table_size];
1307 next_random = next_random
1308 * (unsigned long long) 25214903917
1309 + 11;
1310 }
1311 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1312 } else {
1313 target =
1314 table[(next_random >> 16) % table_size];
1315 }
1316 if (target == 0)
1317 target = next_random % (vocab_size - 1) + 1;
1318 if (target == word)
1319 continue;
1320 label = 0;
1321 }
1322 l2 = target * window_layer_size;
1323 f = 0;
1324 for (c = 0; c < window_layer_size; c++)
1325 f += neu1[c] * syn1nce_window[c + l2];
1326 if (f > MAX_EXP)
1327 g = (label - 1) * alpha;
1328 else if (f < -MAX_EXP)
1329 g = (label - 0) * alpha;
1330 else {
1331 f = exp(f);
1332 g =
1333 (label
1334 - f
1335 / (noise_distribution[target]
1336 * nce + f)) * alpha;
1337 }
1338 for (c = 0; c < window_layer_size; c++)
1339 neu1e[c] += g * syn1nce_window[c + l2];
1340 for (c = 0; c < window_layer_size; c++)
1341 syn1nce_window[c + l2] += g * neu1[c];
1342 if (cap == 1)
1343 for (c = 0; c < window_layer_size; c++)
1344 capParam(syn1nce_window, c + l2);
1345 }
1346 // hidden -> in
1347 for (a = 0; a < window * 2 + 1; a++)
1348 if (a != window) {
1349 c = sentence_position - window + a;
1350 if (c < 0)
1351 continue;
1352 if (c >= sentence_length)
1353 continue;
1354 last_word = sen[c];
1355 if (last_word == -1)
1356 continue;
1357 window_offset = a * layer1_size;
1358 if (a > window)
1359 window_offset -= layer1_size;
1360 for (c = 0; c < layer1_size; c++)
1361 syn0[c + last_word * layer1_size] += neu1e[c
1362 + window_offset];
1363 }
1364 }
1365 } else if (type == 3) { //train structured skip-gram
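			// Note: structured skip-gram keeps a separate output block per relative
			// window position: window_offset selects one layer1_size-wide slice out of
			// the 2*window slices of syn1neg_window (positions to the right of the
			// centre are shifted down by one block, because a == window is the centre
			// word itself). This per-position output matrix is what makes the model
			// order-aware.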
1366 for (a = 0; a < window * 2 + 1; a++)
1367 if (a != window) {
1368 c = sentence_position - window + a;
1369 if (c < 0)
1370 continue;
1371 if (c >= sentence_length)
1372 continue;
1373 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001374 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001375 continue;
1376 l1 = last_word * layer1_size;
1377 window_offset = a * layer1_size;
1378 if (a > window)
1379 window_offset -= layer1_size;
1380 for (c = 0; c < layer1_size; c++)
1381 neu1e[c] = 0;
1382 // HIERARCHICAL SOFTMAX
1383 if (hs)
1384 for (d = 0; d < vocab[word].codelen; d++) {
1385 f = 0;
1386 l2 = vocab[word].point[d] * window_layer_size;
1387 // Propagate hidden -> output
1388 for (c = 0; c < layer1_size; c++)
1389 f += syn0[c + l1]
1390 * syn1_window[c + l2 + window_offset];
1391 if (f <= -MAX_EXP)
1392 continue;
1393 else if (f >= MAX_EXP)
1394 continue;
1395 else
1396 f = expTable[(int) ((f + MAX_EXP)
1397 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1398 // 'g' is the gradient multiplied by the learning rate
1399 g = (1 - vocab[word].code[d] - f) * alpha;
1400 // Propagate errors output -> hidden
1401 for (c = 0; c < layer1_size; c++)
1402 neu1e[c] += g
1403 * syn1_window[c + l2 + window_offset];
1404 // Learn weights hidden -> output
1405 for (c = 0; c < layer1_size; c++)
1406 syn1_window[c + l2 + window_offset] += g
1407 * syn0[c + l1];
1408 if (cap == 1)
1409 for (c = 0; c < layer1_size; c++)
1410 capParam(syn1_window, c + l2 + window_offset);
1411 }
1412 // NEGATIVE SAMPLING
1413 if (negative > 0)
1414 for (d = 0; d < negative + 1; d++) {
1415 if (d == 0) {
1416 target = word;
1417 label = 1;
1418 } else {
1419 next_random = next_random
1420 * (unsigned long long) 25214903917 + 11;
1421 if (word_to_group != NULL
1422 && word_to_group[word] != -1) {
1423 target = word;
1424 while (target == word) {
1425 target =
1426 group_to_table[word_to_group[word]
1427 * table_size
1428 + (next_random >> 16)
1429 % table_size];
1430 next_random =
1431 next_random
1432 * (unsigned long long) 25214903917
1433 + 11;
1434 }
1435 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1436 } else {
1437 target = table[(next_random >> 16)
1438 % table_size];
1439 }
1440 if (target == 0)
1441 target = next_random % (vocab_size - 1) + 1;
1442 if (target == word)
1443 continue;
1444 label = 0;
1445 }
1446 l2 = target * window_layer_size;
1447 f = 0;
1448 for (c = 0; c < layer1_size; c++)
1449 f +=
1450 syn0[c + l1]
1451 * syn1neg_window[c + l2
1452 + window_offset];
1453 if (f > MAX_EXP)
1454 g = (label - 1) * alpha;
1455 else if (f < -MAX_EXP)
1456 g = (label - 0) * alpha;
1457 else
1458 g =
1459 (label
1460 - expTable[(int) ((f + MAX_EXP)
1461 * (EXP_TABLE_SIZE
1462 / MAX_EXP / 2))])
1463 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001464 if(debug_mode > 2 && ((long long) id) == 0) {
1465 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1466 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1467 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001468 for (c = 0; c < layer1_size; c++)
1469 neu1e[c] +=
1470 g
1471 * syn1neg_window[c + l2
1472 + window_offset];
1473 for (c = 0; c < layer1_size; c++)
1474 syn1neg_window[c + l2 + window_offset] += g
1475 * syn0[c + l1];
1476 if (cap == 1)
1477 for (c = 0; c < layer1_size; c++)
1478 capParam(syn1neg_window,
1479 c + l2 + window_offset);
1480 }
1481 // Noise Contrastive Estimation
1482 if (nce > 0)
1483 for (d = 0; d < nce + 1; d++) {
1484 if (d == 0) {
1485 target = word;
1486 label = 1;
1487 } else {
1488 next_random = next_random
1489 * (unsigned long long) 25214903917 + 11;
1490 if (word_to_group != NULL
1491 && word_to_group[word] != -1) {
1492 target = word;
1493 while (target == word) {
1494 target =
1495 group_to_table[word_to_group[word]
1496 * table_size
1497 + (next_random >> 16)
1498 % table_size];
1499 next_random =
1500 next_random
1501 * (unsigned long long) 25214903917
1502 + 11;
1503 }
1504 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1505 } else {
1506 target = table[(next_random >> 16)
1507 % table_size];
1508 }
1509 if (target == 0)
1510 target = next_random % (vocab_size - 1) + 1;
1511 if (target == word)
1512 continue;
1513 label = 0;
1514 }
1515 l2 = target * window_layer_size;
1516 f = 0;
1517 for (c = 0; c < layer1_size; c++)
1518 f +=
1519 syn0[c + l1]
1520 * syn1nce_window[c + l2
1521 + window_offset];
1522 if (f > MAX_EXP)
1523 g = (label - 1) * alpha;
1524 else if (f < -MAX_EXP)
1525 g = (label - 0) * alpha;
1526 else {
1527 f = exp(f);
1528 g = (label
1529 - f
1530 / (noise_distribution[target]
1531 * nce + f)) * alpha;
1532 }
1533 for (c = 0; c < layer1_size; c++)
1534 neu1e[c] +=
1535 g
1536 * syn1nce_window[c + l2
1537 + window_offset];
1538 for (c = 0; c < layer1_size; c++)
1539 syn1nce_window[c + l2 + window_offset] += g
1540 * syn0[c + l1];
1541 if (cap == 1)
1542 for (c = 0; c < layer1_size; c++)
1543 capParam(syn1nce_window,
1544 c + l2 + window_offset);
1545 }
1546 // Learn weights input -> hidden
1547 for (c = 0; c < layer1_size; c++) {
1548 syn0[c + l1] += neu1e[c];
1549 if (syn0[c + l1] > 50)
1550 syn0[c + l1] = 50;
1551 if (syn0[c + l1] < -50)
1552 syn0[c + l1] = -50;
1553 }
1554 }
1555 } else if (type == 4) { //training senna
1556 // in -> hidden
1557 cw = 0;
1558 for (a = 0; a < window * 2 + 1; a++)
1559 if (a != window) {
1560 c = sentence_position - window + a;
1561 if (c < 0)
1562 continue;
1563 if (c >= sentence_length)
1564 continue;
1565 last_word = sen[c];
1566 if (last_word == -1)
1567 continue;
1568 window_offset = a * layer1_size;
1569 if (a > window)
1570 window_offset -= layer1_size;
1571 for (c = 0; c < layer1_size; c++)
1572 neu1[c + window_offset] += syn0[c
1573 + last_word * layer1_size];
1574 cw++;
1575 }
1576 if (cw) {
1577 for (a = 0; a < window_hidden_size; a++) {
1578 c = a * window_layer_size;
1579 for (b = 0; b < window_layer_size; b++) {
1580 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1581 }
1582 }
1583 if (hs)
1584 for (d = 0; d < vocab[word].codelen; d++) {
1585 f = 0;
1586 l2 = vocab[word].point[d] * window_hidden_size;
1587 // Propagate hidden -> output
1588 for (c = 0; c < window_hidden_size; c++)
1589 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1590 if (f <= -MAX_EXP)
1591 continue;
1592 else if (f >= MAX_EXP)
1593 continue;
1594 else
1595 f = expTable[(int) ((f + MAX_EXP)
1596 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1597 // 'g' is the gradient multiplied by the learning rate
1598 g = (1 - vocab[word].code[d] - f) * alpha;
1599 // Propagate errors output -> hidden
1600 for (c = 0; c < window_hidden_size; c++)
1601 neu2e[c] += dHardTanh(neu2[c], g) * g
1602 * syn_hidden_word[c + l2];
1603 // Learn weights hidden -> output
1604 for (c = 0; c < window_hidden_size; c++)
1605 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1606 * neu2[c];
1607 }
1608 // NEGATIVE SAMPLING
1609 if (negative > 0)
1610 for (d = 0; d < negative + 1; d++) {
1611 if (d == 0) {
1612 target = word;
1613 label = 1;
1614 } else {
1615 next_random = next_random
1616 * (unsigned long long) 25214903917 + 11;
1617 if (word_to_group != NULL
1618 && word_to_group[word] != -1) {
1619 target = word;
1620 while (target == word) {
1621 target = group_to_table[word_to_group[word]
1622 * table_size
1623 + (next_random >> 16) % table_size];
1624 next_random = next_random
1625 * (unsigned long long) 25214903917
1626 + 11;
1627 }
1628 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1629 } else {
1630 target =
1631 table[(next_random >> 16) % table_size];
1632 }
1633 if (target == 0)
1634 target = next_random % (vocab_size - 1) + 1;
1635 if (target == word)
1636 continue;
1637 label = 0;
1638 }
1639 l2 = target * window_hidden_size;
1640 f = 0;
1641 for (c = 0; c < window_hidden_size; c++)
1642 f += hardTanh(neu2[c])
1643 * syn_hidden_word_neg[c + l2];
1644 if (f > MAX_EXP)
1645 g = (label - 1) * alpha / negative;
1646 else if (f < -MAX_EXP)
1647 g = (label - 0) * alpha / negative;
1648 else
1649 g = (label
1650 - expTable[(int) ((f + MAX_EXP)
1651 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1652 * alpha / negative;
1653 for (c = 0; c < window_hidden_size; c++)
1654 neu2e[c] += dHardTanh(neu2[c], g) * g
1655 * syn_hidden_word_neg[c + l2];
1656 for (c = 0; c < window_hidden_size; c++)
1657 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1658 * g * neu2[c];
1659 }
1660 for (a = 0; a < window_hidden_size; a++)
1661 for (b = 0; b < window_layer_size; b++)
1662 neu1e[b] += neu2e[a]
1663 * syn_window_hidden[a * window_layer_size + b];
1664 for (a = 0; a < window_hidden_size; a++)
1665 for (b = 0; b < window_layer_size; b++)
1666 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1667 * neu1[b];
1668 // hidden -> in
1669 for (a = 0; a < window * 2 + 1; a++)
1670 if (a != window) {
1671 c = sentence_position - window + a;
1672 if (c < 0)
1673 continue;
1674 if (c >= sentence_length)
1675 continue;
1676 last_word = sen[c];
1677 if (last_word == -1)
1678 continue;
1679 window_offset = a * layer1_size;
1680 if (a > window)
1681 window_offset -= layer1_size;
1682 for (c = 0; c < layer1_size; c++)
1683 syn0[c + last_word * layer1_size] += neu1e[c
1684 + window_offset];
1685 }
1686 }
Marc Kupietz613edbf2018-01-11 21:38:03 +01001687 } else if(type == 5) {
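			// Note: type 5 does not train any vectors. For every co-occurrence it only
			// records (centre word, context word, relative position) in the external
			// collocator database (collocatordb, via inc_collocator), producing classic
			// position-wise collocation counts instead of embeddings.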
1688 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1689 c = sentence_position - window + a;
1690 if (c < 0) continue;
1691 if (c >= sentence_length) continue;
1692 last_word = sen[c];
1693 if (last_word == -1) continue;
1694 inc_collocator(cdb, word, last_word, a - window);
1695 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1696 // cw++;
1697 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001698 } else {
1699 printf("unknown type %i", type);
1700 exit(0);
1701 }
1702 sentence_position++;
1703 if (sentence_position >= sentence_length) {
1704 sentence_length = 0;
1705 continue;
1706 }
1707 }
1708 fclose(fi);
1709 free(neu1);
1710 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001711 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001712 pthread_exit(NULL);
1713}
1714
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001715void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001716 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001717 real f, max_f, maxmax_f;
Marc Kupietzf00e7b02023-12-22 11:11:56 +01001718 real *target_sums=0L, bestf[MAX_CC], worstbest;
Marc Kupietz71996e72016-03-18 13:40:24 +01001719 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001720 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001721 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1722
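	// Note: for every word d (starting at index cc) this prints, per window position,
	// the context word with the highest predicted probability under the type-3
	// negative-sampling output (syn1neg_window), accumulates per-target scores across
	// positions with the noisy-OR style update s_t += (1 - s_t) * f, and keeps the
	// N = 10 best (word, probability, position) triples overall. It is therefore only
	// meaningful for nets trained or loaded with type 3 and negative sampling.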
1723 for (d = cc; d < vocab_size; d++) {
1724 for (b = 0; b < vocab_size; b++)
1725 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001726 for (b = 0; b < N; b++)
1727 bestf[b]=-1;
1728 worstbest = -1;
1729
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001730 maxmax_f = -1;
1731 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001732 for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001733 if (a != window) {
1734 max_f = -1;
1735 window_offset = a * layer1_size;
1736 if (a > window)
1737 window_offset -= layer1_size;
1738 for(target = 0; target < vocab_size; target ++) {
1739 if(target == d)
1740 continue;
1741 f = 0;
1742 for (c = 0; c < layer1_size; c++)
1743 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1744 if (f < -MAX_EXP)
1745 continue;
1746 else if (f > MAX_EXP)
1747 continue;
1748 else
1749 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1750 if(f > max_f) {
1751 max_f = f;
1752 max_target = target;
1753 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001754 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001755 if(f > worstbest) {
1756 for (b = 0; b < N; b++) {
1757 if (f > bestf[b]) {
1758 for (e = N - 1; e > b; e--) {
1759 bestf[e] = bestf[e - 1];
1760 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001761 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001762 }
1763 bestf[b] = f;
1764 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001765 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001766 break;
1767 }
1768 }
1769 worstbest = bestf[N-1];
1770 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001771 }
1772 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1773 if(max_f > maxmax_f) {
1774 maxmax_f = max_f;
1775 maxmax_target = max_target;
1776 }
1777 } else {
1778 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1779 }
1780 }
1781 max_f = -1;
1782 for (b = 0; b < vocab_size; b++) {
1783 if(target_sums[b] > max_f) {
1784 max_f = target_sums[b];
1785 max_target = b;
1786 }
1787 }
1788 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001789 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001790 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001791 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001792 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001793 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001794 }
1795}
1796
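// TrainModel: build or load the vocabulary, initialize the net, optionally dump
// collocations (-show-cc), run the training threads, and finally write word
// vectors or k-means classes to the output file.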
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001797void TrainModel() {
1798 long a, b, c, d;
1799 FILE *fo;
1800    pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // one extra slot for the monitor thread
Marc Kupietz202723e2016-07-14 09:12:00 +02001801 threadPos = malloc(num_threads * sizeof(long long));
1802 threadIters = malloc(num_threads * sizeof(int));
1803 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001804 printf("Starting training using file %s\n", train_file);
1805 starting_alpha = alpha;
1806 if (read_vocab_file[0] != 0)
1807 ReadVocab();
1808 else
1809 LearnVocabFromTrainFile();
1810 if (save_vocab_file[0] != 0)
1811 SaveVocab();
1812 if (output_file[0] == 0)
1813 return;
1814 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001815 if(cc > 0)
1816 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001817 if (negative > 0 || nce > 0)
1818 InitUnigramTable();
1819 if (negative_classes_file[0] != 0)
1820 InitClassUnigramTable();
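    // record wall-clock and CPU start times so both real and user time can be reported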
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001821 start = time(NULL);
1822 start_clock = clock();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001823 for (a = 0; a < num_threads; a++)
1824 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
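    // in debug mode an extra monitor thread reports progress; it occupies slot pt[num_threads]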
Marc Kupietz202723e2016-07-14 09:12:00 +02001825 if(debug_mode > 1)
1826 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001827 for (a = 0; a < num_threads; a++)
1828 pthread_join(pt[a], NULL);
Marc Kupietz202723e2016-07-14 09:12:00 +02001829 if(debug_mode > 1) {
1830 pthread_join(pt[num_threads], NULL);
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001831 clock_t now = time(NULL);
1832 clock_t now_clock = clock();
1833 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
Marc Kupietz613edbf2018-01-11 21:38:03 +01001834    if(type == 5) // don't save vectors for classic collocators
1835 return;
Marc Kupietz202723e2016-07-14 09:12:00 +02001836 printf("Saving vectors to %s ...", output_file);
1837 fflush(stdout);
1838 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001839 fo = fopen(output_file, "wb");
1840 if (classes == 0) {
1841 // Save the word vectors
1842 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1843 for (a = 0; a < vocab_size; a++) {
1844 fprintf(fo, "%s ", vocab[a].word);
1845 if (binary)
1846 for (b = 0; b < layer1_size; b++)
1847 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1848 else
1849 for (b = 0; b < layer1_size; b++)
1850 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1851 fprintf(fo, "\n");
1852 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001853 if(debug_mode > 1)
1854 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001855 } else {
1856 // Run K-means on the word vectors
1857 int clcn = classes, iter = 10, closeid;
1858 int *centcn = (int *) malloc(classes * sizeof(int));
1859 int *cl = (int *) calloc(vocab_size, sizeof(int));
1860 real closev, x;
1861 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1862 for (a = 0; a < vocab_size; a++)
1863 cl[a] = a % clcn;
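        // 10 rounds of spherical k-means: accumulate centroids, L2-normalize them,
        // then reassign each word to the centroid with the largest dot product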
1864 for (a = 0; a < iter; a++) {
1865 for (b = 0; b < clcn * layer1_size; b++)
1866 cent[b] = 0;
1867 for (b = 0; b < clcn; b++)
1868 centcn[b] = 1;
1869 for (c = 0; c < vocab_size; c++) {
1870 for (d = 0; d < layer1_size; d++)
1871 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1872 centcn[cl[c]]++;
1873 }
1874 for (b = 0; b < clcn; b++) {
1875 closev = 0;
1876 for (c = 0; c < layer1_size; c++) {
1877 cent[layer1_size * b + c] /= centcn[b];
1878 closev += cent[layer1_size * b + c]
1879 * cent[layer1_size * b + c];
1880 }
1881 closev = sqrt(closev);
1882 for (c = 0; c < layer1_size; c++)
1883 cent[layer1_size * b + c] /= closev;
1884 }
1885 for (c = 0; c < vocab_size; c++) {
1886 closev = -10;
1887 closeid = 0;
1888 for (d = 0; d < clcn; d++) {
1889 x = 0;
1890 for (b = 0; b < layer1_size; b++)
1891 x += cent[layer1_size * d + b]
1892 * syn0[c * layer1_size + b];
1893 if (x > closev) {
1894 closev = x;
1895 closeid = d;
1896 }
1897 }
1898 cl[c] = closeid;
1899 }
1900 }
1901 // Save the K-means classes
1902 for (a = 0; a < vocab_size; a++)
1903 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1904 free(centcn);
1905 free(cent);
1906 free(cl);
1907 }
1908 fclose(fo);
1909 if (save_net_file[0] != 0)
1910 SaveNet();
1911}
1912
1913int ArgPos(char *str, int argc, char **argv) {
1914 int a;
1915 for (a = 1; a < argc; a++)
1916 if (!strcmp(str, argv[a])) {
1917 if (a == argc - 1) {
1918 printf("Argument missing for %s\n", str);
1919 exit(1);
1920 }
1921 return a;
1922 }
1923 return -1;
1924}
1925
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001926void print_help() {
Marc Kupietz83a67d42021-03-22 17:29:36 +01001927 printf("WORD VECTOR estimation toolkit v 0.9.0\n\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001928 printf("Options:\n");
1929 printf("Parameters for training:\n");
1930 printf("\t-train <file>\n");
1931 printf("\t\tUse text data from <file> to train the model\n");
1932 printf("\t-output <file>\n");
1933 printf(
1934 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1935 printf("\t-size <int>\n");
1936 printf("\t\tSet size of word vectors; default is 100\n");
1937 printf("\t-window <int>\n");
1938 printf("\t\tSet max skip length between words; default is 5\n");
1939 printf("\t-sample <float>\n");
1940 printf(
1941 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1942 printf(
1943 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1944 printf("\t-hs <int>\n");
1945 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1946 printf("\t-negative <int>\n");
1947 printf(
1948 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1949 printf("\t-negative-classes <file>\n");
1950 printf("\t\tNegative classes to sample from\n");
1951 printf("\t-nce <int>\n");
1952 printf(
1953 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1954 printf("\t-threads <int>\n");
1955 printf("\t\tUse <int> threads (default 12)\n");
1956 printf("\t-iter <int>\n");
1957 printf("\t\tRun more training iterations (default 5)\n");
1958 printf("\t-min-count <int>\n");
1959 printf(
1960 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1961 printf("\t-alpha <float>\n");
1962 printf(
1963 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1964 printf("\t-classes <int>\n");
1965 printf(
1966 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1967 printf("\t-debug <int>\n");
1968 printf(
1969 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1970 printf("\t-binary <int>\n");
1971 printf(
1972 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1973 printf("\t-save-vocab <file>\n");
1974 printf("\t\tThe vocabulary will be saved to <file>\n");
1975 printf("\t-read-vocab <file>\n");
1976 printf(
1977 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1978 printf("\t-read-net <file>\n");
1979 printf(
1980 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1981 printf("\t-save-net <file>\n");
1982 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietze423f732017-12-22 17:57:03 +01001983 printf("\t-magic-stop-file <file>\n");
1984 printf("\t\tIf the magic file <file> exists training will stop after the current cycle.\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001985 printf("\t-show-cc <int>\n");
1986 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001987 printf("\t-type <int>\n");
1988 printf(
Marc Kupietz613edbf2018-01-11 21:38:03 +01001989          "\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type, 5 for storing positional bigrams)\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001990 printf("\t-cap <int>\n");
1991 printf(
1992 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1993 printf("\nExamples:\n");
1994 printf(
Marc Kupietz83a67d42021-03-22 17:29:36 +01001995 "./dereko2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001996}
1997
1998int main(int argc, char **argv) {
1999 int i;
2000 setlocale(LC_ALL, "");
2001 if (argc == 1) {
2002 print_help();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002003 return 0;
2004 }
2005 output_file[0] = 0;
2006 save_vocab_file[0] = 0;
2007 read_vocab_file[0] = 0;
2008 save_net_file[0] = 0;
2009 read_net_file[0] = 0;
2010 negative_classes_file[0] = 0;
Marc Kupietzc7f773b2017-12-02 12:04:03 +01002011 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2012 print_help();
2013 return(0);
2014 }
2015 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2016 print_help();
2017 return(0);
2018 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002019 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2020 layer1_size = atoi(argv[i + 1]);
2021 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2022 strcpy(train_file, argv[i + 1]);
2023 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2024 strcpy(save_vocab_file, argv[i + 1]);
2025 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2026 strcpy(read_vocab_file, argv[i + 1]);
2027 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2028 strcpy(save_net_file, argv[i + 1]);
2029 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2030 strcpy(read_net_file, argv[i + 1]);
Marc Kupietze423f732017-12-22 17:57:03 +01002031 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2032 strcpy(magic_stop_file, argv[i + 1]);
2033 if (access(magic_stop_file, F_OK ) != -1) {
2034 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2035 exit(1);
2036 }
2037 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002038 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2039 debug_mode = atoi(argv[i + 1]);
2040 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2041 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002042 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2043 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002044 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2045 type = atoi(argv[i + 1]);
2046 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2047 strcpy(output_file, argv[i + 1]);
2048 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2049 window = atoi(argv[i + 1]);
2050 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2051 sample = atof(argv[i + 1]);
2052 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2053 hs = atoi(argv[i + 1]);
2054 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2055 negative = atoi(argv[i + 1]);
2056 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2057 strcpy(negative_classes_file, argv[i + 1]);
2058 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2059 nce = atoi(argv[i + 1]);
2060 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2061 num_threads = atoi(argv[i + 1]);
2062 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2063 iter = atoi(argv[i + 1]);
2064 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2065 min_count = atoi(argv[i + 1]);
2066 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2067 classes = atoi(argv[i + 1]);
2068 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2069 cap = atoi(argv[i + 1]);
2070 if (type == 0 || type == 2 || type == 4)
2071 alpha = 0.05;
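    // type 5 streams positional co-occurrences into a collocator database instead of training vectors; subsampling is disabled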
Marc Kupietz613edbf2018-01-11 21:38:03 +01002072 if (type==5) {
2073 sample = 0;
2074 cdb = open_collocatordb_for_write(output_file);
2075 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002076 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2077 alpha = atof(argv[i + 1]);
2078 vocab = (struct vocab_word *) calloc(vocab_max_size,
2079 sizeof(struct vocab_word));
2080 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2081 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2082 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2083 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2084 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
2085 }
Marc Kupietz210b9d52016-04-02 21:48:13 +02002086 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002087 TrainModel();
2088 return 0;
2089}
2090