Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
21#include <pthread.h>
Marc Kupietz613edbf2018-01-11 21:38:03 +010022#include <collocatordb.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010023
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010028#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010029#define MAX_CODE_LENGTH 40
30
31const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
32
33typedef float real; // Precision of float numbers
34
35struct vocab_word {
36 long long cn;
37 int *point;
38 char *word, *code, codelen;
39};
40
41char train_file[MAX_STRING], output_file[MAX_STRING];
42char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
43char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
Marc Kupietze423f732017-12-22 17:57:03 +010044char magic_stop_file[MAX_STRING];
45
Marc Kupietzd6f9c712016-03-16 11:50:56 +010046struct vocab_word *vocab;
47int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietzc2731b22016-07-14 08:56:14 +020048 num_threads = 12, min_reduce = 1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010049int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020050long long *threadPos;
51int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010052long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
53long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
54 classes = 0;
55real alpha = 0.025, starting_alpha, sample = 1e-3;
56real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020057real avgWordLength=0;
Marc Kupietzb366bcd2018-01-11 21:29:41 +010058clock_t start, start_clock;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010059
60real *syn1_window, *syn1neg_window, *syn1nce_window;
61int w_offset, window_layer_size;
62
63int window_hidden_size = 500;
64real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
65 *syn_hidden_word_nce;
66
67int hs = 0, negative = 5;
68const int table_size = 1e8;
69int *table;
70
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010071long cc = 0;
72
Marc Kupietzd6f9c712016-03-16 11:50:56 +010073// contrastive negative sampling
74char negative_classes_file[MAX_STRING];
75int *word_to_group;
76int *group_to_table; //group_size*table_size
77int class_number;
78
79//nce
80real* noise_distribution;
81int nce = 0;
82
83//param caps
84real CAP_VALUE = 50;
85int cap = 0;
86
Marc Kupietz613edbf2018-01-11 21:38:03 +010087COLLOCATORDB *cdb = NULL;
88
Marc Kupietzd6f9c712016-03-16 11:50:56 +010089void capParam(real* array, int index) {
90 if (array[index] > CAP_VALUE)
91 array[index] = CAP_VALUE;
92 else if (array[index] < -CAP_VALUE)
93 array[index] = -CAP_VALUE;
94}
95
96real hardTanh(real x) {
97 if (x >= 1) {
98 return 1;
99 } else if (x <= -1) {
100 return -1;
101 } else {
102 return x;
103 }
104}
105
106real dHardTanh(real x, real g) {
107 if (x > 1 && g > 0) {
108 return 0;
109 }
110 if (x < -1 && g < 0) {
111 return 0;
112 }
113 return 1;
114}
115
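// Build the sampling table for negative sampling: each word w occupies a share of the
// table proportional to cn(w)^0.75, so drawing a uniform random slot yields the smoothed
// unigram distribution. The same distribution is stored in noise_distribution for NCE.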
116void InitUnigramTable() {
117 int a, i;
118 long long train_words_pow = 0;
119 real d1, power = 0.75;
120 table = (int *) malloc(table_size * sizeof(int));
121 for (a = 0; a < vocab_size; a++)
122 train_words_pow += pow(vocab[a].cn, power);
123 i = 0;
124 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
125 for (a = 0; a < table_size; a++) {
126 table[a] = i;
127 if (a / (real) table_size > d1) {
128 i++;
129 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
130 }
131 if (i >= vocab_size)
132 i = vocab_size - 1;
133 }
134
135 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
136 for (a = 0; a < vocab_size; a++)
137 noise_distribution[a] = pow(vocab[a].cn, power)
138 / (real) train_words_pow;
139}
140
141// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
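// A newline is returned as the sentinel token "</s>" (vocabulary index 0), which later
// marks the end of a sentence in the training loop.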
142void ReadWord(char *word, FILE *fin) {
143 int a = 0, ch;
144 while (!feof(fin)) {
145 ch = fgetc(fin);
146 if (ch == 13)
147 continue;
148 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
149 if (a > 0) {
150 if (ch == '\n')
151 ungetc(ch, fin);
152 break;
153 }
154 if (ch == '\n') {
155 strcpy(word, (char *) "</s>");
156 return;
157 } else
158 continue;
159 }
160 word[a] = ch;
161 a++;
162 if (a >= MAX_STRING - 1)
 163 a--; // Truncate words that are too long
164 }
165 word[a] = 0;
166}
167
168// Returns hash value of a word
169int GetWordHash(char *word) {
170 unsigned long long a, hash = 0;
171 for (a = 0; a < strlen(word); a++)
172 hash = hash * 257 + word[a];
173 hash = hash % vocab_hash_size;
174 return hash;
175}
176
177// Returns position of a word in the vocabulary; if the word is not found, returns -1
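// The vocabulary hash table uses open addressing with linear probing: vocab_hash[h]
// holds an index into vocab[], or -1 for an empty slot.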
178int SearchVocab(char *word) {
179 unsigned int hash = GetWordHash(word);
180 while (1) {
181 if (vocab_hash[hash] == -1)
182 return -1;
183 if (!strcmp(word, vocab[vocab_hash[hash]].word))
184 return vocab_hash[hash];
185 hash = (hash + 1) % vocab_hash_size;
186 }
187 return -1;
188}
189
190// Reads a word and returns its index in the vocabulary
191int ReadWordIndex(FILE *fin) {
192 char word[MAX_STRING];
193 ReadWord(word, fin);
194 if (feof(fin))
195 return -1;
196 return SearchVocab(word);
197}
198
199// Adds a word to the vocabulary
200int AddWordToVocab(char *word) {
201 unsigned int hash, length = strlen(word) + 1;
202 if (length > MAX_STRING)
203 length = MAX_STRING;
204 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
205 strcpy(vocab[vocab_size].word, word);
206 vocab[vocab_size].cn = 0;
207 vocab_size++;
208 // Reallocate memory if needed
209 if (vocab_size + 2 >= vocab_max_size) {
210 vocab_max_size += 1000;
211 vocab = (struct vocab_word *) realloc(vocab,
212 vocab_max_size * sizeof(struct vocab_word));
213 }
214 hash = GetWordHash(word);
215 while (vocab_hash[hash] != -1)
216 hash = (hash + 1) % vocab_hash_size;
217 vocab_hash[hash] = vocab_size - 1;
218 return vocab_size - 1;
219}
220
221// Used later for sorting by word counts
222int VocabCompare(const void *a, const void *b) {
223 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
224}
225
226// Sorts the vocabulary by frequency using word counts
227void SortVocab() {
228 int a, size;
229 unsigned int hash;
230 // Sort the vocabulary and keep </s> at the first position
231 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
232 for (a = 0; a < vocab_hash_size; a++)
233 vocab_hash[a] = -1;
234 size = vocab_size;
235 train_words = 0;
236 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200237 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100238 // Words occurring less than min_count times will be discarded from the vocab
239 if ((vocab[a].cn < min_count) && (a != 0)) {
240 vocab_size--;
241 free(vocab[a].word);
242 } else {
 243 // Hash has to be re-computed, as it is no longer valid after the sorting
244 hash = GetWordHash(vocab[a].word);
245 while (vocab_hash[hash] != -1)
246 hash = (hash + 1) % vocab_hash_size;
247 vocab_hash[hash] = a;
248 train_words += vocab[a].cn;
249 }
250 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200251 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100252 vocab = (struct vocab_word *) realloc(vocab,
253 (vocab_size + 1) * sizeof(struct vocab_word));
254 // Allocate memory for the binary tree construction
255 for (a = 0; a < vocab_size; a++) {
256 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
257 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
258 }
259}
260
261// Reduces the vocabulary by removing infrequent tokens
262void ReduceVocab() {
263 int a, b = 0;
264 unsigned int hash;
265 for (a = 0; a < vocab_size; a++)
266 if (vocab[a].cn > min_reduce) {
267 vocab[b].cn = vocab[a].cn;
268 vocab[b].word = vocab[a].word;
269 b++;
270 } else
271 free(vocab[a].word);
272 vocab_size = b;
273 for (a = 0; a < vocab_hash_size; a++)
274 vocab_hash[a] = -1;
275 for (a = 0; a < vocab_size; a++) {
 276 // Hash has to be re-computed, as it is no longer valid
277 hash = GetWordHash(vocab[a].word);
278 while (vocab_hash[hash] != -1)
279 hash = (hash + 1) % vocab_hash_size;
280 vocab_hash[hash] = a;
281 }
282 fflush(stdout);
283 min_reduce++;
284}
285
286// Create binary Huffman tree using the word counts
 287// Frequent words will have short unique binary codes
288void CreateBinaryTree() {
289 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
290 char code[MAX_CODE_LENGTH];
291 long long *count = (long long *) calloc(vocab_size * 2 + 1,
292 sizeof(long long));
293 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
294 sizeof(long long));
295 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
296 sizeof(long long));
297 for (a = 0; a < vocab_size; a++)
298 count[a] = vocab[a].cn;
299 for (a = vocab_size; a < vocab_size * 2; a++)
300 count[a] = 1e15;
301 pos1 = vocab_size - 1;
302 pos2 = vocab_size;
303 // Following algorithm constructs the Huffman tree by adding one node at a time
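// count[0..vocab_size-1] holds the leaf counts in descending order, count[vocab_size..]
// the counts of newly created internal nodes; pos1 walks left over the leaves and pos2
// right over the internal nodes, so the two smallest unmerged nodes are found in O(1).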
304 for (a = 0; a < vocab_size - 1; a++) {
305 // First, find two smallest nodes 'min1, min2'
306 if (pos1 >= 0) {
307 if (count[pos1] < count[pos2]) {
308 min1i = pos1;
309 pos1--;
310 } else {
311 min1i = pos2;
312 pos2++;
313 }
314 } else {
315 min1i = pos2;
316 pos2++;
317 }
318 if (pos1 >= 0) {
319 if (count[pos1] < count[pos2]) {
320 min2i = pos1;
321 pos1--;
322 } else {
323 min2i = pos2;
324 pos2++;
325 }
326 } else {
327 min2i = pos2;
328 pos2++;
329 }
330 count[vocab_size + a] = count[min1i] + count[min2i];
331 parent_node[min1i] = vocab_size + a;
332 parent_node[min2i] = vocab_size + a;
333 binary[min2i] = 1;
334 }
335 // Now assign binary code to each vocabulary word
336 for (a = 0; a < vocab_size; a++) {
337 b = a;
338 i = 0;
339 while (1) {
340 code[i] = binary[b];
341 point[i] = b;
342 i++;
343 b = parent_node[b];
344 if (b == vocab_size * 2 - 2)
345 break;
346 }
347 vocab[a].codelen = i;
348 vocab[a].point[0] = vocab_size - 2;
349 for (b = 0; b < i; b++) {
350 vocab[a].code[i - b - 1] = code[b];
351 vocab[a].point[i - b] = point[b] - vocab_size;
352 }
353 }
354 free(count);
355 free(binary);
356 free(parent_node);
357}
358
359void LearnVocabFromTrainFile() {
360 char word[MAX_STRING];
361 FILE *fin;
362 long long a, i;
363 for (a = 0; a < vocab_hash_size; a++)
364 vocab_hash[a] = -1;
365 fin = fopen(train_file, "rb");
366 if (fin == NULL) {
367 printf("ERROR: training data file not found!\n");
368 exit(1);
369 }
370 vocab_size = 0;
371 AddWordToVocab((char *) "</s>");
372 while (1) {
373 ReadWord(word, fin);
374 if (feof(fin))
375 break;
376 train_words++;
377 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
378 printf("%lldK%c", train_words / 1000, 13);
379 fflush(stdout);
380 }
381 i = SearchVocab(word);
382 if (i == -1) {
383 a = AddWordToVocab(word);
384 vocab[a].cn = 1;
385 } else
386 vocab[i].cn++;
387 if (vocab_size > vocab_hash_size * 0.7)
388 ReduceVocab();
389 }
390 SortVocab();
391 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200392 printf("Vocab size: %'lld\n", vocab_size);
393 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100394 }
395 file_size = ftell(fin);
396 fclose(fin);
397}
398
399void SaveVocab() {
400 long long i;
401 FILE *fo = fopen(save_vocab_file, "wb");
402 for (i = 0; i < vocab_size; i++)
403 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
404 fclose(fo);
405}
406
407void ReadVocab() {
408 long long a, i = 0;
409 char c;
410 char word[MAX_STRING];
411 FILE *fin = fopen(read_vocab_file, "rb");
412 if (fin == NULL) {
413 printf("Vocabulary file not found\n");
414 exit(1);
415 }
416 for (a = 0; a < vocab_hash_size; a++)
417 vocab_hash[a] = -1;
418 vocab_size = 0;
419 while (1) {
420 ReadWord(word, fin);
421 if (feof(fin))
422 break;
423 a = AddWordToVocab(word);
424 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
425 i++;
426 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200427 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100428 fin = fopen(train_file, "rb");
429 if (fin == NULL) {
430 printf("ERROR: training data file not found!\n");
431 exit(1);
432 }
433 fseek(fin, 0, SEEK_END);
434 file_size = ftell(fin);
435 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200436 SortVocab();
437 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200438 printf("Vocab size: %'lld\n", vocab_size);
439 printf("Words in vocab's train file: %'lld\n", train_words);
440 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200441 }
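// The training file is not scanned when the vocabulary is read from disk, so the number
// of tokens is estimated from the file size and the average word length (including the
// separating whitespace) accumulated in SortVocab().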
Marc Kupietze23c5402016-07-14 11:10:09 +0200442 train_words = file_size / avgWordLength;
443 if(debug_mode > 0)
444 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100445}
446
447void InitClassUnigramTable() {
448 long long a, c;
449 printf("loading class unigrams \n");
450 FILE *fin = fopen(negative_classes_file, "rb");
451 if (fin == NULL) {
452 printf("ERROR: class file not found!\n");
453 exit(1);
454 }
455 word_to_group = (int *) malloc(vocab_size * sizeof(int));
456 for (a = 0; a < vocab_size; a++)
457 word_to_group[a] = -1;
458 char class[MAX_STRING];
459 char prev_class[MAX_STRING];
460 prev_class[0] = 0;
461 char word[MAX_STRING];
462 class_number = -1;
463 while (1) {
464 if (feof(fin))
465 break;
466 ReadWord(class, fin);
467 ReadWord(word, fin);
468 int word_index = SearchVocab(word);
469 if (word_index != -1) {
470 if (strcmp(class, prev_class) != 0) {
471 class_number++;
472 strcpy(prev_class, class);
473 }
474 word_to_group[word_index] = class_number;
475 }
476 ReadWord(word, fin);
477 }
478 class_number++;
479 fclose(fin);
480
481 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
482 long long train_words_pow = 0;
483 real d1, power = 0.75;
484
485 for (c = 0; c < class_number; c++) {
486 long long offset = c * table_size;
487 train_words_pow = 0;
488 for (a = 0; a < vocab_size; a++)
489 if (word_to_group[a] == c)
490 train_words_pow += pow(vocab[a].cn, power);
491 int i = 0;
 492 while (i < vocab_size && word_to_group[i] != c)
493 i++;
494 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
495 for (a = 0; a < table_size; a++) {
496 //printf("index %lld , word %d\n", a, i);
497 group_to_table[offset + a] = i;
498 if (a / (real) table_size > d1) {
499 i++;
 500 while (i < vocab_size && word_to_group[i] != c)
501 i++;
502 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
503 }
504 if (i >= vocab_size)
 505 while (--i >= 0 && word_to_group[i] != c)
 506 ;
507 }
508 }
509}
510
Marc Kupietz210b9d52016-04-02 21:48:13 +0200511void SaveArgs(int argc, char **argv) {
512 unsigned int i;
Marc Kupietz44136742017-12-22 17:52:56 +0100513 char args_file[MAX_STRING];
514 strcpy(args_file, output_file);
Marc Kupietz210b9d52016-04-02 21:48:13 +0200515 strcat(args_file, ".args");
516 FILE *fargs = fopen(args_file, "w");
517 if (fargs == NULL) {
518 printf("Cannot save args to %s.\n", args_file);
519 return;
520 }
521
Marc Kupietz44136742017-12-22 17:52:56 +0100522 for(i=1; i<argc; i++)
523 fprintf(fargs, "%s ", argv[i]);
524
525 fprintf(fargs, "\n");
Marc Kupietz210b9d52016-04-02 21:48:13 +0200526 fclose(fargs);
Marc Kupietz44136742017-12-22 17:52:56 +0100527
Marc Kupietz210b9d52016-04-02 21:48:13 +0200528 return;
529}
530
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100531void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100532 if(type != 3 || negative <= 0) {
533 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
534 return;
535 }
536
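// Net file layout: vocab_size * layer1_size input vectors (syn0) followed by
// vocab_size * window_layer_size position-dependent output vectors (syn1neg_window),
// both written as raw arrays of 'real'. InitNet() expects the same layout when
// read_net_file is given.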
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100537 FILE *fnet = fopen(save_net_file, "wb");
538 if (fnet == NULL) {
539 printf("Net parameter file not found\n");
540 exit(1);
541 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100542 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100543 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100544 fclose(fnet);
545}
546
547void InitNet() {
548 long long a, b;
549 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100550 long long read;
551
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100552 window_layer_size = layer1_size * window * 2;
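// For the position-dependent models (types 2-4) each of the 2*window relative positions
// gets its own block of layer1_size output weights, hence window_layer_size.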
553 a = posix_memalign((void **) &syn0, 128,
554 (long long) vocab_size * layer1_size * sizeof(real));
555 if (syn0 == NULL) {
556 printf("Memory allocation failed\n");
557 exit(1);
558 }
559
560 if (hs) {
561 a = posix_memalign((void **) &syn1, 128,
562 (long long) vocab_size * layer1_size * sizeof(real));
563 if (syn1 == NULL) {
564 printf("Memory allocation failed\n");
565 exit(1);
566 }
567 a = posix_memalign((void **) &syn1_window, 128,
568 (long long) vocab_size * window_layer_size * sizeof(real));
569 if (syn1_window == NULL) {
570 printf("Memory allocation failed\n");
571 exit(1);
572 }
573 a = posix_memalign((void **) &syn_hidden_word, 128,
574 (long long) vocab_size * window_hidden_size * sizeof(real));
575 if (syn_hidden_word == NULL) {
576 printf("Memory allocation failed\n");
577 exit(1);
578 }
579
580 for (a = 0; a < vocab_size; a++)
581 for (b = 0; b < layer1_size; b++)
582 syn1[a * layer1_size + b] = 0;
583 for (a = 0; a < vocab_size; a++)
584 for (b = 0; b < window_layer_size; b++)
585 syn1_window[a * window_layer_size + b] = 0;
586 for (a = 0; a < vocab_size; a++)
587 for (b = 0; b < window_hidden_size; b++)
588 syn_hidden_word[a * window_hidden_size + b] = 0;
589 }
590 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100591 if(type == 0) {
592 a = posix_memalign((void **) &syn1neg, 128,
593 (long long) vocab_size * layer1_size * sizeof(real));
594 if (syn1neg == NULL) {
595 printf("Memory allocation failed\n");
596 exit(1);
597 }
598 for (a = 0; a < vocab_size; a++)
599 for (b = 0; b < layer1_size; b++)
600 syn1neg[a * layer1_size + b] = 0;
601 } else if (type == 3) {
602 a = posix_memalign((void **) &syn1neg_window, 128,
603 (long long) vocab_size * window_layer_size * sizeof(real));
604 if (syn1neg_window == NULL) {
605 printf("Memory allocation failed\n");
606 exit(1);
607 }
608 for (a = 0; a < vocab_size; a++)
609 for (b = 0; b < window_layer_size; b++)
610 syn1neg_window[a * window_layer_size + b] = 0;
611 } else if (type == 4) {
612 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
613 (long long) vocab_size * window_hidden_size * sizeof(real));
614 if (syn_hidden_word_neg == NULL) {
615 printf("Memory allocation failed\n");
616 exit(1);
617 }
618 for (a = 0; a < vocab_size; a++)
619 for (b = 0; b < window_hidden_size; b++)
620 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100621 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100622 }
623 if (nce > 0) {
624 a = posix_memalign((void **) &syn1nce, 128,
625 (long long) vocab_size * layer1_size * sizeof(real));
626 if (syn1nce == NULL) {
627 printf("Memory allocation failed\n");
628 exit(1);
629 }
630 a = posix_memalign((void **) &syn1nce_window, 128,
631 (long long) vocab_size * window_layer_size * sizeof(real));
632 if (syn1nce_window == NULL) {
633 printf("Memory allocation failed\n");
634 exit(1);
635 }
636 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
637 (long long) vocab_size * window_hidden_size * sizeof(real));
638 if (syn_hidden_word_nce == NULL) {
639 printf("Memory allocation failed\n");
640 exit(1);
641 }
642
643 for (a = 0; a < vocab_size; a++)
644 for (b = 0; b < layer1_size; b++)
645 syn1nce[a * layer1_size + b] = 0;
646 for (a = 0; a < vocab_size; a++)
647 for (b = 0; b < window_layer_size; b++)
648 syn1nce_window[a * window_layer_size + b] = 0;
649 for (a = 0; a < vocab_size; a++)
650 for (b = 0; b < window_hidden_size; b++)
651 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
652 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100653
Marc Kupietz1006a272016-03-16 15:50:20 +0100654 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100655 a = posix_memalign((void **) &syn_window_hidden, 128,
656 window_hidden_size * window_layer_size * sizeof(real));
657 if (syn_window_hidden == NULL) {
658 printf("Memory allocation failed\n");
659 exit(1);
660 }
661 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
662 next_random = next_random * (unsigned long long) 25214903917 + 11;
663 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
664 - 0.5) / (window_hidden_size * window_layer_size);
665 }
666 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100667
668 if (read_net_file[0] == 0) {
669 for (a = 0; a < vocab_size; a++)
670 for (b = 0; b < layer1_size; b++) {
671 next_random = next_random * (unsigned long long) 25214903917
672 + 11;
673 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
674 / (real) 65536) - 0.5) / layer1_size;
675 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100676 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100677 FILE *fnet = fopen(read_net_file, "rb");
678 if (fnet == NULL) {
679 printf("Net parameter file not found\n");
680 exit(1);
681 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100682 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
683 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
684 if(read != vocab_size * layer1_size) {
685 fprintf(stderr, "read-net failed %lld\n", read);
686 exit(-1);
687 }
688 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
689 if(read != (long long) vocab_size * window_layer_size) {
690 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
691 (long long) sizeof(real) * vocab_size * window_layer_size);
692 exit(-1);
693 }
694 fgetc(fnet);
695 if(!feof(fnet)) {
696 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
697 exit(-1);
698 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100699 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100700 } else {
701 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
702 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100703 }
704
705 CreateBinaryTree();
706}
707
Marc Kupietz202723e2016-07-14 09:12:00 +0200708char *currentDateTime(char *buf, real offset) {
709 time_t t;
710 time(&t);
711 t += (long) offset;
712 struct tm tstruct;
713 tstruct = *localtime(&t);
714 strftime(buf, 80, "%c", &tstruct);
715 return buf;
716}
717
718void *MonitorThread(void *id) {
719 char *timebuf = malloc(80);;
720 int i, n=num_threads;
721 long long sum;
722 sleep(1);
723 while(n > 0) {
724 sleep(1);
725 sum = n = 0;
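// Progress estimate per thread: completed iterations count as a full file slice, the
// running iteration contributes the current byte offset within the thread's slice.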
726 for(i=0; i < num_threads; i++) {
727 if(threadPos[i] >= 0) {
728 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
729 n++;
730 } else {
731 sum += iter * file_size / num_threads;
732 }
733 }
734 if(n == 0)
735 break;
736 real finished_portion = (real) sum / (float) (file_size * iter);
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100737 long long now = time(NULL);
738 long long elapsed = (now - start);
739 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
Marc Kupietz202723e2016-07-14 09:12:00 +0200740
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100741 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
Marc Kupietz202723e2016-07-14 09:12:00 +0200742 alpha,
743 finished_portion * 100,
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100744 (float) sum / elapsed / 1000,
Marc Kupietz202723e2016-07-14 09:12:00 +0200745 elapsed,
746 ttg,
747 currentDateTime(timebuf, ttg)
748 );
749 fflush(stdout);
750 }
751 pthread_exit(NULL);
752}
753
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100754void *TrainModelThread(void *id) {
755 long long a, b, d, cw, word, last_word, sentence_length = 0,
756 sentence_position = 0;
757 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
758 long long l1, l2, c, target, label, local_iter = iter;
759 unsigned long long next_random = (long long) id;
760 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100761 int input_len_1 = layer1_size;
762 int window_offset = -1;
763 if (type == 2 || type == 4) {
764 input_len_1 = window_layer_size;
765 }
766 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
767 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200768 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100769
770 int input_len_2 = 0;
771 if (type == 4) {
772 input_len_2 = window_hidden_size;
773 }
774 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
775 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
776
777 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200778 long long start_pos = file_size / (long long) num_threads * (long long) id;
779 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
780 long long current_pos = start_pos;
 781 long long last_pos = start_pos;
782 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100783 while (1) {
Marc Kupietz202723e2016-07-14 09:12:00 +0200784 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100785 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200786 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100787 last_word_count = word_count;
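// Linear learning-rate decay: alpha shrinks with the fraction of words processed over
// all iterations, but never drops below 0.01% of the starting value.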
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100788 alpha = starting_alpha
789 * (1 - word_count_actual / (real) (iter * train_words + 1));
790 if (alpha < starting_alpha * 0.0001)
791 alpha = starting_alpha * 0.0001;
792 }
793 if (sentence_length == 0) {
794 while (1) {
795 word = ReadWordIndex(fi);
796 if (feof(fi))
797 break;
798 if (word == -1)
799 continue;
800 word_count++;
801 if (word == 0)
802 break;
803 // The subsampling randomly discards frequent words while keeping the ranking same
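// A word is kept with probability (sqrt(cn/(sample*train_words)) + 1) * (sample*train_words)/cn,
// i.e. rare words are always kept and very frequent words are aggressively down-sampled.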
804 if (sample > 0) {
805 real ran = (sqrt(vocab[word].cn / (sample * train_words))
806 + 1) * (sample * train_words) / vocab[word].cn;
807 next_random = next_random * (unsigned long long) 25214903917
808 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100809 if (ran < (next_random & 0xFFFF) / (real) 65536) {
810 if(type == 3) // in structured skipgrams
811 word = -2; // keep the window position correct
812 else
813 continue;
814 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100815 }
816 sen[sentence_length] = word;
817 sentence_length++;
818 if (sentence_length >= MAX_SENTENCE_LENGTH)
819 break;
820 }
821 sentence_position = 0;
822 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200823 current_pos = threadPos[(long) id] = ftell(fi);
824 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100825 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200826 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100827 local_iter--;
828 if (local_iter == 0)
829 break;
Marc Kupietze423f732017-12-22 17:57:03 +0100830 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
831 printf("Magic stop file %s found. Stopping traing ...\n", magic_stop_file);
832 break;
833 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100834 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200835 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100836 last_word_count = 0;
837 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200838 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100839 continue;
840 }
841 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200842 while (word == -2 && sentence_position<sentence_length)
843 word = sen[++sentence_position];
844 if (sentence_position>=sentence_length) {
845 sentence_length=0;
846 continue;
847 }
848 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100849 continue;
850 for (c = 0; c < input_len_1; c++)
851 neu1[c] = 0;
852 for (c = 0; c < input_len_1; c++)
853 neu1e[c] = 0;
854 for (c = 0; c < input_len_2; c++)
855 neu2[c] = 0;
856 for (c = 0; c < input_len_2; c++)
857 neu2e[c] = 0;
858 next_random = next_random * (unsigned long long) 25214903917 + 11;
859 b = next_random % window;
860 if (type == 0) { //train the cbow architecture
861 // in -> hidden
862 cw = 0;
863 for (a = b; a < window * 2 + 1 - b; a++)
864 if (a != window) {
865 c = sentence_position - window + a;
866 if (c < 0)
867 continue;
868 if (c >= sentence_length)
869 continue;
870 last_word = sen[c];
871 if (last_word == -1)
872 continue;
873 for (c = 0; c < layer1_size; c++)
874 neu1[c] += syn0[c + last_word * layer1_size];
875 cw++;
876 }
877 if (cw) {
878 for (c = 0; c < layer1_size; c++)
879 neu1[c] /= cw;
880 if (hs)
881 for (d = 0; d < vocab[word].codelen; d++) {
882 f = 0;
883 l2 = vocab[word].point[d] * layer1_size;
884 // Propagate hidden -> output
885 for (c = 0; c < layer1_size; c++)
886 f += neu1[c] * syn1[c + l2];
887 if (f <= -MAX_EXP)
888 continue;
889 else if (f >= MAX_EXP)
890 continue;
891 else
892 f = expTable[(int) ((f + MAX_EXP)
893 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
894 // 'g' is the gradient multiplied by the learning rate
895 g = (1 - vocab[word].code[d] - f) * alpha;
896 // Propagate errors output -> hidden
897 for (c = 0; c < layer1_size; c++)
898 neu1e[c] += g * syn1[c + l2];
899 // Learn weights hidden -> output
900 for (c = 0; c < layer1_size; c++)
901 syn1[c + l2] += g * neu1[c];
902 if (cap == 1)
903 for (c = 0; c < layer1_size; c++)
904 capParam(syn1, c + l2);
905 }
906 // NEGATIVE SAMPLING
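// d == 0 is the positive (observed) target; the remaining 'negative' draws come from the
// unigram table (or the per-class table if negative_classes_file is given), never
// index 0 (</s>) and never the observed word itself.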
907 if (negative > 0)
908 for (d = 0; d < negative + 1; d++) {
909 if (d == 0) {
910 target = word;
911 label = 1;
912 } else {
913 next_random = next_random
914 * (unsigned long long) 25214903917 + 11;
915 if (word_to_group != NULL
916 && word_to_group[word] != -1) {
917 target = word;
918 while (target == word) {
919 target = group_to_table[word_to_group[word]
920 * table_size
921 + (next_random >> 16) % table_size];
922 next_random = next_random
923 * (unsigned long long) 25214903917
924 + 11;
925 }
926 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
927 } else {
928 target =
929 table[(next_random >> 16) % table_size];
930 }
931 if (target == 0)
932 target = next_random % (vocab_size - 1) + 1;
933 if (target == word)
934 continue;
935 label = 0;
936 }
937 l2 = target * layer1_size;
938 f = 0;
939 for (c = 0; c < layer1_size; c++)
940 f += neu1[c] * syn1neg[c + l2];
941 if (f > MAX_EXP)
942 g = (label - 1) * alpha;
943 else if (f < -MAX_EXP)
944 g = (label - 0) * alpha;
945 else
946 g = (label
947 - expTable[(int) ((f + MAX_EXP)
948 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
949 * alpha;
950 for (c = 0; c < layer1_size; c++)
951 neu1e[c] += g * syn1neg[c + l2];
952 for (c = 0; c < layer1_size; c++)
953 syn1neg[c + l2] += g * neu1[c];
954 if (cap == 1)
955 for (c = 0; c < layer1_size; c++)
956 capParam(syn1neg, c + l2);
957 }
958 // Noise Contrastive Estimation
959 if (nce > 0)
960 for (d = 0; d < nce + 1; d++) {
961 if (d == 0) {
962 target = word;
963 label = 1;
964 } else {
965 next_random = next_random
966 * (unsigned long long) 25214903917 + 11;
967 if (word_to_group != NULL
968 && word_to_group[word] != -1) {
969 target = word;
970 while (target == word) {
971 target = group_to_table[word_to_group[word]
972 * table_size
973 + (next_random >> 16) % table_size];
974 next_random = next_random
975 * (unsigned long long) 25214903917
976 + 11;
977 }
978 } else {
979 target =
980 table[(next_random >> 16) % table_size];
981 }
982 if (target == 0)
983 target = next_random % (vocab_size - 1) + 1;
984 if (target == word)
985 continue;
986 label = 0;
987 }
988 l2 = target * layer1_size;
989 f = 0;
990
991 for (c = 0; c < layer1_size; c++)
992 f += neu1[c] * syn1nce[c + l2];
993 if (f > MAX_EXP)
994 g = (label - 1) * alpha;
995 else if (f < -MAX_EXP)
996 g = (label - 0) * alpha;
997 else {
998 f = exp(f);
999 g =
1000 (label
1001 - f
1002 / (noise_distribution[target]
1003 * nce + f)) * alpha;
1004 }
1005 for (c = 0; c < layer1_size; c++)
1006 neu1e[c] += g * syn1nce[c + l2];
1007 for (c = 0; c < layer1_size; c++)
1008 syn1nce[c + l2] += g * neu1[c];
1009 if (cap == 1)
1010 for (c = 0; c < layer1_size; c++)
1011 capParam(syn1nce, c + l2);
1012 }
1013 // hidden -> in
1014 for (a = b; a < window * 2 + 1 - b; a++)
1015 if (a != window) {
1016 c = sentence_position - window + a;
1017 if (c < 0)
1018 continue;
1019 if (c >= sentence_length)
1020 continue;
1021 last_word = sen[c];
1022 if (last_word == -1)
1023 continue;
1024 for (c = 0; c < layer1_size; c++)
1025 syn0[c + last_word * layer1_size] += neu1e[c];
1026 }
1027 }
1028 } else if (type == 1) { //train skip-gram
1029 for (a = b; a < window * 2 + 1 - b; a++)
1030 if (a != window) {
1031 c = sentence_position - window + a;
1032 if (c < 0)
1033 continue;
1034 if (c >= sentence_length)
1035 continue;
1036 last_word = sen[c];
1037 if (last_word == -1)
1038 continue;
1039 l1 = last_word * layer1_size;
1040 for (c = 0; c < layer1_size; c++)
1041 neu1e[c] = 0;
1042 // HIERARCHICAL SOFTMAX
1043 if (hs)
1044 for (d = 0; d < vocab[word].codelen; d++) {
1045 f = 0;
1046 l2 = vocab[word].point[d] * layer1_size;
1047 // Propagate hidden -> output
1048 for (c = 0; c < layer1_size; c++)
1049 f += syn0[c + l1] * syn1[c + l2];
1050 if (f <= -MAX_EXP)
1051 continue;
1052 else if (f >= MAX_EXP)
1053 continue;
1054 else
1055 f = expTable[(int) ((f + MAX_EXP)
1056 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1057 // 'g' is the gradient multiplied by the learning rate
1058 g = (1 - vocab[word].code[d] - f) * alpha;
1059 // Propagate errors output -> hidden
1060 for (c = 0; c < layer1_size; c++)
1061 neu1e[c] += g * syn1[c + l2];
1062 // Learn weights hidden -> output
1063 for (c = 0; c < layer1_size; c++)
1064 syn1[c + l2] += g * syn0[c + l1];
1065 if (cap == 1)
1066 for (c = 0; c < layer1_size; c++)
1067 capParam(syn1, c + l2);
1068 }
1069 // NEGATIVE SAMPLING
1070 if (negative > 0)
1071 for (d = 0; d < negative + 1; d++) {
1072 if (d == 0) {
1073 target = word;
1074 label = 1;
1075 } else {
1076 next_random = next_random
1077 * (unsigned long long) 25214903917 + 11;
1078 if (word_to_group != NULL
1079 && word_to_group[word] != -1) {
1080 target = word;
1081 while (target == word) {
1082 target =
1083 group_to_table[word_to_group[word]
1084 * table_size
1085 + (next_random >> 16)
1086 % table_size];
1087 next_random =
1088 next_random
1089 * (unsigned long long) 25214903917
1090 + 11;
1091 }
1092 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1093 } else {
1094 target = table[(next_random >> 16)
1095 % table_size];
1096 }
1097 if (target == 0)
1098 target = next_random % (vocab_size - 1) + 1;
1099 if (target == word)
1100 continue;
1101 label = 0;
1102 }
1103 l2 = target * layer1_size;
1104 f = 0;
1105 for (c = 0; c < layer1_size; c++)
1106 f += syn0[c + l1] * syn1neg[c + l2];
1107 if (f > MAX_EXP)
1108 g = (label - 1) * alpha;
1109 else if (f < -MAX_EXP)
1110 g = (label - 0) * alpha;
1111 else
1112 g =
1113 (label
1114 - expTable[(int) ((f + MAX_EXP)
1115 * (EXP_TABLE_SIZE
1116 / MAX_EXP / 2))])
1117 * alpha;
1118 for (c = 0; c < layer1_size; c++)
1119 neu1e[c] += g * syn1neg[c + l2];
1120 for (c = 0; c < layer1_size; c++)
1121 syn1neg[c + l2] += g * syn0[c + l1];
1122 if (cap == 1)
1123 for (c = 0; c < layer1_size; c++)
1124 capParam(syn1neg, c + l2);
1125 }
1126 //Noise Contrastive Estimation
1127 if (nce > 0)
1128 for (d = 0; d < nce + 1; d++) {
1129 if (d == 0) {
1130 target = word;
1131 label = 1;
1132 } else {
1133 next_random = next_random
1134 * (unsigned long long) 25214903917 + 11;
1135 if (word_to_group != NULL
1136 && word_to_group[word] != -1) {
1137 target = word;
1138 while (target == word) {
1139 target =
1140 group_to_table[word_to_group[word]
1141 * table_size
1142 + (next_random >> 16)
1143 % table_size];
1144 next_random =
1145 next_random
1146 * (unsigned long long) 25214903917
1147 + 11;
1148 }
1149 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1150 } else {
1151 target = table[(next_random >> 16)
1152 % table_size];
1153 }
1154 if (target == 0)
1155 target = next_random % (vocab_size - 1) + 1;
1156 if (target == word)
1157 continue;
1158 label = 0;
1159 }
1160 l2 = target * layer1_size;
1161 f = 0;
1162 for (c = 0; c < layer1_size; c++)
1163 f += syn0[c + l1] * syn1nce[c + l2];
1164 if (f > MAX_EXP)
1165 g = (label - 1) * alpha;
1166 else if (f < -MAX_EXP)
1167 g = (label - 0) * alpha;
1168 else {
1169 f = exp(f);
1170 g = (label
1171 - f
1172 / (noise_distribution[target]
1173 * nce + f)) * alpha;
1174 }
1175 for (c = 0; c < layer1_size; c++)
1176 neu1e[c] += g * syn1nce[c + l2];
1177 for (c = 0; c < layer1_size; c++)
1178 syn1nce[c + l2] += g * syn0[c + l1];
1179 if (cap == 1)
1180 for (c = 0; c < layer1_size; c++)
1181 capParam(syn1nce, c + l2);
1182 }
1183 // Learn weights input -> hidden
1184 for (c = 0; c < layer1_size; c++)
1185 syn0[c + l1] += neu1e[c];
1186 }
1187 } else if (type == 2) { //train the cwindow architecture
1188 // in -> hidden
1189 cw = 0;
1190 for (a = 0; a < window * 2 + 1; a++)
1191 if (a != window) {
1192 c = sentence_position - window + a;
1193 if (c < 0)
1194 continue;
1195 if (c >= sentence_length)
1196 continue;
1197 last_word = sen[c];
1198 if (last_word == -1)
1199 continue;
1200 window_offset = a * layer1_size;
1201 if (a > window)
1202 window_offset -= layer1_size;
1203 for (c = 0; c < layer1_size; c++)
1204 neu1[c + window_offset] += syn0[c
1205 + last_word * layer1_size];
1206 cw++;
1207 }
1208 if (cw) {
1209 if (hs)
1210 for (d = 0; d < vocab[word].codelen; d++) {
1211 f = 0;
1212 l2 = vocab[word].point[d] * window_layer_size;
1213 // Propagate hidden -> output
1214 for (c = 0; c < window_layer_size; c++)
1215 f += neu1[c] * syn1_window[c + l2];
1216 if (f <= -MAX_EXP)
1217 continue;
1218 else if (f >= MAX_EXP)
1219 continue;
1220 else
1221 f = expTable[(int) ((f + MAX_EXP)
1222 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1223 // 'g' is the gradient multiplied by the learning rate
1224 g = (1 - vocab[word].code[d] - f) * alpha;
1225 // Propagate errors output -> hidden
1226 for (c = 0; c < window_layer_size; c++)
1227 neu1e[c] += g * syn1_window[c + l2];
1228 // Learn weights hidden -> output
1229 for (c = 0; c < window_layer_size; c++)
1230 syn1_window[c + l2] += g * neu1[c];
1231 if (cap == 1)
1232 for (c = 0; c < window_layer_size; c++)
1233 capParam(syn1_window, c + l2);
1234 }
1235 // NEGATIVE SAMPLING
1236 if (negative > 0)
1237 for (d = 0; d < negative + 1; d++) {
1238 if (d == 0) {
1239 target = word;
1240 label = 1;
1241 } else {
1242 next_random = next_random
1243 * (unsigned long long) 25214903917 + 11;
1244 if (word_to_group != NULL
1245 && word_to_group[word] != -1) {
1246 target = word;
1247 while (target == word) {
1248 target = group_to_table[word_to_group[word]
1249 * table_size
1250 + (next_random >> 16) % table_size];
1251 next_random = next_random
1252 * (unsigned long long) 25214903917
1253 + 11;
1254 }
1255 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1256 } else {
1257 target =
1258 table[(next_random >> 16) % table_size];
1259 }
1260 if (target == 0)
1261 target = next_random % (vocab_size - 1) + 1;
1262 if (target == word)
1263 continue;
1264 label = 0;
1265 }
1266 l2 = target * window_layer_size;
1267 f = 0;
1268 for (c = 0; c < window_layer_size; c++)
1269 f += neu1[c] * syn1neg_window[c + l2];
1270 if (f > MAX_EXP)
1271 g = (label - 1) * alpha;
1272 else if (f < -MAX_EXP)
1273 g = (label - 0) * alpha;
1274 else
1275 g = (label
1276 - expTable[(int) ((f + MAX_EXP)
1277 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1278 * alpha;
1279 for (c = 0; c < window_layer_size; c++)
1280 neu1e[c] += g * syn1neg_window[c + l2];
1281 for (c = 0; c < window_layer_size; c++)
1282 syn1neg_window[c + l2] += g * neu1[c];
1283 if (cap == 1)
1284 for (c = 0; c < window_layer_size; c++)
1285 capParam(syn1neg_window, c + l2);
1286 }
1287 // Noise Contrastive Estimation
1288 if (nce > 0)
1289 for (d = 0; d < nce + 1; d++) {
1290 if (d == 0) {
1291 target = word;
1292 label = 1;
1293 } else {
1294 next_random = next_random
1295 * (unsigned long long) 25214903917 + 11;
1296 if (word_to_group != NULL
1297 && word_to_group[word] != -1) {
1298 target = word;
1299 while (target == word) {
1300 target = group_to_table[word_to_group[word]
1301 * table_size
1302 + (next_random >> 16) % table_size];
1303 next_random = next_random
1304 * (unsigned long long) 25214903917
1305 + 11;
1306 }
1307 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1308 } else {
1309 target =
1310 table[(next_random >> 16) % table_size];
1311 }
1312 if (target == 0)
1313 target = next_random % (vocab_size - 1) + 1;
1314 if (target == word)
1315 continue;
1316 label = 0;
1317 }
1318 l2 = target * window_layer_size;
1319 f = 0;
1320 for (c = 0; c < window_layer_size; c++)
1321 f += neu1[c] * syn1nce_window[c + l2];
1322 if (f > MAX_EXP)
1323 g = (label - 1) * alpha;
1324 else if (f < -MAX_EXP)
1325 g = (label - 0) * alpha;
1326 else {
1327 f = exp(f);
1328 g =
1329 (label
1330 - f
1331 / (noise_distribution[target]
1332 * nce + f)) * alpha;
1333 }
1334 for (c = 0; c < window_layer_size; c++)
1335 neu1e[c] += g * syn1nce_window[c + l2];
1336 for (c = 0; c < window_layer_size; c++)
1337 syn1nce_window[c + l2] += g * neu1[c];
1338 if (cap == 1)
1339 for (c = 0; c < window_layer_size; c++)
1340 capParam(syn1nce_window, c + l2);
1341 }
1342 // hidden -> in
1343 for (a = 0; a < window * 2 + 1; a++)
1344 if (a != window) {
1345 c = sentence_position - window + a;
1346 if (c < 0)
1347 continue;
1348 if (c >= sentence_length)
1349 continue;
1350 last_word = sen[c];
1351 if (last_word == -1)
1352 continue;
1353 window_offset = a * layer1_size;
1354 if (a > window)
1355 window_offset -= layer1_size;
1356 for (c = 0; c < layer1_size; c++)
1357 syn0[c + last_word * layer1_size] += neu1e[c
1358 + window_offset];
1359 }
1360 }
1361 } else if (type == 3) { //train structured skip-gram
1362 for (a = 0; a < window * 2 + 1; a++)
1363 if (a != window) {
1364 c = sentence_position - window + a;
1365 if (c < 0)
1366 continue;
1367 if (c >= sentence_length)
1368 continue;
1369 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001370 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001371 continue;
1372 l1 = last_word * layer1_size;
1373 window_offset = a * layer1_size;
1374 if (a > window)
1375 window_offset -= layer1_size;
1376 for (c = 0; c < layer1_size; c++)
1377 neu1e[c] = 0;
1378 // HIERARCHICAL SOFTMAX
1379 if (hs)
1380 for (d = 0; d < vocab[word].codelen; d++) {
1381 f = 0;
1382 l2 = vocab[word].point[d] * window_layer_size;
1383 // Propagate hidden -> output
1384 for (c = 0; c < layer1_size; c++)
1385 f += syn0[c + l1]
1386 * syn1_window[c + l2 + window_offset];
1387 if (f <= -MAX_EXP)
1388 continue;
1389 else if (f >= MAX_EXP)
1390 continue;
1391 else
1392 f = expTable[(int) ((f + MAX_EXP)
1393 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1394 // 'g' is the gradient multiplied by the learning rate
1395 g = (1 - vocab[word].code[d] - f) * alpha;
1396 // Propagate errors output -> hidden
1397 for (c = 0; c < layer1_size; c++)
1398 neu1e[c] += g
1399 * syn1_window[c + l2 + window_offset];
1400 // Learn weights hidden -> output
1401 for (c = 0; c < layer1_size; c++)
 1402 syn1_window[c + l2 + window_offset] += g
1403 * syn0[c + l1];
1404 if (cap == 1)
1405 for (c = 0; c < layer1_size; c++)
 1406 capParam(syn1_window, c + l2 + window_offset);
1407 }
1408 // NEGATIVE SAMPLING
1409 if (negative > 0)
1410 for (d = 0; d < negative + 1; d++) {
1411 if (d == 0) {
1412 target = word;
1413 label = 1;
1414 } else {
1415 next_random = next_random
1416 * (unsigned long long) 25214903917 + 11;
1417 if (word_to_group != NULL
1418 && word_to_group[word] != -1) {
1419 target = word;
1420 while (target == word) {
1421 target =
1422 group_to_table[word_to_group[word]
1423 * table_size
1424 + (next_random >> 16)
1425 % table_size];
1426 next_random =
1427 next_random
1428 * (unsigned long long) 25214903917
1429 + 11;
1430 }
1431 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1432 } else {
1433 target = table[(next_random >> 16)
1434 % table_size];
1435 }
1436 if (target == 0)
1437 target = next_random % (vocab_size - 1) + 1;
1438 if (target == word)
1439 continue;
1440 label = 0;
1441 }
1442 l2 = target * window_layer_size;
1443 f = 0;
1444 for (c = 0; c < layer1_size; c++)
1445 f +=
1446 syn0[c + l1]
1447 * syn1neg_window[c + l2
1448 + window_offset];
1449 if (f > MAX_EXP)
1450 g = (label - 1) * alpha;
1451 else if (f < -MAX_EXP)
1452 g = (label - 0) * alpha;
1453 else
1454 g =
1455 (label
1456 - expTable[(int) ((f + MAX_EXP)
1457 * (EXP_TABLE_SIZE
1458 / MAX_EXP / 2))])
1459 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001460 if(debug_mode > 2 && ((long long) id) == 0) {
1461 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1462 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1463 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001464 for (c = 0; c < layer1_size; c++)
1465 neu1e[c] +=
1466 g
1467 * syn1neg_window[c + l2
1468 + window_offset];
1469 for (c = 0; c < layer1_size; c++)
1470 syn1neg_window[c + l2 + window_offset] += g
1471 * syn0[c + l1];
1472 if (cap == 1)
1473 for (c = 0; c < layer1_size; c++)
1474 capParam(syn1neg_window,
1475 c + l2 + window_offset);
1476 }
 1477 // Noise Contrastive Estimation
1478 if (nce > 0)
1479 for (d = 0; d < nce + 1; d++) {
1480 if (d == 0) {
1481 target = word;
1482 label = 1;
1483 } else {
1484 next_random = next_random
1485 * (unsigned long long) 25214903917 + 11;
1486 if (word_to_group != NULL
1487 && word_to_group[word] != -1) {
1488 target = word;
1489 while (target == word) {
1490 target =
1491 group_to_table[word_to_group[word]
1492 * table_size
1493 + (next_random >> 16)
1494 % table_size];
1495 next_random =
1496 next_random
1497 * (unsigned long long) 25214903917
1498 + 11;
1499 }
1500 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1501 } else {
1502 target = table[(next_random >> 16)
1503 % table_size];
1504 }
1505 if (target == 0)
1506 target = next_random % (vocab_size - 1) + 1;
1507 if (target == word)
1508 continue;
1509 label = 0;
1510 }
1511 l2 = target * window_layer_size;
1512 f = 0;
1513 for (c = 0; c < layer1_size; c++)
1514 f +=
1515 syn0[c + l1]
1516 * syn1nce_window[c + l2
1517 + window_offset];
1518 if (f > MAX_EXP)
1519 g = (label - 1) * alpha;
1520 else if (f < -MAX_EXP)
1521 g = (label - 0) * alpha;
1522 else {
1523 f = exp(f);
1524 g = (label
1525 - f
1526 / (noise_distribution[target]
1527 * nce + f)) * alpha;
1528 }
1529 for (c = 0; c < layer1_size; c++)
1530 neu1e[c] +=
1531 g
1532 * syn1nce_window[c + l2
1533 + window_offset];
1534 for (c = 0; c < layer1_size; c++)
1535 syn1nce_window[c + l2 + window_offset] += g
1536 * syn0[c + l1];
1537 if (cap == 1)
1538 for (c = 0; c < layer1_size; c++)
1539 capParam(syn1nce_window,
1540 c + l2 + window_offset);
1541 }
1542 // Learn weights input -> hidden
1543 for (c = 0; c < layer1_size; c++) {
1544 syn0[c + l1] += neu1e[c];
1545 if (syn0[c + l1] > 50)
1546 syn0[c + l1] = 50;
1547 if (syn0[c + l1] < -50)
1548 syn0[c + l1] = -50;
1549 }
1550 }
1551 } else if (type == 4) { //training senna
1552 // in -> hidden
1553 cw = 0;
1554 for (a = 0; a < window * 2 + 1; a++)
1555 if (a != window) {
1556 c = sentence_position - window + a;
1557 if (c < 0)
1558 continue;
1559 if (c >= sentence_length)
1560 continue;
1561 last_word = sen[c];
1562 if (last_word == -1)
1563 continue;
1564 window_offset = a * layer1_size;
1565 if (a > window)
1566 window_offset -= layer1_size;
1567 for (c = 0; c < layer1_size; c++)
1568 neu1[c + window_offset] += syn0[c
1569 + last_word * layer1_size];
1570 cw++;
1571 }
1572 if (cw) {
1573 for (a = 0; a < window_hidden_size; a++) {
1574 c = a * window_layer_size;
1575 for (b = 0; b < window_layer_size; b++) {
1576 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1577 }
1578 }
1579 if (hs)
1580 for (d = 0; d < vocab[word].codelen; d++) {
1581 f = 0;
1582 l2 = vocab[word].point[d] * window_hidden_size;
1583 // Propagate hidden -> output
1584 for (c = 0; c < window_hidden_size; c++)
1585 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1586 if (f <= -MAX_EXP)
1587 continue;
1588 else if (f >= MAX_EXP)
1589 continue;
1590 else
1591 f = expTable[(int) ((f + MAX_EXP)
1592 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1593 // 'g' is the gradient multiplied by the learning rate
1594 g = (1 - vocab[word].code[d] - f) * alpha;
1595 // Propagate errors output -> hidden
1596 for (c = 0; c < window_hidden_size; c++)
1597 neu2e[c] += dHardTanh(neu2[c], g) * g
1598 * syn_hidden_word[c + l2];
1599 // Learn weights hidden -> output
1600 for (c = 0; c < window_hidden_size; c++)
1601 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1602 * neu2[c];
1603 }
1604 // NEGATIVE SAMPLING
1605 if (negative > 0)
1606 for (d = 0; d < negative + 1; d++) {
1607 if (d == 0) {
1608 target = word;
1609 label = 1;
1610 } else {
1611 next_random = next_random
1612 * (unsigned long long) 25214903917 + 11;
1613 if (word_to_group != NULL
1614 && word_to_group[word] != -1) {
1615 target = word;
1616 while (target == word) {
1617 target = group_to_table[word_to_group[word]
1618 * table_size
1619 + (next_random >> 16) % table_size];
1620 next_random = next_random
1621 * (unsigned long long) 25214903917
1622 + 11;
1623 }
1624 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1625 } else {
1626 target =
1627 table[(next_random >> 16) % table_size];
1628 }
1629 if (target == 0)
1630 target = next_random % (vocab_size - 1) + 1;
1631 if (target == word)
1632 continue;
1633 label = 0;
1634 }
1635 l2 = target * window_hidden_size;
1636 f = 0;
1637 for (c = 0; c < window_hidden_size; c++)
1638 f += hardTanh(neu2[c])
1639 * syn_hidden_word_neg[c + l2];
1640 if (f > MAX_EXP)
1641 g = (label - 1) * alpha / negative;
1642 else if (f < -MAX_EXP)
1643 g = (label - 0) * alpha / negative;
1644 else
1645 g = (label
1646 - expTable[(int) ((f + MAX_EXP)
1647 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1648 * alpha / negative;
1649 for (c = 0; c < window_hidden_size; c++)
1650 neu2e[c] += dHardTanh(neu2[c], g) * g
1651 * syn_hidden_word_neg[c + l2];
1652 for (c = 0; c < window_hidden_size; c++)
1653 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1654 * g * neu2[c];
1655 }
1656 for (a = 0; a < window_hidden_size; a++)
1657 for (b = 0; b < window_layer_size; b++)
1658 neu1e[b] += neu2e[a]
1659 * syn_window_hidden[a * window_layer_size + b];
1660 for (a = 0; a < window_hidden_size; a++)
1661 for (b = 0; b < window_layer_size; b++)
1662 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1663 * neu1[b];
1664 // hidden -> in
1665 for (a = 0; a < window * 2 + 1; a++)
1666 if (a != window) {
1667 c = sentence_position - window + a;
1668 if (c < 0)
1669 continue;
1670 if (c >= sentence_length)
1671 continue;
1672 last_word = sen[c];
1673 if (last_word == -1)
1674 continue;
1675 window_offset = a * layer1_size;
1676 if (a > window)
1677 window_offset -= layer1_size;
1678 for (c = 0; c < layer1_size; c++)
1679 syn0[c + last_word * layer1_size] += neu1e[c
1680 + window_offset];
1681 }
1682 }
Marc Kupietz613edbf2018-01-11 21:38:03 +01001683 } else if(type == 5) {
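// type 5: no vectors are trained; every (word, context word, relative position) triple
// within the window is just recorded in the collocator database via inc_collocator().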
1684 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1685 c = sentence_position - window + a;
1686 if (c < 0) continue;
1687 if (c >= sentence_length) continue;
1688 last_word = sen[c];
1689 if (last_word == -1) continue;
1690 inc_collocator(cdb, word, last_word, a - window);
1691 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1692 // cw++;
1693 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001694 } else {
1695 printf("unknown type %i", type);
1696 exit(0);
1697 }
1698 sentence_position++;
1699 if (sentence_position >= sentence_length) {
1700 sentence_length = 0;
1701 continue;
1702 }
1703 }
1704 fclose(fi);
1705 free(neu1);
1706 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001707 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001708 pthread_exit(NULL);
1709}
1710
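// Diagnostic dump for the structured skip-gram weights: for every vocabulary word
// starting at index cc, print the most strongly predicted collocate for each window
// position, the collocate with the highest accumulated association across positions,
// and the N strongest (collocate, position) pairs overall.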
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001711void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001712 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001713 real f, max_f, maxmax_f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001714 real *target_sums, bestf[MAX_CC], worstbest;
1715 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001716 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001717 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1718
1719 for (d = cc; d < vocab_size; d++) {
1720 for (b = 0; b < vocab_size; b++)
1721 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001722 for (b = 0; b < N; b++)
1723 bestf[b]=-1;
1724 worstbest = -1;
1725
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001726 maxmax_f = -1;
1727 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001728 for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001729 if (a != window) {
1730 max_f = -1;
1731 window_offset = a * layer1_size;
1732 if (a > window)
1733 window_offset -= layer1_size;
1734 for(target = 0; target < vocab_size; target ++) {
1735 if(target == d)
1736 continue;
1737 f = 0;
1738 for (c = 0; c < layer1_size; c++)
1739 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1740 if (f < -MAX_EXP)
1741 continue;
1742 else if (f > MAX_EXP)
1743 continue;
1744 else
1745 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1746 if(f > max_f) {
1747 max_f = f;
1748 max_target = target;
1749 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001750 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001751 if(f > worstbest) {
1752 for (b = 0; b < N; b++) {
1753 if (f > bestf[b]) {
1754 for (e = N - 1; e > b; e--) {
1755 bestf[e] = bestf[e - 1];
1756 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001757 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001758 }
1759 bestf[b] = f;
1760 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001761 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001762 break;
1763 }
1764 }
1765 worstbest = bestf[N-1];
1766 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001767 }
1768 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1769 if(max_f > maxmax_f) {
1770 maxmax_f = max_f;
1771 maxmax_target = max_target;
1772 }
1773 } else {
1774 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1775 }
1776 }
1777 max_f = -1;
1778 for (b = 0; b < vocab_size; b++) {
1779 if(target_sums[b] > max_f) {
1780 max_f = target_sums[b];
1781 max_target = b;
1782 }
1783 }
1784 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001785 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001786 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001787 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001788 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001789 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001790 }
1791}
1792
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001793void TrainModel() {
1794 long a, b, c, d;
1795 FILE *fo;
1796 pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // one extra slot for the monitor thread
Marc Kupietz202723e2016-07-14 09:12:00 +02001797 threadPos = malloc(num_threads * sizeof(long long));
1798 threadIters = malloc(num_threads * sizeof(int));
1799 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001800 printf("Starting training using file %s\n", train_file);
1801 starting_alpha = alpha;
1802 if (read_vocab_file[0] != 0)
1803 ReadVocab();
1804 else
1805 LearnVocabFromTrainFile();
1806 if (save_vocab_file[0] != 0)
1807 SaveVocab();
1808 if (output_file[0] == 0)
1809 return;
1810 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001811 if(cc > 0)
1812 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001813 if (negative > 0 || nce > 0)
1814 InitUnigramTable();
1815 if (negative_classes_file[0] != 0)
1816 InitClassUnigramTable();
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001817 start = time(NULL);
1818 start_clock = clock();
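 // launch one worker thread per -threads; with debug output enabled an additional
 // monitor thread in slot num_threads reports training progress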
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001819 for (a = 0; a < num_threads; a++)
1820 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
Marc Kupietz202723e2016-07-14 09:12:00 +02001821 if(debug_mode > 1)
1822 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001823 for (a = 0; a < num_threads; a++)
1824 pthread_join(pt[a], NULL);
Marc Kupietz202723e2016-07-14 09:12:00 +02001825 if(debug_mode > 1) {
1826 pthread_join(pt[num_threads], NULL);
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001827 clock_t now = time(NULL);
1828 clock_t now_clock = clock();
1829 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
Marc Kupietz613edbf2018-01-11 21:38:03 +01001830 if(type == 5) // don't save vectors for classic collocators
1831 return;
Marc Kupietz202723e2016-07-14 09:12:00 +02001832 printf("Saving vectors to %s ...", output_file);
1833 fflush(stdout);
1834 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001835 fo = fopen(output_file, "wb");
1836 if (classes == 0) {
1837 // Save the word vectors
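 // header line: vocabulary size and vector dimensionality; then one row per word,
 // with the vector written as raw floats when -binary 1 and as text otherwise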
1838 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1839 for (a = 0; a < vocab_size; a++) {
1840 fprintf(fo, "%s ", vocab[a].word);
1841 if (binary)
1842 for (b = 0; b < layer1_size; b++)
1843 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1844 else
1845 for (b = 0; b < layer1_size; b++)
1846 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1847 fprintf(fo, "\n");
1848 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001849 if(debug_mode > 1)
1850 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001851 } else {
1852 // Run K-means on the word vectors
1853 int clcn = classes, iter = 10, closeid;
1854 int *centcn = (int *) malloc(classes * sizeof(int));
1855 int *cl = (int *) calloc(vocab_size, sizeof(int));
1856 real closev, x;
1857 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1858 for (a = 0; a < vocab_size; a++)
1859 cl[a] = a % clcn;
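 // each K-means iteration: sum the vectors of each cluster's members, normalize the
 // centroids to unit length, then reassign every word to the centroid with the
 // largest dot product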
1860 for (a = 0; a < iter; a++) {
1861 for (b = 0; b < clcn * layer1_size; b++)
1862 cent[b] = 0;
1863 for (b = 0; b < clcn; b++)
1864 centcn[b] = 1;
1865 for (c = 0; c < vocab_size; c++) {
1866 for (d = 0; d < layer1_size; d++)
1867 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1868 centcn[cl[c]]++;
1869 }
1870 for (b = 0; b < clcn; b++) {
1871 closev = 0;
1872 for (c = 0; c < layer1_size; c++) {
1873 cent[layer1_size * b + c] /= centcn[b];
1874 closev += cent[layer1_size * b + c]
1875 * cent[layer1_size * b + c];
1876 }
1877 closev = sqrt(closev);
1878 for (c = 0; c < layer1_size; c++)
1879 cent[layer1_size * b + c] /= closev;
1880 }
1881 for (c = 0; c < vocab_size; c++) {
1882 closev = -10;
1883 closeid = 0;
1884 for (d = 0; d < clcn; d++) {
1885 x = 0;
1886 for (b = 0; b < layer1_size; b++)
1887 x += cent[layer1_size * d + b]
1888 * syn0[c * layer1_size + b];
1889 if (x > closev) {
1890 closev = x;
1891 closeid = d;
1892 }
1893 }
1894 cl[c] = closeid;
1895 }
1896 }
1897 // Save the K-means classes
1898 for (a = 0; a < vocab_size; a++)
1899 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1900 free(centcn);
1901 free(cent);
1902 free(cl);
1903 }
1904 fclose(fo);
1905 if (save_net_file[0] != 0)
1906 SaveNet();
1907}
1908
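// ArgPos: return the index of option str in argv, or -1 if it is absent;
// exits if the option is the last argument and is therefore missing its value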
1909int ArgPos(char *str, int argc, char **argv) {
1910 int a;
1911 for (a = 1; a < argc; a++)
1912 if (!strcmp(str, argv[a])) {
1913 if (a == argc - 1) {
1914 printf("Argument missing for %s\n", str);
1915 exit(1);
1916 }
1917 return a;
1918 }
1919 return -1;
1920}
1921
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001922void print_help() {
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001923 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1924 printf("Options:\n");
1925 printf("Parameters for training:\n");
1926 printf("\t-train <file>\n");
1927 printf("\t\tUse text data from <file> to train the model\n");
1928 printf("\t-output <file>\n");
1929 printf(
1930 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1931 printf("\t-size <int>\n");
1932 printf("\t\tSet size of word vectors; default is 100\n");
1933 printf("\t-window <int>\n");
1934 printf("\t\tSet max skip length between words; default is 5\n");
1935 printf("\t-sample <float>\n");
1936 printf(
1937 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1938 printf(
1939 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1940 printf("\t-hs <int>\n");
1941 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1942 printf("\t-negative <int>\n");
1943 printf(
1944 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1945 printf("\t-negative-classes <file>\n");
1946 printf("\t\tNegative classes to sample from\n");
1947 printf("\t-nce <int>\n");
1948 printf(
1949 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1950 printf("\t-threads <int>\n");
1951 printf("\t\tUse <int> threads (default 12)\n");
1952 printf("\t-iter <int>\n");
1953 printf("\t\tRun more training iterations (default 5)\n");
1954 printf("\t-min-count <int>\n");
1955 printf(
1956 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1957 printf("\t-alpha <float>\n");
1958 printf(
1959 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1960 printf("\t-classes <int>\n");
1961 printf(
1962 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1963 printf("\t-debug <int>\n");
1964 printf(
1965 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1966 printf("\t-binary <int>\n");
1967 printf(
1968 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1969 printf("\t-save-vocab <file>\n");
1970 printf("\t\tThe vocabulary will be saved to <file>\n");
1971 printf("\t-read-vocab <file>\n");
1972 printf(
1973 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1974 printf("\t-read-net <file>\n");
1975 printf(
1976 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1977 printf("\t-save-net <file>\n");
1978 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietze423f732017-12-22 17:57:03 +01001979 printf("\t-magic-stop-file <file>\n");
1980 printf("\t\tIf the magic file <file> exists training will stop after the current cycle.\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001981 printf("\t-show-cc <int>\n");
1982 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001983 printf("\t-type <int>\n");
1984 printf(
Marc Kupietz613edbf2018-01-11 21:38:03 +01001985 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type, 5 to store positional bigrams)\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001986 printf("\t-cap <int>\n");
1987 printf(
1988 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1989 printf("\nExamples:\n");
1990 printf(
1991 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001992}
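// Illustrative invocations (sketches only; file names such as corpus.txt, vecs.bin,
// vocab.txt and net.bin are placeholders, not part of this distribution):
//   train structured skip-gram vectors:
//     ./word2vec -train corpus.txt -output vecs.bin -type 3 -size 200 -window 5 -negative 5 -threads 12 -binary 1 -save-vocab vocab.txt -save-net net.bin
//   list collocators of the top-ranked words from such a run (window-based types only):
//     ./word2vec -train corpus.txt -read-vocab vocab.txt -read-net net.bin -type 3 -size 200 -window 5 -output vecs2.bin -show-cc 1
// Both simply combine options documented above; exact requirements may differ for your setup.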
1993
1994int main(int argc, char **argv) {
1995 int i;
1996 setlocale(LC_ALL, "");
1997 if (argc == 1) {
1998 print_help();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001999 return 0;
2000 }
2001 output_file[0] = 0;
2002 save_vocab_file[0] = 0;
2003 read_vocab_file[0] = 0;
2004 save_net_file[0] = 0;
2005 read_net_file[0] = 0;
2006 negative_classes_file[0] = 0;
Marc Kupietzc7f773b2017-12-02 12:04:03 +01002007 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2008 print_help();
2009 return(0);
2010 }
2011 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2012 print_help();
2013 return(0);
2014 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002015 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2016 layer1_size = atoi(argv[i + 1]);
2017 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2018 strcpy(train_file, argv[i + 1]);
2019 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2020 strcpy(save_vocab_file, argv[i + 1]);
2021 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2022 strcpy(read_vocab_file, argv[i + 1]);
2023 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2024 strcpy(save_net_file, argv[i + 1]);
2025 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2026 strcpy(read_net_file, argv[i + 1]);
Marc Kupietze423f732017-12-22 17:57:03 +01002027 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2028 strcpy(magic_stop_file, argv[i + 1]);
2029 if (access(magic_stop_file, F_OK) != -1) {
2030 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2031 exit(1);
2032 }
2033 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002034 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2035 debug_mode = atoi(argv[i + 1]);
2036 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2037 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002038 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2039 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002040 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2041 type = atoi(argv[i + 1]);
2042 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2043 strcpy(output_file, argv[i + 1]);
2044 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2045 window = atoi(argv[i + 1]);
2046 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2047 sample = atof(argv[i + 1]);
2048 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2049 hs = atoi(argv[i + 1]);
2050 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2051 negative = atoi(argv[i + 1]);
2052 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2053 strcpy(negative_classes_file, argv[i + 1]);
2054 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2055 nce = atoi(argv[i + 1]);
2056 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2057 num_threads = atoi(argv[i + 1]);
2058 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2059 iter = atoi(argv[i + 1]);
2060 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2061 min_count = atoi(argv[i + 1]);
2062 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2063 classes = atoi(argv[i + 1]);
2064 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2065 cap = atoi(argv[i + 1]);
2066 if (type == 0 || type == 2 || type == 4)
2067 alpha = 0.05;
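 // cbow-style architectures (types 0, 2 and 4) default to the higher learning rate;
 // an explicit -alpha (parsed below) still overrides it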
Marc Kupietz613edbf2018-01-11 21:38:03 +01002068 if (type == 5) { // classic collocator mode: disable subsampling and write positional bigram counts to a collocator database
2069 sample = 0;
2070 cdb = open_collocatordb_for_write(output_file);
2071 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002072 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2073 alpha = atof(argv[i + 1]);
2074 vocab = (struct vocab_word *) calloc(vocab_max_size,
2075 sizeof(struct vocab_word));
2076 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2077 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2078 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2079 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2080 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
2081 }
Marc Kupietz210b9d52016-04-02 21:48:13 +02002082 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002083 TrainModel();
2084 return 0;
2085}
2086