Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
21#include <pthread.h>
22
23#define MAX_STRING 100
24#define EXP_TABLE_SIZE 1000
25#define MAX_EXP 6
26#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010027#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010028#define MAX_CODE_LENGTH 40
29
30const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
31
32typedef float real; // Precision of float numbers
33
34struct vocab_word {
35 long long cn;
36 int *point;
37 char *word, *code, codelen;
38};
39
40char train_file[MAX_STRING], output_file[MAX_STRING];
41char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
42char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
Marc Kupietze423f732017-12-22 17:57:03 +010043char magic_stop_file[MAX_STRING];
44
Marc Kupietzd6f9c712016-03-16 11:50:56 +010045struct vocab_word *vocab;
46int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietzc2731b22016-07-14 08:56:14 +020047 num_threads = 12, min_reduce = 1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010048int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020049long long *threadPos;
50int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010051long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
52long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
53 classes = 0;
54real alpha = 0.025, starting_alpha, sample = 1e-3;
55real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020056real avgWordLength=0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010057clock_t start;
58
59real *syn1_window, *syn1neg_window, *syn1nce_window;
60int w_offset, window_layer_size;
61
62int window_hidden_size = 500;
63real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
64 *syn_hidden_word_nce;
65
66int hs = 0, negative = 5;
67const int table_size = 1e8;
68int *table;
69
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010070long cc = 0;
71
Marc Kupietzd6f9c712016-03-16 11:50:56 +010072// contrastive negative sampling
73char negative_classes_file[MAX_STRING];
74int *word_to_group;
75int *group_to_table; //group_size*table_size
76int class_number;
77
78//nce
79real* noise_distribution;
80int nce = 0;
81
82//param caps
83real CAP_VALUE = 50;
84int cap = 0;
85
86void capParam(real* array, int index) {
87 if (array[index] > CAP_VALUE)
88 array[index] = CAP_VALUE;
89 else if (array[index] < -CAP_VALUE)
90 array[index] = -CAP_VALUE;
91}
92
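// hardTanh clips an activation to [-1, 1]; dHardTanh returns the matching
// (sub)gradient gate: 0 where the unit is saturated and the gradient would push
// it further out of range, 1 otherwise. Both are used only by the type-4 (senna) net.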
93real hardTanh(real x) {
94 if (x >= 1) {
95 return 1;
96 } else if (x <= -1) {
97 return -1;
98 } else {
99 return x;
100 }
101}
102
103real dHardTanh(real x, real g) {
104 if (x > 1 && g > 0) {
105 return 0;
106 }
107 if (x < -1 && g < 0) {
108 return 0;
109 }
110 return 1;
111}
112
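// Builds the negative-sampling table: each word occupies a share of the table
// proportional to cn^0.75, so drawing a uniform random index samples words from
// the smoothed unigram distribution. The same distribution is also stored
// densely in noise_distribution for NCE.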
113void InitUnigramTable() {
114 int a, i;
115 long long train_words_pow = 0;
116 real d1, power = 0.75;
117 table = (int *) malloc(table_size * sizeof(int));
118 for (a = 0; a < vocab_size; a++)
119 train_words_pow += pow(vocab[a].cn, power);
120 i = 0;
121 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
122 for (a = 0; a < table_size; a++) {
123 table[a] = i;
124 if (a / (real) table_size > d1) {
125 i++;
126 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
127 }
128 if (i >= vocab_size)
129 i = vocab_size - 1;
130 }
131
132 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
133 for (a = 0; a < vocab_size; a++)
134 noise_distribution[a] = pow(vocab[a].cn, power)
135 / (real) train_words_pow;
136}
137
138// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
139void ReadWord(char *word, FILE *fin) {
140 int a = 0, ch;
141 while (!feof(fin)) {
142 ch = fgetc(fin);
143 if (ch == 13)
144 continue;
145 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
146 if (a > 0) {
147 if (ch == '\n')
148 ungetc(ch, fin);
149 break;
150 }
151 if (ch == '\n') {
152 strcpy(word, (char *) "</s>");
153 return;
154 } else
155 continue;
156 }
157 word[a] = ch;
158 a++;
159 if (a >= MAX_STRING - 1)
160 a--; // Truncate too long words
161 }
162 word[a] = 0;
163}
164
165// Returns hash value of a word
166int GetWordHash(char *word) {
167 unsigned long long a, hash = 0;
168 for (a = 0; a < strlen(word); a++)
169 hash = hash * 257 + word[a];
170 hash = hash % vocab_hash_size;
171 return hash;
172}
173
174// Returns position of a word in the vocabulary; if the word is not found, returns -1
175int SearchVocab(char *word) {
176 unsigned int hash = GetWordHash(word);
177 while (1) {
178 if (vocab_hash[hash] == -1)
179 return -1;
180 if (!strcmp(word, vocab[vocab_hash[hash]].word))
181 return vocab_hash[hash];
182 hash = (hash + 1) % vocab_hash_size;
183 }
184 return -1;
185}
186
187// Reads a word and returns its index in the vocabulary
188int ReadWordIndex(FILE *fin) {
189 char word[MAX_STRING];
190 ReadWord(word, fin);
191 if (feof(fin))
192 return -1;
193 return SearchVocab(word);
194}
195
196// Adds a word to the vocabulary
197int AddWordToVocab(char *word) {
198 unsigned int hash, length = strlen(word) + 1;
199 if (length > MAX_STRING)
200 length = MAX_STRING;
201 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
202 strcpy(vocab[vocab_size].word, word);
203 vocab[vocab_size].cn = 0;
204 vocab_size++;
205 // Reallocate memory if needed
206 if (vocab_size + 2 >= vocab_max_size) {
207 vocab_max_size += 1000;
208 vocab = (struct vocab_word *) realloc(vocab,
209 vocab_max_size * sizeof(struct vocab_word));
210 }
211 hash = GetWordHash(word);
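	// open addressing with linear probing: advance to the next free hash slot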
212 while (vocab_hash[hash] != -1)
213 hash = (hash + 1) % vocab_hash_size;
214 vocab_hash[hash] = vocab_size - 1;
215 return vocab_size - 1;
216}
217
218// Used later for sorting by word counts
219int VocabCompare(const void *a, const void *b) {
220 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
221}
222
223// Sorts the vocabulary by frequency using word counts
224void SortVocab() {
225 int a, size;
226 unsigned int hash;
227 // Sort the vocabulary and keep </s> at the first position
228 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
229 for (a = 0; a < vocab_hash_size; a++)
230 vocab_hash[a] = -1;
231 size = vocab_size;
232 train_words = 0;
233 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200234 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100235 // Words occurring less than min_count times will be discarded from the vocab
236 if ((vocab[a].cn < min_count) && (a != 0)) {
237 vocab_size--;
238 free(vocab[a].word);
239 } else {
240 // Hash will be re-computed, as it is no longer valid after the sorting
241 hash = GetWordHash(vocab[a].word);
242 while (vocab_hash[hash] != -1)
243 hash = (hash + 1) % vocab_hash_size;
244 vocab_hash[hash] = a;
245 train_words += vocab[a].cn;
246 }
247 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200248 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100249 vocab = (struct vocab_word *) realloc(vocab,
250 (vocab_size + 1) * sizeof(struct vocab_word));
251 // Allocate memory for the binary tree construction
252 for (a = 0; a < vocab_size; a++) {
253 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
254 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
255 }
256}
257
258// Reduces the vocabulary by removing infrequent tokens
259void ReduceVocab() {
260 int a, b = 0;
261 unsigned int hash;
262 for (a = 0; a < vocab_size; a++)
263 if (vocab[a].cn > min_reduce) {
264 vocab[b].cn = vocab[a].cn;
265 vocab[b].word = vocab[a].word;
266 b++;
267 } else
268 free(vocab[a].word);
269 vocab_size = b;
270 for (a = 0; a < vocab_hash_size; a++)
271 vocab_hash[a] = -1;
272 for (a = 0; a < vocab_size; a++) {
273 // Hash will be re-computed, as it is no longer valid
274 hash = GetWordHash(vocab[a].word);
275 while (vocab_hash[hash] != -1)
276 hash = (hash + 1) % vocab_hash_size;
277 vocab_hash[hash] = a;
278 }
279 fflush(stdout);
280 min_reduce++;
281}
282
283// Create binary Huffman tree using the word counts
284// Frequent words will have short unique binary codes
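// Two sorted queues are merged Huffman-style: count[0..vocab_size-1] holds the
// leaf counts (descending, because SortVocab ran first) and count[vocab_size..]
// the internal nodes created so far (ascending), so the two smallest remaining
// nodes are always found at pos1 (scanning leaves right-to-left) or pos2
// (scanning internal nodes left-to-right).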
285void CreateBinaryTree() {
286 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
287 char code[MAX_CODE_LENGTH];
288 long long *count = (long long *) calloc(vocab_size * 2 + 1,
289 sizeof(long long));
290 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
291 sizeof(long long));
292 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
293 sizeof(long long));
294 for (a = 0; a < vocab_size; a++)
295 count[a] = vocab[a].cn;
296 for (a = vocab_size; a < vocab_size * 2; a++)
297 count[a] = 1e15;
298 pos1 = vocab_size - 1;
299 pos2 = vocab_size;
300 // Following algorithm constructs the Huffman tree by adding one node at a time
301 for (a = 0; a < vocab_size - 1; a++) {
302 // First, find two smallest nodes 'min1, min2'
303 if (pos1 >= 0) {
304 if (count[pos1] < count[pos2]) {
305 min1i = pos1;
306 pos1--;
307 } else {
308 min1i = pos2;
309 pos2++;
310 }
311 } else {
312 min1i = pos2;
313 pos2++;
314 }
315 if (pos1 >= 0) {
316 if (count[pos1] < count[pos2]) {
317 min2i = pos1;
318 pos1--;
319 } else {
320 min2i = pos2;
321 pos2++;
322 }
323 } else {
324 min2i = pos2;
325 pos2++;
326 }
327 count[vocab_size + a] = count[min1i] + count[min2i];
328 parent_node[min1i] = vocab_size + a;
329 parent_node[min2i] = vocab_size + a;
330 binary[min2i] = 1;
331 }
332 // Now assign binary code to each vocabulary word
333 for (a = 0; a < vocab_size; a++) {
334 b = a;
335 i = 0;
336 while (1) {
337 code[i] = binary[b];
338 point[i] = b;
339 i++;
340 b = parent_node[b];
341 if (b == vocab_size * 2 - 2)
342 break;
343 }
344 vocab[a].codelen = i;
345 vocab[a].point[0] = vocab_size - 2;
346 for (b = 0; b < i; b++) {
347 vocab[a].code[i - b - 1] = code[b];
348 vocab[a].point[i - b] = point[b] - vocab_size;
349 }
350 }
351 free(count);
352 free(binary);
353 free(parent_node);
354}
355
356void LearnVocabFromTrainFile() {
357 char word[MAX_STRING];
358 FILE *fin;
359 long long a, i;
360 for (a = 0; a < vocab_hash_size; a++)
361 vocab_hash[a] = -1;
362 fin = fopen(train_file, "rb");
363 if (fin == NULL) {
364 printf("ERROR: training data file not found!\n");
365 exit(1);
366 }
367 vocab_size = 0;
368 AddWordToVocab((char *) "</s>");
369 while (1) {
370 ReadWord(word, fin);
371 if (feof(fin))
372 break;
373 train_words++;
374 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
375 printf("%lldK%c", train_words / 1000, 13);
376 fflush(stdout);
377 }
378 i = SearchVocab(word);
379 if (i == -1) {
380 a = AddWordToVocab(word);
381 vocab[a].cn = 1;
382 } else
383 vocab[i].cn++;
384 if (vocab_size > vocab_hash_size * 0.7)
385 ReduceVocab();
386 }
387 SortVocab();
388 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200389 printf("Vocab size: %'lld\n", vocab_size);
390 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100391 }
392 file_size = ftell(fin);
393 fclose(fin);
394}
395
396void SaveVocab() {
397 long long i;
398 FILE *fo = fopen(save_vocab_file, "wb");
399 for (i = 0; i < vocab_size; i++)
400 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
401 fclose(fo);
402}
403
404void ReadVocab() {
405 long long a, i = 0;
406 char c;
407 char word[MAX_STRING];
408 FILE *fin = fopen(read_vocab_file, "rb");
409 if (fin == NULL) {
410 printf("Vocabulary file not found\n");
411 exit(1);
412 }
413 for (a = 0; a < vocab_hash_size; a++)
414 vocab_hash[a] = -1;
415 vocab_size = 0;
416 while (1) {
417 ReadWord(word, fin);
418 if (feof(fin))
419 break;
420 a = AddWordToVocab(word);
421 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
422 i++;
423 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200424 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100425 fin = fopen(train_file, "rb");
426 if (fin == NULL) {
427 printf("ERROR: training data file not found!\n");
428 exit(1);
429 }
430 fseek(fin, 0, SEEK_END);
431 file_size = ftell(fin);
432 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200433 SortVocab();
434 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200435 printf("Vocab size: %'lld\n", vocab_size);
436 printf("Words in vocab's train file: %'lld\n", train_words);
437 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200438 }
Marc Kupietze23c5402016-07-14 11:10:09 +0200439 train_words = file_size / avgWordLength;
440 if(debug_mode > 0)
441 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100442}
443
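// Contrastive negative sampling: reads the negative_classes_file, maps each
// vocabulary word to a class, and builds one sampling table per class so that
// negative examples can be drawn from the same class as the current word.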
444void InitClassUnigramTable() {
445 long long a, c;
446 printf("loading class unigrams \n");
447 FILE *fin = fopen(negative_classes_file, "rb");
448 if (fin == NULL) {
449 printf("ERROR: class file not found!\n");
450 exit(1);
451 }
452 word_to_group = (int *) malloc(vocab_size * sizeof(int));
453 for (a = 0; a < vocab_size; a++)
454 word_to_group[a] = -1;
455 char class[MAX_STRING];
456 char prev_class[MAX_STRING];
457 prev_class[0] = 0;
458 char word[MAX_STRING];
459 class_number = -1;
460 while (1) {
461 if (feof(fin))
462 break;
463 ReadWord(class, fin);
464 ReadWord(word, fin);
465 int word_index = SearchVocab(word);
466 if (word_index != -1) {
467 if (strcmp(class, prev_class) != 0) {
468 class_number++;
469 strcpy(prev_class, class);
470 }
471 word_to_group[word_index] = class_number;
472 }
473 ReadWord(word, fin);
474 }
475 class_number++;
476 fclose(fin);
477
478 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
479 long long train_words_pow = 0;
480 real d1, power = 0.75;
481
482 for (c = 0; c < class_number; c++) {
483 long long offset = c * table_size;
484 train_words_pow = 0;
485 for (a = 0; a < vocab_size; a++)
486 if (word_to_group[a] == c)
487 train_words_pow += pow(vocab[a].cn, power);
488 int i = 0;
489 while (i < vocab_size && word_to_group[i] != c)
490 i++;
491 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
492 for (a = 0; a < table_size; a++) {
493 //printf("index %lld , word %d\n", a, i);
494 group_to_table[offset + a] = i;
495 if (a / (real) table_size > d1) {
496 i++;
497 while (i < vocab_size && word_to_group[i] != c)
498 i++;
499 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
500 }
501 if (i >= vocab_size) {
 i = vocab_size - 1;
502 while (i >= 0 && word_to_group[i] != c)
503 i--;
 }
504 }
505 }
506}
507
Marc Kupietz210b9d52016-04-02 21:48:13 +0200508void SaveArgs(int argc, char **argv) {
509 unsigned int i;
Marc Kupietz44136742017-12-22 17:52:56 +0100510 char args_file[MAX_STRING];
511 strcpy(args_file, output_file);
Marc Kupietz210b9d52016-04-02 21:48:13 +0200512 strcat(args_file, ".args");
513 FILE *fargs = fopen(args_file, "w");
514 if (fargs == NULL) {
515 printf("Cannot save args to %s.\n", args_file);
516 return;
517 }
518
Marc Kupietz44136742017-12-22 17:52:56 +0100519 for(i=1; i<argc; i++)
520 fprintf(fargs, "%s ", argv[i]);
521
522 fprintf(fargs, "\n");
Marc Kupietz210b9d52016-04-02 21:48:13 +0200523 fclose(fargs);
Marc Kupietz44136742017-12-22 17:52:56 +0100524
Marc Kupietz210b9d52016-04-02 21:48:13 +0200525 return;
526}
527
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100528void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100529 if(type != 3 || negative <= 0) {
530 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
531 return;
532 }
533
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100534 FILE *fnet = fopen(save_net_file, "wb");
535 if (fnet == NULL) {
536 printf("Net parameter file not found\n");
537 exit(1);
538 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100539 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100540 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100541 fclose(fnet);
542}
543
544void InitNet() {
545 long long a, b;
546 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100547 long long read;
548
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100549 window_layer_size = layer1_size * window * 2;
550 a = posix_memalign((void **) &syn0, 128,
551 (long long) vocab_size * layer1_size * sizeof(real));
552 if (syn0 == NULL) {
553 printf("Memory allocation failed\n");
554 exit(1);
555 }
556
557 if (hs) {
558 a = posix_memalign((void **) &syn1, 128,
559 (long long) vocab_size * layer1_size * sizeof(real));
560 if (syn1 == NULL) {
561 printf("Memory allocation failed\n");
562 exit(1);
563 }
564 a = posix_memalign((void **) &syn1_window, 128,
565 (long long) vocab_size * window_layer_size * sizeof(real));
566 if (syn1_window == NULL) {
567 printf("Memory allocation failed\n");
568 exit(1);
569 }
570 a = posix_memalign((void **) &syn_hidden_word, 128,
571 (long long) vocab_size * window_hidden_size * sizeof(real));
572 if (syn_hidden_word == NULL) {
573 printf("Memory allocation failed\n");
574 exit(1);
575 }
576
577 for (a = 0; a < vocab_size; a++)
578 for (b = 0; b < layer1_size; b++)
579 syn1[a * layer1_size + b] = 0;
580 for (a = 0; a < vocab_size; a++)
581 for (b = 0; b < window_layer_size; b++)
582 syn1_window[a * window_layer_size + b] = 0;
583 for (a = 0; a < vocab_size; a++)
584 for (b = 0; b < window_hidden_size; b++)
585 syn_hidden_word[a * window_hidden_size + b] = 0;
586 }
587 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100588 if(type == 0) {
589 a = posix_memalign((void **) &syn1neg, 128,
590 (long long) vocab_size * layer1_size * sizeof(real));
591 if (syn1neg == NULL) {
592 printf("Memory allocation failed\n");
593 exit(1);
594 }
595 for (a = 0; a < vocab_size; a++)
596 for (b = 0; b < layer1_size; b++)
597 syn1neg[a * layer1_size + b] = 0;
598 } else if (type == 3) {
599 a = posix_memalign((void **) &syn1neg_window, 128,
600 (long long) vocab_size * window_layer_size * sizeof(real));
601 if (syn1neg_window == NULL) {
602 printf("Memory allocation failed\n");
603 exit(1);
604 }
605 for (a = 0; a < vocab_size; a++)
606 for (b = 0; b < window_layer_size; b++)
607 syn1neg_window[a * window_layer_size + b] = 0;
608 } else if (type == 4) {
609 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
610 (long long) vocab_size * window_hidden_size * sizeof(real));
611 if (syn_hidden_word_neg == NULL) {
612 printf("Memory allocation failed\n");
613 exit(1);
614 }
615 for (a = 0; a < vocab_size; a++)
616 for (b = 0; b < window_hidden_size; b++)
617 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100618 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100619 }
620 if (nce > 0) {
621 a = posix_memalign((void **) &syn1nce, 128,
622 (long long) vocab_size * layer1_size * sizeof(real));
623 if (syn1nce == NULL) {
624 printf("Memory allocation failed\n");
625 exit(1);
626 }
627 a = posix_memalign((void **) &syn1nce_window, 128,
628 (long long) vocab_size * window_layer_size * sizeof(real));
629 if (syn1nce_window == NULL) {
630 printf("Memory allocation failed\n");
631 exit(1);
632 }
633 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
634 (long long) vocab_size * window_hidden_size * sizeof(real));
635 if (syn_hidden_word_nce == NULL) {
636 printf("Memory allocation failed\n");
637 exit(1);
638 }
639
640 for (a = 0; a < vocab_size; a++)
641 for (b = 0; b < layer1_size; b++)
642 syn1nce[a * layer1_size + b] = 0;
643 for (a = 0; a < vocab_size; a++)
644 for (b = 0; b < window_layer_size; b++)
645 syn1nce_window[a * window_layer_size + b] = 0;
646 for (a = 0; a < vocab_size; a++)
647 for (b = 0; b < window_hidden_size; b++)
648 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
649 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100650
Marc Kupietz1006a272016-03-16 15:50:20 +0100651 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100652 a = posix_memalign((void **) &syn_window_hidden, 128,
653 window_hidden_size * window_layer_size * sizeof(real));
654 if (syn_window_hidden == NULL) {
655 printf("Memory allocation failed\n");
656 exit(1);
657 }
658 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
659 next_random = next_random * (unsigned long long) 25214903917 + 11;
660 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
661 - 0.5) / (window_hidden_size * window_layer_size);
662 }
663 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100664
665 if (read_net_file[0] == 0) {
666 for (a = 0; a < vocab_size; a++)
667 for (b = 0; b < layer1_size; b++) {
668 next_random = next_random * (unsigned long long) 25214903917
669 + 11;
670 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
671 / (real) 65536) - 0.5) / layer1_size;
672 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100673 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100674 FILE *fnet = fopen(read_net_file, "rb");
675 if (fnet == NULL) {
676 printf("Net parameter file not found\n");
677 exit(1);
678 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100679 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
680 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
681 if(read != vocab_size * layer1_size) {
682 fprintf(stderr, "read-net failed %lld\n", read);
683 exit(-1);
684 }
685 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
686 if(read != (long long) vocab_size * window_layer_size) {
687 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read,
688 (long long) vocab_size * window_layer_size);
689 exit(-1);
690 }
691 fgetc(fnet);
692 if(!feof(fnet)) {
693 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
694 exit(-1);
695 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100696 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100697 } else {
698 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
699 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100700 }
701
702 CreateBinaryTree();
703}
704
Marc Kupietz202723e2016-07-14 09:12:00 +0200705char *currentDateTime(char *buf, real offset) {
706 time_t t;
707 time(&t);
708 t += (long) offset;
709 struct tm tstruct;
710 tstruct = *localtime(&t);
711 strftime(buf, 80, "%c", &tstruct);
712 return buf;
713}
714
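// Progress monitor: once per second sums each worker's file position and
// remaining iterations to estimate overall completion, then prints alpha,
// throughput, elapsed time, time to go and the estimated finishing time.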
715void *MonitorThread(void *id) {
716 char *timebuf = malloc(80);
717 int i, n=num_threads;
718 long long sum;
719 sleep(1);
720 while(n > 0) {
721 sleep(1);
722 sum = n = 0;
723 for(i=0; i < num_threads; i++) {
724 if(threadPos[i] >= 0) {
725 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
726 n++;
727 } else {
728 sum += iter * file_size / num_threads;
729 }
730 }
731 if(n == 0)
732 break;
733 real finished_portion = (real) sum / (float) (file_size * iter);
734 long long now = clock();
735 long long elapsed = (now - start) / CLOCKS_PER_SEC / num_threads;
736 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed) * ((real) num_threads / n) ;
737
738 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/t/s TE: %llds TTG: %llds ETA: %s\033[K",
739 alpha,
740 finished_portion * 100,
741 (float) sum / elapsed / num_threads / 1000,
742 elapsed,
743 ttg,
744 currentDateTime(timebuf, ttg)
745 );
746 fflush(stdout);
747 }
748 pthread_exit(NULL);
749}
750
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100751void *TrainModelThread(void *id) {
752 long long a, b, d, cw, word, last_word, sentence_length = 0,
753 sentence_position = 0;
754 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
755 long long l1, l2, c, target, label, local_iter = iter;
756 unsigned long long next_random = (long long) id;
757 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100758 int input_len_1 = layer1_size;
759 int window_offset = -1;
760 if (type == 2 || type == 4) {
761 input_len_1 = window_layer_size;
762 }
763 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
764 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200765 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100766
767 int input_len_2 = 0;
768 if (type == 4) {
769 input_len_2 = window_hidden_size;
770 }
771 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
772 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
773
774 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200775 long long start_pos = file_size / (long long) num_threads * (long long) id;
776 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
777 long long current_pos = start_pos;
778 long long last_pos = start_pos;
779 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100780 while (1) {
Marc Kupietz202723e2016-07-14 09:12:00 +0200781 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100782 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200783 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100784 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100785 alpha = starting_alpha
786 * (1 - word_count_actual / (real) (iter * train_words + 1));
787 if (alpha < starting_alpha * 0.0001)
788 alpha = starting_alpha * 0.0001;
789 }
790 if (sentence_length == 0) {
791 while (1) {
792 word = ReadWordIndex(fi);
793 if (feof(fi))
794 break;
795 if (word == -1)
796 continue;
797 word_count++;
798 if (word == 0)
799 break;
800 // The subsampling randomly discards frequent words while keeping the ranking same
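				// With f = cn/train_words, the keep probability below is
				// (sqrt(f/sample) + 1) * sample/f, compared against a uniform draw;
				// for type 3 a discarded word is kept as -2 so window positions stay aligned.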
801 if (sample > 0) {
802 real ran = (sqrt(vocab[word].cn / (sample * train_words))
803 + 1) * (sample * train_words) / vocab[word].cn;
804 next_random = next_random * (unsigned long long) 25214903917
805 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100806 if (ran < (next_random & 0xFFFF) / (real) 65536) {
807 if(type == 3) // in structured skipgrams
808 word = -2; // keep the window position correct
809 else
810 continue;
811 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100812 }
813 sen[sentence_length] = word;
814 sentence_length++;
815 if (sentence_length >= MAX_SENTENCE_LENGTH)
816 break;
817 }
818 sentence_position = 0;
819 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200820 current_pos = threadPos[(long) id] = ftell(fi);
821 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100822 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200823 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100824 local_iter--;
825 if (local_iter == 0)
826 break;
Marc Kupietze423f732017-12-22 17:57:03 +0100827 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
828 printf("Magic stop file %s found. Stopping training ...\n", magic_stop_file);
829 break;
830 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100831 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200832 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100833 last_word_count = 0;
834 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200835 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100836 continue;
837 }
838 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200839 while (word == -2 && sentence_position<sentence_length)
840 word = sen[++sentence_position];
841 if (sentence_position>=sentence_length) {
842 sentence_length=0;
843 continue;
844 }
845 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100846 continue;
847 for (c = 0; c < input_len_1; c++)
848 neu1[c] = 0;
849 for (c = 0; c < input_len_1; c++)
850 neu1e[c] = 0;
851 for (c = 0; c < input_len_2; c++)
852 neu2[c] = 0;
853 for (c = 0; c < input_len_2; c++)
854 neu2e[c] = 0;
855 next_random = next_random * (unsigned long long) 25214903917 + 11;
856 b = next_random % window;
857 if (type == 0) { //train the cbow architecture
858 // in -> hidden
859 cw = 0;
860 for (a = b; a < window * 2 + 1 - b; a++)
861 if (a != window) {
862 c = sentence_position - window + a;
863 if (c < 0)
864 continue;
865 if (c >= sentence_length)
866 continue;
867 last_word = sen[c];
868 if (last_word == -1)
869 continue;
870 for (c = 0; c < layer1_size; c++)
871 neu1[c] += syn0[c + last_word * layer1_size];
872 cw++;
873 }
874 if (cw) {
875 for (c = 0; c < layer1_size; c++)
876 neu1[c] /= cw;
877 if (hs)
878 for (d = 0; d < vocab[word].codelen; d++) {
879 f = 0;
880 l2 = vocab[word].point[d] * layer1_size;
881 // Propagate hidden -> output
882 for (c = 0; c < layer1_size; c++)
883 f += neu1[c] * syn1[c + l2];
884 if (f <= -MAX_EXP)
885 continue;
886 else if (f >= MAX_EXP)
887 continue;
888 else
889 f = expTable[(int) ((f + MAX_EXP)
890 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
891 // 'g' is the gradient multiplied by the learning rate
892 g = (1 - vocab[word].code[d] - f) * alpha;
893 // Propagate errors output -> hidden
894 for (c = 0; c < layer1_size; c++)
895 neu1e[c] += g * syn1[c + l2];
896 // Learn weights hidden -> output
897 for (c = 0; c < layer1_size; c++)
898 syn1[c + l2] += g * neu1[c];
899 if (cap == 1)
900 for (c = 0; c < layer1_size; c++)
901 capParam(syn1, c + l2);
902 }
903 // NEGATIVE SAMPLING
904 if (negative > 0)
905 for (d = 0; d < negative + 1; d++) {
906 if (d == 0) {
907 target = word;
908 label = 1;
909 } else {
910 next_random = next_random
911 * (unsigned long long) 25214903917 + 11;
912 if (word_to_group != NULL
913 && word_to_group[word] != -1) {
914 target = word;
915 while (target == word) {
916 target = group_to_table[word_to_group[word]
917 * table_size
918 + (next_random >> 16) % table_size];
919 next_random = next_random
920 * (unsigned long long) 25214903917
921 + 11;
922 }
923 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
924 } else {
925 target =
926 table[(next_random >> 16) % table_size];
927 }
928 if (target == 0)
929 target = next_random % (vocab_size - 1) + 1;
930 if (target == word)
931 continue;
932 label = 0;
933 }
934 l2 = target * layer1_size;
935 f = 0;
936 for (c = 0; c < layer1_size; c++)
937 f += neu1[c] * syn1neg[c + l2];
938 if (f > MAX_EXP)
939 g = (label - 1) * alpha;
940 else if (f < -MAX_EXP)
941 g = (label - 0) * alpha;
942 else
943 g = (label
944 - expTable[(int) ((f + MAX_EXP)
945 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
946 * alpha;
947 for (c = 0; c < layer1_size; c++)
948 neu1e[c] += g * syn1neg[c + l2];
949 for (c = 0; c < layer1_size; c++)
950 syn1neg[c + l2] += g * neu1[c];
951 if (cap == 1)
952 for (c = 0; c < layer1_size; c++)
953 capParam(syn1neg, c + l2);
954 }
955 // Noise Contrastive Estimation
956 if (nce > 0)
957 for (d = 0; d < nce + 1; d++) {
958 if (d == 0) {
959 target = word;
960 label = 1;
961 } else {
962 next_random = next_random
963 * (unsigned long long) 25214903917 + 11;
964 if (word_to_group != NULL
965 && word_to_group[word] != -1) {
966 target = word;
967 while (target == word) {
968 target = group_to_table[word_to_group[word]
969 * table_size
970 + (next_random >> 16) % table_size];
971 next_random = next_random
972 * (unsigned long long) 25214903917
973 + 11;
974 }
975 } else {
976 target =
977 table[(next_random >> 16) % table_size];
978 }
979 if (target == 0)
980 target = next_random % (vocab_size - 1) + 1;
981 if (target == word)
982 continue;
983 label = 0;
984 }
985 l2 = target * layer1_size;
986 f = 0;
987
988 for (c = 0; c < layer1_size; c++)
989 f += neu1[c] * syn1nce[c + l2];
990 if (f > MAX_EXP)
991 g = (label - 1) * alpha;
992 else if (f < -MAX_EXP)
993 g = (label - 0) * alpha;
994 else {
995 f = exp(f);
996 g =
997 (label
998 - f
999 / (noise_distribution[target]
1000 * nce + f)) * alpha;
1001 }
1002 for (c = 0; c < layer1_size; c++)
1003 neu1e[c] += g * syn1nce[c + l2];
1004 for (c = 0; c < layer1_size; c++)
1005 syn1nce[c + l2] += g * neu1[c];
1006 if (cap == 1)
1007 for (c = 0; c < layer1_size; c++)
1008 capParam(syn1nce, c + l2);
1009 }
1010 // hidden -> in
1011 for (a = b; a < window * 2 + 1 - b; a++)
1012 if (a != window) {
1013 c = sentence_position - window + a;
1014 if (c < 0)
1015 continue;
1016 if (c >= sentence_length)
1017 continue;
1018 last_word = sen[c];
1019 if (last_word == -1)
1020 continue;
1021 for (c = 0; c < layer1_size; c++)
1022 syn0[c + last_word * layer1_size] += neu1e[c];
1023 }
1024 }
1025 } else if (type == 1) { //train skip-gram
1026 for (a = b; a < window * 2 + 1 - b; a++)
1027 if (a != window) {
1028 c = sentence_position - window + a;
1029 if (c < 0)
1030 continue;
1031 if (c >= sentence_length)
1032 continue;
1033 last_word = sen[c];
1034 if (last_word == -1)
1035 continue;
1036 l1 = last_word * layer1_size;
1037 for (c = 0; c < layer1_size; c++)
1038 neu1e[c] = 0;
1039 // HIERARCHICAL SOFTMAX
1040 if (hs)
1041 for (d = 0; d < vocab[word].codelen; d++) {
1042 f = 0;
1043 l2 = vocab[word].point[d] * layer1_size;
1044 // Propagate hidden -> output
1045 for (c = 0; c < layer1_size; c++)
1046 f += syn0[c + l1] * syn1[c + l2];
1047 if (f <= -MAX_EXP)
1048 continue;
1049 else if (f >= MAX_EXP)
1050 continue;
1051 else
1052 f = expTable[(int) ((f + MAX_EXP)
1053 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1054 // 'g' is the gradient multiplied by the learning rate
1055 g = (1 - vocab[word].code[d] - f) * alpha;
1056 // Propagate errors output -> hidden
1057 for (c = 0; c < layer1_size; c++)
1058 neu1e[c] += g * syn1[c + l2];
1059 // Learn weights hidden -> output
1060 for (c = 0; c < layer1_size; c++)
1061 syn1[c + l2] += g * syn0[c + l1];
1062 if (cap == 1)
1063 for (c = 0; c < layer1_size; c++)
1064 capParam(syn1, c + l2);
1065 }
1066 // NEGATIVE SAMPLING
1067 if (negative > 0)
1068 for (d = 0; d < negative + 1; d++) {
1069 if (d == 0) {
1070 target = word;
1071 label = 1;
1072 } else {
1073 next_random = next_random
1074 * (unsigned long long) 25214903917 + 11;
1075 if (word_to_group != NULL
1076 && word_to_group[word] != -1) {
1077 target = word;
1078 while (target == word) {
1079 target =
1080 group_to_table[word_to_group[word]
1081 * table_size
1082 + (next_random >> 16)
1083 % table_size];
1084 next_random =
1085 next_random
1086 * (unsigned long long) 25214903917
1087 + 11;
1088 }
1089 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1090 } else {
1091 target = table[(next_random >> 16)
1092 % table_size];
1093 }
1094 if (target == 0)
1095 target = next_random % (vocab_size - 1) + 1;
1096 if (target == word)
1097 continue;
1098 label = 0;
1099 }
1100 l2 = target * layer1_size;
1101 f = 0;
1102 for (c = 0; c < layer1_size; c++)
1103 f += syn0[c + l1] * syn1neg[c + l2];
1104 if (f > MAX_EXP)
1105 g = (label - 1) * alpha;
1106 else if (f < -MAX_EXP)
1107 g = (label - 0) * alpha;
1108 else
1109 g =
1110 (label
1111 - expTable[(int) ((f + MAX_EXP)
1112 * (EXP_TABLE_SIZE
1113 / MAX_EXP / 2))])
1114 * alpha;
1115 for (c = 0; c < layer1_size; c++)
1116 neu1e[c] += g * syn1neg[c + l2];
1117 for (c = 0; c < layer1_size; c++)
1118 syn1neg[c + l2] += g * syn0[c + l1];
1119 if (cap == 1)
1120 for (c = 0; c < layer1_size; c++)
1121 capParam(syn1neg, c + l2);
1122 }
1123 //Noise Contrastive Estimation
1124 if (nce > 0)
1125 for (d = 0; d < nce + 1; d++) {
1126 if (d == 0) {
1127 target = word;
1128 label = 1;
1129 } else {
1130 next_random = next_random
1131 * (unsigned long long) 25214903917 + 11;
1132 if (word_to_group != NULL
1133 && word_to_group[word] != -1) {
1134 target = word;
1135 while (target == word) {
1136 target =
1137 group_to_table[word_to_group[word]
1138 * table_size
1139 + (next_random >> 16)
1140 % table_size];
1141 next_random =
1142 next_random
1143 * (unsigned long long) 25214903917
1144 + 11;
1145 }
1146 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1147 } else {
1148 target = table[(next_random >> 16)
1149 % table_size];
1150 }
1151 if (target == 0)
1152 target = next_random % (vocab_size - 1) + 1;
1153 if (target == word)
1154 continue;
1155 label = 0;
1156 }
1157 l2 = target * layer1_size;
1158 f = 0;
1159 for (c = 0; c < layer1_size; c++)
1160 f += syn0[c + l1] * syn1nce[c + l2];
1161 if (f > MAX_EXP)
1162 g = (label - 1) * alpha;
1163 else if (f < -MAX_EXP)
1164 g = (label - 0) * alpha;
1165 else {
1166 f = exp(f);
1167 g = (label
1168 - f
1169 / (noise_distribution[target]
1170 * nce + f)) * alpha;
1171 }
1172 for (c = 0; c < layer1_size; c++)
1173 neu1e[c] += g * syn1nce[c + l2];
1174 for (c = 0; c < layer1_size; c++)
1175 syn1nce[c + l2] += g * syn0[c + l1];
1176 if (cap == 1)
1177 for (c = 0; c < layer1_size; c++)
1178 capParam(syn1nce, c + l2);
1179 }
1180 // Learn weights input -> hidden
1181 for (c = 0; c < layer1_size; c++)
1182 syn0[c + l1] += neu1e[c];
1183 }
1184 } else if (type == 2) { //train the cwindow architecture
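			// cwindow: the context word vectors are concatenated position by position
			// into neu1 (window_layer_size = 2 * window * layer1_size) instead of being
			// averaged, so the classifier is sensitive to word order.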
1185 // in -> hidden
1186 cw = 0;
1187 for (a = 0; a < window * 2 + 1; a++)
1188 if (a != window) {
1189 c = sentence_position - window + a;
1190 if (c < 0)
1191 continue;
1192 if (c >= sentence_length)
1193 continue;
1194 last_word = sen[c];
1195 if (last_word == -1)
1196 continue;
1197 window_offset = a * layer1_size;
1198 if (a > window)
1199 window_offset -= layer1_size;
1200 for (c = 0; c < layer1_size; c++)
1201 neu1[c + window_offset] += syn0[c
1202 + last_word * layer1_size];
1203 cw++;
1204 }
1205 if (cw) {
1206 if (hs)
1207 for (d = 0; d < vocab[word].codelen; d++) {
1208 f = 0;
1209 l2 = vocab[word].point[d] * window_layer_size;
1210 // Propagate hidden -> output
1211 for (c = 0; c < window_layer_size; c++)
1212 f += neu1[c] * syn1_window[c + l2];
1213 if (f <= -MAX_EXP)
1214 continue;
1215 else if (f >= MAX_EXP)
1216 continue;
1217 else
1218 f = expTable[(int) ((f + MAX_EXP)
1219 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1220 // 'g' is the gradient multiplied by the learning rate
1221 g = (1 - vocab[word].code[d] - f) * alpha;
1222 // Propagate errors output -> hidden
1223 for (c = 0; c < window_layer_size; c++)
1224 neu1e[c] += g * syn1_window[c + l2];
1225 // Learn weights hidden -> output
1226 for (c = 0; c < window_layer_size; c++)
1227 syn1_window[c + l2] += g * neu1[c];
1228 if (cap == 1)
1229 for (c = 0; c < window_layer_size; c++)
1230 capParam(syn1_window, c + l2);
1231 }
1232 // NEGATIVE SAMPLING
1233 if (negative > 0)
1234 for (d = 0; d < negative + 1; d++) {
1235 if (d == 0) {
1236 target = word;
1237 label = 1;
1238 } else {
1239 next_random = next_random
1240 * (unsigned long long) 25214903917 + 11;
1241 if (word_to_group != NULL
1242 && word_to_group[word] != -1) {
1243 target = word;
1244 while (target == word) {
1245 target = group_to_table[word_to_group[word]
1246 * table_size
1247 + (next_random >> 16) % table_size];
1248 next_random = next_random
1249 * (unsigned long long) 25214903917
1250 + 11;
1251 }
1252 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1253 } else {
1254 target =
1255 table[(next_random >> 16) % table_size];
1256 }
1257 if (target == 0)
1258 target = next_random % (vocab_size - 1) + 1;
1259 if (target == word)
1260 continue;
1261 label = 0;
1262 }
1263 l2 = target * window_layer_size;
1264 f = 0;
1265 for (c = 0; c < window_layer_size; c++)
1266 f += neu1[c] * syn1neg_window[c + l2];
1267 if (f > MAX_EXP)
1268 g = (label - 1) * alpha;
1269 else if (f < -MAX_EXP)
1270 g = (label - 0) * alpha;
1271 else
1272 g = (label
1273 - expTable[(int) ((f + MAX_EXP)
1274 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1275 * alpha;
1276 for (c = 0; c < window_layer_size; c++)
1277 neu1e[c] += g * syn1neg_window[c + l2];
1278 for (c = 0; c < window_layer_size; c++)
1279 syn1neg_window[c + l2] += g * neu1[c];
1280 if (cap == 1)
1281 for (c = 0; c < window_layer_size; c++)
1282 capParam(syn1neg_window, c + l2);
1283 }
1284 // Noise Contrastive Estimation
1285 if (nce > 0)
1286 for (d = 0; d < nce + 1; d++) {
1287 if (d == 0) {
1288 target = word;
1289 label = 1;
1290 } else {
1291 next_random = next_random
1292 * (unsigned long long) 25214903917 + 11;
1293 if (word_to_group != NULL
1294 && word_to_group[word] != -1) {
1295 target = word;
1296 while (target == word) {
1297 target = group_to_table[word_to_group[word]
1298 * table_size
1299 + (next_random >> 16) % table_size];
1300 next_random = next_random
1301 * (unsigned long long) 25214903917
1302 + 11;
1303 }
1304 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1305 } else {
1306 target =
1307 table[(next_random >> 16) % table_size];
1308 }
1309 if (target == 0)
1310 target = next_random % (vocab_size - 1) + 1;
1311 if (target == word)
1312 continue;
1313 label = 0;
1314 }
1315 l2 = target * window_layer_size;
1316 f = 0;
1317 for (c = 0; c < window_layer_size; c++)
1318 f += neu1[c] * syn1nce_window[c + l2];
1319 if (f > MAX_EXP)
1320 g = (label - 1) * alpha;
1321 else if (f < -MAX_EXP)
1322 g = (label - 0) * alpha;
1323 else {
1324 f = exp(f);
1325 g =
1326 (label
1327 - f
1328 / (noise_distribution[target]
1329 * nce + f)) * alpha;
1330 }
1331 for (c = 0; c < window_layer_size; c++)
1332 neu1e[c] += g * syn1nce_window[c + l2];
1333 for (c = 0; c < window_layer_size; c++)
1334 syn1nce_window[c + l2] += g * neu1[c];
1335 if (cap == 1)
1336 for (c = 0; c < window_layer_size; c++)
1337 capParam(syn1nce_window, c + l2);
1338 }
1339 // hidden -> in
1340 for (a = 0; a < window * 2 + 1; a++)
1341 if (a != window) {
1342 c = sentence_position - window + a;
1343 if (c < 0)
1344 continue;
1345 if (c >= sentence_length)
1346 continue;
1347 last_word = sen[c];
1348 if (last_word == -1)
1349 continue;
1350 window_offset = a * layer1_size;
1351 if (a > window)
1352 window_offset -= layer1_size;
1353 for (c = 0; c < layer1_size; c++)
1354 syn0[c + last_word * layer1_size] += neu1e[c
1355 + window_offset];
1356 }
1357 }
1358 } else if (type == 3) { //train structured skip-gram
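			// Structured skip-gram: syn1neg_window / syn1_window hold a separate
			// layer1_size block per relative window position; window_offset selects the
			// block for position a, so context predictions are position-dependent.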
1359 for (a = 0; a < window * 2 + 1; a++)
1360 if (a != window) {
1361 c = sentence_position - window + a;
1362 if (c < 0)
1363 continue;
1364 if (c >= sentence_length)
1365 continue;
1366 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001367 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001368 continue;
1369 l1 = last_word * layer1_size;
1370 window_offset = a * layer1_size;
1371 if (a > window)
1372 window_offset -= layer1_size;
1373 for (c = 0; c < layer1_size; c++)
1374 neu1e[c] = 0;
1375 // HIERARCHICAL SOFTMAX
1376 if (hs)
1377 for (d = 0; d < vocab[word].codelen; d++) {
1378 f = 0;
1379 l2 = vocab[word].point[d] * window_layer_size;
1380 // Propagate hidden -> output
1381 for (c = 0; c < layer1_size; c++)
1382 f += syn0[c + l1]
1383 * syn1_window[c + l2 + window_offset];
1384 if (f <= -MAX_EXP)
1385 continue;
1386 else if (f >= MAX_EXP)
1387 continue;
1388 else
1389 f = expTable[(int) ((f + MAX_EXP)
1390 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1391 // 'g' is the gradient multiplied by the learning rate
1392 g = (1 - vocab[word].code[d] - f) * alpha;
1393 // Propagate errors output -> hidden
1394 for (c = 0; c < layer1_size; c++)
1395 neu1e[c] += g
1396 * syn1_window[c + l2 + window_offset];
1397 // Learn weights hidden -> output
1398 for (c = 0; c < layer1_size; c++)
1399 syn1_window[c + l2 + window_offset] += g
1400 * syn0[c + l1];
1401 if (cap == 1)
1402 for (c = 0; c < layer1_size; c++)
1403 capParam(syn1_window, c + l2 + window_offset);
1404 }
1405 // NEGATIVE SAMPLING
1406 if (negative > 0)
1407 for (d = 0; d < negative + 1; d++) {
1408 if (d == 0) {
1409 target = word;
1410 label = 1;
1411 } else {
1412 next_random = next_random
1413 * (unsigned long long) 25214903917 + 11;
1414 if (word_to_group != NULL
1415 && word_to_group[word] != -1) {
1416 target = word;
1417 while (target == word) {
1418 target =
1419 group_to_table[word_to_group[word]
1420 * table_size
1421 + (next_random >> 16)
1422 % table_size];
1423 next_random =
1424 next_random
1425 * (unsigned long long) 25214903917
1426 + 11;
1427 }
1428 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1429 } else {
1430 target = table[(next_random >> 16)
1431 % table_size];
1432 }
1433 if (target == 0)
1434 target = next_random % (vocab_size - 1) + 1;
1435 if (target == word)
1436 continue;
1437 label = 0;
1438 }
1439 l2 = target * window_layer_size;
1440 f = 0;
1441 for (c = 0; c < layer1_size; c++)
1442 f +=
1443 syn0[c + l1]
1444 * syn1neg_window[c + l2
1445 + window_offset];
1446 if (f > MAX_EXP)
1447 g = (label - 1) * alpha;
1448 else if (f < -MAX_EXP)
1449 g = (label - 0) * alpha;
1450 else
1451 g =
1452 (label
1453 - expTable[(int) ((f + MAX_EXP)
1454 * (EXP_TABLE_SIZE
1455 / MAX_EXP / 2))])
1456 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001457 if(debug_mode > 2 && ((long long) id) == 0) {
1458 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1459 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1460 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001461 for (c = 0; c < layer1_size; c++)
1462 neu1e[c] +=
1463 g
1464 * syn1neg_window[c + l2
1465 + window_offset];
1466 for (c = 0; c < layer1_size; c++)
1467 syn1neg_window[c + l2 + window_offset] += g
1468 * syn0[c + l1];
1469 if (cap == 1)
1470 for (c = 0; c < layer1_size; c++)
1471 capParam(syn1neg_window,
1472 c + l2 + window_offset);
1473 }
1474 // Noise Contrastive Estimation
1475 if (nce > 0)
1476 for (d = 0; d < nce + 1; d++) {
1477 if (d == 0) {
1478 target = word;
1479 label = 1;
1480 } else {
1481 next_random = next_random
1482 * (unsigned long long) 25214903917 + 11;
1483 if (word_to_group != NULL
1484 && word_to_group[word] != -1) {
1485 target = word;
1486 while (target == word) {
1487 target =
1488 group_to_table[word_to_group[word]
1489 * table_size
1490 + (next_random >> 16)
1491 % table_size];
1492 next_random =
1493 next_random
1494 * (unsigned long long) 25214903917
1495 + 11;
1496 }
1497 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1498 } else {
1499 target = table[(next_random >> 16)
1500 % table_size];
1501 }
1502 if (target == 0)
1503 target = next_random % (vocab_size - 1) + 1;
1504 if (target == word)
1505 continue;
1506 label = 0;
1507 }
1508 l2 = target * window_layer_size;
1509 f = 0;
1510 for (c = 0; c < layer1_size; c++)
1511 f +=
1512 syn0[c + l1]
1513 * syn1nce_window[c + l2
1514 + window_offset];
1515 if (f > MAX_EXP)
1516 g = (label - 1) * alpha;
1517 else if (f < -MAX_EXP)
1518 g = (label - 0) * alpha;
1519 else {
1520 f = exp(f);
1521 g = (label
1522 - f
1523 / (noise_distribution[target]
1524 * nce + f)) * alpha;
1525 }
1526 for (c = 0; c < layer1_size; c++)
1527 neu1e[c] +=
1528 g
1529 * syn1nce_window[c + l2
1530 + window_offset];
1531 for (c = 0; c < layer1_size; c++)
1532 syn1nce_window[c + l2 + window_offset] += g
1533 * syn0[c + l1];
1534 if (cap == 1)
1535 for (c = 0; c < layer1_size; c++)
1536 capParam(syn1nce_window,
1537 c + l2 + window_offset);
1538 }
1539 // Learn weights input -> hidden
1540 for (c = 0; c < layer1_size; c++) {
1541 syn0[c + l1] += neu1e[c];
1542 if (syn0[c + l1] > 50)
1543 syn0[c + l1] = 50;
1544 if (syn0[c + l1] < -50)
1545 syn0[c + l1] = -50;
1546 }
1547 }
1548 } else if (type == 4) { //training senna
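			// Senna-style window network: the concatenated context (neu1) is projected
			// through a hidden layer (syn_window_hidden), squashed with hardTanh, and
			// scored against per-word output vectors (syn_hidden_word / syn_hidden_word_neg).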
1549 // in -> hidden
1550 cw = 0;
1551 for (a = 0; a < window * 2 + 1; a++)
1552 if (a != window) {
1553 c = sentence_position - window + a;
1554 if (c < 0)
1555 continue;
1556 if (c >= sentence_length)
1557 continue;
1558 last_word = sen[c];
1559 if (last_word == -1)
1560 continue;
1561 window_offset = a * layer1_size;
1562 if (a > window)
1563 window_offset -= layer1_size;
1564 for (c = 0; c < layer1_size; c++)
1565 neu1[c + window_offset] += syn0[c
1566 + last_word * layer1_size];
1567 cw++;
1568 }
1569 if (cw) {
1570 for (a = 0; a < window_hidden_size; a++) {
1571 c = a * window_layer_size;
1572 for (b = 0; b < window_layer_size; b++) {
1573 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1574 }
1575 }
1576 if (hs)
1577 for (d = 0; d < vocab[word].codelen; d++) {
1578 f = 0;
1579 l2 = vocab[word].point[d] * window_hidden_size;
1580 // Propagate hidden -> output
1581 for (c = 0; c < window_hidden_size; c++)
1582 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1583 if (f <= -MAX_EXP)
1584 continue;
1585 else if (f >= MAX_EXP)
1586 continue;
1587 else
1588 f = expTable[(int) ((f + MAX_EXP)
1589 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1590 // 'g' is the gradient multiplied by the learning rate
1591 g = (1 - vocab[word].code[d] - f) * alpha;
1592 // Propagate errors output -> hidden
1593 for (c = 0; c < window_hidden_size; c++)
1594 neu2e[c] += dHardTanh(neu2[c], g) * g
1595 * syn_hidden_word[c + l2];
1596 // Learn weights hidden -> output
1597 for (c = 0; c < window_hidden_size; c++)
1598 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1599 * neu2[c];
1600 }
1601 // NEGATIVE SAMPLING
1602 if (negative > 0)
1603 for (d = 0; d < negative + 1; d++) {
1604 if (d == 0) {
1605 target = word;
1606 label = 1;
1607 } else {
1608 next_random = next_random
1609 * (unsigned long long) 25214903917 + 11;
1610 if (word_to_group != NULL
1611 && word_to_group[word] != -1) {
1612 target = word;
1613 while (target == word) {
1614 target = group_to_table[word_to_group[word]
1615 * table_size
1616 + (next_random >> 16) % table_size];
1617 next_random = next_random
1618 * (unsigned long long) 25214903917
1619 + 11;
1620 }
1621 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1622 } else {
1623 target =
1624 table[(next_random >> 16) % table_size];
1625 }
1626 if (target == 0)
1627 target = next_random % (vocab_size - 1) + 1;
1628 if (target == word)
1629 continue;
1630 label = 0;
1631 }
1632 l2 = target * window_hidden_size;
1633 f = 0;
1634 for (c = 0; c < window_hidden_size; c++)
1635 f += hardTanh(neu2[c])
1636 * syn_hidden_word_neg[c + l2];
1637 if (f > MAX_EXP)
1638 g = (label - 1) * alpha / negative;
1639 else if (f < -MAX_EXP)
1640 g = (label - 0) * alpha / negative;
1641 else
1642 g = (label
1643 - expTable[(int) ((f + MAX_EXP)
1644 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1645 * alpha / negative;
1646 for (c = 0; c < window_hidden_size; c++)
1647 neu2e[c] += dHardTanh(neu2[c], g) * g
1648 * syn_hidden_word_neg[c + l2];
1649 for (c = 0; c < window_hidden_size; c++)
1650 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1651 * g * neu2[c];
1652 }
1653 for (a = 0; a < window_hidden_size; a++)
1654 for (b = 0; b < window_layer_size; b++)
1655 neu1e[b] += neu2e[a]
1656 * syn_window_hidden[a * window_layer_size + b];
1657 for (a = 0; a < window_hidden_size; a++)
1658 for (b = 0; b < window_layer_size; b++)
1659 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1660 * neu1[b];
1661 // hidden -> in
1662 for (a = 0; a < window * 2 + 1; a++)
1663 if (a != window) {
1664 c = sentence_position - window + a;
1665 if (c < 0)
1666 continue;
1667 if (c >= sentence_length)
1668 continue;
1669 last_word = sen[c];
1670 if (last_word == -1)
1671 continue;
1672 window_offset = a * layer1_size;
1673 if (a > window)
1674 window_offset -= layer1_size;
1675 for (c = 0; c < layer1_size; c++)
1676 syn0[c + last_word * layer1_size] += neu1e[c
1677 + window_offset];
1678 }
1679 }
1680 } else {
1681 printf("unknown type %i\n", type);
1682 exit(0);
1683 }
1684 sentence_position++;
1685 if (sentence_position >= sentence_length) {
1686 sentence_length = 0;
1687 continue;
1688 }
1689 }
1690 fclose(fi);
1691 free(neu1);
1692 free(neu1e);
 free(neu2);
 free(neu2e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001693 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001694 pthread_exit(NULL);
1695}
1696
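// Diagnostic dump (uses syn1neg_window, so it presupposes a type-3 net with
// negative sampling): for every word from index cc on, scores all vocabulary
// words at each window position via syn0 * syn1neg_window dot products, printing
// the best collocate per position, the word with the highest summed response,
// and the top-N (word, score, position) triples.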
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001697void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001698 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001699 real f, max_f, maxmax_f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001700 real *target_sums, bestf[MAX_CC], worstbest;
1701 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001702 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001703 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1704
1705 for (d = cc; d < vocab_size; d++) {
1706 for (b = 0; b < vocab_size; b++)
1707 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001708 for (b = 0; b < N; b++)
1709 bestf[b]=-1;
1710 worstbest = -1;
1711
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001712 maxmax_f = -1;
1713 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001714 for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001715 if (a != window) {
1716 max_f = -1;
1717 window_offset = a * layer1_size;
1718 if (a > window)
1719 window_offset -= layer1_size;
1720 for(target = 0; target < vocab_size; target ++) {
1721 if(target == d)
1722 continue;
1723 f = 0;
1724 for (c = 0; c < layer1_size; c++)
1725 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1726 if (f < -MAX_EXP)
1727 continue;
1728 else if (f > MAX_EXP)
1729 continue;
1730 else
1731 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1732 if(f > max_f) {
1733 max_f = f;
1734 max_target = target;
1735 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001736 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001737 if(f > worstbest) {
1738 for (b = 0; b < N; b++) {
1739 if (f > bestf[b]) {
1740 for (e = N - 1; e > b; e--) {
1741 bestf[e] = bestf[e - 1];
1742 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001743 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001744 }
1745 bestf[b] = f;
1746 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001747 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001748 break;
1749 }
1750 }
1751 worstbest = bestf[N-1];
1752 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001753 }
1754 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1755 if(max_f > maxmax_f) {
1756 maxmax_f = max_f;
1757 maxmax_target = max_target;
1758 }
1759 } else {
1760 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1761 }
1762 }
1763 max_f = -1;
1764 for (b = 0; b < vocab_size; b++) {
1765 if(target_sums[b] > max_f) {
1766 max_f = target_sums[b];
1767 max_target = b;
1768 }
1769 }
1770 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001771 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001772 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001773 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001774 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001775 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001776 }
1777}
1778
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001779void TrainModel() {
1780 long a, b, c, d;
1781 FILE *fo;
1782 pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // +1 slot for the monitor thread
Marc Kupietz202723e2016-07-14 09:12:00 +02001783 threadPos = malloc(num_threads * sizeof(long long));
1784 threadIters = malloc(num_threads * sizeof(int));
1785 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001786 printf("Starting training using file %s\n", train_file);
1787 starting_alpha = alpha;
1788 if (read_vocab_file[0] != 0)
1789 ReadVocab();
1790 else
1791 LearnVocabFromTrainFile();
1792 if (save_vocab_file[0] != 0)
1793 SaveVocab();
1794 if (output_file[0] == 0)
1795 return;
1796 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001797 if(cc > 0)
1798 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001799 if (negative > 0 || nce > 0)
1800 InitUnigramTable();
1801 if (negative_classes_file[0] != 0)
1802 InitClassUnigramTable();
1803 start = clock();
1804 for (a = 0; a < num_threads; a++)
1805 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
Marc Kupietz202723e2016-07-14 09:12:00 +02001806 if(debug_mode > 1)
1807 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001808 for (a = 0; a < num_threads; a++)
1809 pthread_join(pt[a], NULL);
Marc Kupietz202723e2016-07-14 09:12:00 +02001810 if(debug_mode > 1) {
1811 pthread_join(pt[num_threads], NULL);
1812 clock_t now = clock();
1813 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now-start) / CLOCKS_PER_SEC / num_threads, (now-start) / CLOCKS_PER_SEC);
1814 printf("Saving vectors to %s ...", output_file);
1815 fflush(stdout);
1816 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001817 fo = fopen(output_file, "wb");
1818 if (classes == 0) {
1819 // Save the word vectors
1820 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1821 for (a = 0; a < vocab_size; a++) {
1822 fprintf(fo, "%s ", vocab[a].word);
1823 if (binary)
1824 for (b = 0; b < layer1_size; b++)
1825 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1826 else
1827 for (b = 0; b < layer1_size; b++)
1828 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1829 fprintf(fo, "\n");
1830 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001831 if(debug_mode > 1)
1832 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001833 } else {
1834 // Run K-means on the word vectors
1835 int clcn = classes, iter = 10, closeid;
1836 int *centcn = (int *) malloc(classes * sizeof(int));
1837 int *cl = (int *) calloc(vocab_size, sizeof(int));
1838 real closev, x;
1839 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1840 for (a = 0; a < vocab_size; a++)
1841 cl[a] = a % clcn;
1842 for (a = 0; a < iter; a++) {
1843 for (b = 0; b < clcn * layer1_size; b++)
1844 cent[b] = 0;
1845 for (b = 0; b < clcn; b++)
1846 centcn[b] = 1;
1847 for (c = 0; c < vocab_size; c++) {
1848 for (d = 0; d < layer1_size; d++)
1849 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1850 centcn[cl[c]]++;
1851 }
1852 for (b = 0; b < clcn; b++) {
1853 closev = 0;
1854 for (c = 0; c < layer1_size; c++) {
1855 cent[layer1_size * b + c] /= centcn[b];
1856 closev += cent[layer1_size * b + c]
1857 * cent[layer1_size * b + c];
1858 }
1859 closev = sqrt(closev);
1860 for (c = 0; c < layer1_size; c++)
1861 cent[layer1_size * b + c] /= closev;
1862 }
1863 for (c = 0; c < vocab_size; c++) {
1864 closev = -10;
1865 closeid = 0;
1866 for (d = 0; d < clcn; d++) {
1867 x = 0;
1868 for (b = 0; b < layer1_size; b++)
1869 x += cent[layer1_size * d + b]
1870 * syn0[c * layer1_size + b];
1871 if (x > closev) {
1872 closev = x;
1873 closeid = d;
1874 }
1875 }
1876 cl[c] = closeid;
1877 }
1878 }
1879 // Save the K-means classes
1880 for (a = 0; a < vocab_size; a++)
1881 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1882 free(centcn);
1883 free(cent);
1884 free(cl);
1885 }
1886 fclose(fo);
1887 if (save_net_file[0] != 0)
1888 SaveNet();
1889}
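
/*
 * Sketch of the output written by TrainModel() above. With -classes 0 and
 * -binary 0 the file starts with a header line followed by one word per
 * line (the vector values here are purely illustrative):
 *
 *   <vocab_size> <layer1_size>
 *   the 0.041701 -0.998123 ... (layer1_size values)
 *   of -0.132810 0.207519 ...
 *
 * With -binary 1 the vector components are written via fwrite() as raw
 * `real` values in the machine's native representation; with -classes <n>
 * only "word class-id" pairs from the K-means step are written.
 */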

int ArgPos(char *str, int argc, char **argv) {
    int a;
    for (a = 1; a < argc; a++)
        if (!strcmp(str, argv[a])) {
            if (a == argc - 1) {
                printf("Argument missing for %s\n", str);
                exit(1);
            }
            return a;
        }
    return -1;
}

void print_help() {
    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf(
            "\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf(
            "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf(
            "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-hs <int>\n");
    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
    printf("\t-negative <int>\n");
    printf(
            "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-negative-classes <file>\n");
    printf("\t\tNegative classes to sample from\n");
    printf("\t-nce <int>\n");
    printf(
            "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf(
            "\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf(
            "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
    printf("\t-classes <int>\n");
    printf(
            "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
    printf("\t-debug <int>\n");
    printf(
            "\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf(
            "\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf(
            "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\t-read-net <file>\n");
    printf(
            "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
    printf("\t-save-net <file>\n");
    printf("\t\tThe net parameters will be saved to <file>\n");
    printf("\t-magic-stop-file <file>\n");
    printf("\t\tIf the magic file <file> exists, training will stop after the current cycle.\n");
    printf("\t-show-cc <int>\n");
    printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
    printf("\t-type <int>\n");
    printf(
            "\t\tType of embeddings (0 for CBOW, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
    printf("\t-cap <int>\n");
    printf(
            "\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
    printf("\nExamples:\n");
    printf(
            "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
}

int main(int argc, char **argv) {
    int i;
    setlocale(LC_ALL, "");
    if (argc == 1) {
        print_help();
        return 0;
    }
    output_file[0] = 0;
    save_vocab_file[0] = 0;
    read_vocab_file[0] = 0;
    save_net_file[0] = 0;
    read_net_file[0] = 0;
    negative_classes_file[0] = 0;
    if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
        print_help();
        return 0;
    }
    if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
        print_help();
        return 0;
    }
    if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
        layer1_size = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
        strcpy(train_file, argv[i + 1]);
    if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
        strcpy(save_vocab_file, argv[i + 1]);
    if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
        strcpy(read_vocab_file, argv[i + 1]);
    if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
        strcpy(save_net_file, argv[i + 1]);
    if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
        strcpy(read_net_file, argv[i + 1]);
    if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
        strcpy(magic_stop_file, argv[i + 1]);
        if (access(magic_stop_file, F_OK) != -1) {
            printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
            exit(1);
        }
    }
    if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
        debug_mode = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
        binary = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
        cc = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
        type = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
        strcpy(output_file, argv[i + 1]);
    if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
        window = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
        sample = atof(argv[i + 1]);
    if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
        hs = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
        negative = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
        strcpy(negative_classes_file, argv[i + 1]);
    if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
        nce = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
        num_threads = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
        iter = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
        min_count = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
        classes = atoi(argv[i + 1]);
    if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
        cap = atoi(argv[i + 1]);
    // CBOW-like types (0, 2, 4) default to the higher starting learning rate (cf. -alpha help text)
    if (type == 0 || type == 2 || type == 4)
        alpha = 0.05;
    if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
        alpha = atof(argv[i + 1]);
    vocab = (struct vocab_word *) calloc(vocab_max_size,
            sizeof(struct vocab_word));
    vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
    expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
    // Precompute the sigmoid lookup table over [-MAX_EXP, MAX_EXP]
    for (i = 0; i < EXP_TABLE_SIZE; i++) {
        expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
        expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
    }
    SaveArgs(argc, argv);
    TrainModel();
    return 0;
}