1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
27const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
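// One vocabulary entry: cn is the word count, word the string itself; code,
// codelen and point are filled by CreateBinaryTree and hold the Huffman code
// bits, the code length, and the internal tree nodes on the path from the root
// (used by the hierarchical softmax).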
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
40struct vocab_word *vocab;
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
42 num_threads = 12, min_reduce = 1;
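// type selects the training architecture (see TrainModelThread and the -type
// option in main): 0 = CBOW, 1 = skip-gram (default), 2 = CWindow with a
// concatenated context, 3 = structured skip-gram with position-dependent
// outputs, 4 = senna-style with a non-linear hidden layer.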
43int *vocab_hash;
44long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
45long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
46 classes = 0;
47real alpha = 0.025, starting_alpha, sample = 1e-3;
48real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
49clock_t start;
50
51real *syn1_window, *syn1neg_window, *syn1nce_window;
52int w_offset, window_layer_size;
53
54int window_hidden_size = 500;
55real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
56 *syn_hidden_word_nce;
57
58int hs = 0, negative = 5;
59const int table_size = 1e8;
60int *table;
61
62// contrastive negative sampling
63char negative_classes_file[MAX_STRING];
64int *word_to_group;
65int *group_to_table; //group_size*table_size
66int class_number;
67
68//nce
69real* noise_distribution;
70int nce = 0;
71
72//param caps
73real CAP_VALUE = 50;
74int cap = 0;
75
76void capParam(real* array, int index) {
77 if (array[index] > CAP_VALUE)
78 array[index] = CAP_VALUE;
79 else if (array[index] < -CAP_VALUE)
80 array[index] = -CAP_VALUE;
81}
82
83real hardTanh(real x) {
84 if (x >= 1) {
85 return 1;
86 } else if (x <= -1) {
87 return -1;
88 } else {
89 return x;
90 }
91}
92
93real dHardTanh(real x, real g) {
94 if (x > 1 && g > 0) {
95 return 0;
96 }
97 if (x < -1 && g < 0) {
98 return 0;
99 }
100 return 1;
101}
102
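// Builds the sampling table used for negative sampling: each word w occupies a
// share of the table_size slots proportional to count(w)^0.75, so drawing a
// uniformly random slot samples w with probability count(w)^0.75 / Z. The same
// normalized distribution is stored in noise_distribution for use by NCE.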
103void InitUnigramTable() {
104 int a, i;
105 long long train_words_pow = 0;
106 real d1, power = 0.75;
107 table = (int *) malloc(table_size * sizeof(int));
108 for (a = 0; a < vocab_size; a++)
109 train_words_pow += pow(vocab[a].cn, power);
110 i = 0;
111 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
112 for (a = 0; a < table_size; a++) {
113 table[a] = i;
114 if (a / (real) table_size > d1) {
115 i++;
116 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
117 }
118 if (i >= vocab_size)
119 i = vocab_size - 1;
120 }
121
122 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
123 for (a = 0; a < vocab_size; a++)
124 noise_distribution[a] = pow(vocab[a].cn, power)
125 / (real) train_words_pow;
126}
127
128// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
129void ReadWord(char *word, FILE *fin) {
130 int a = 0, ch;
131 while (!feof(fin)) {
132 ch = fgetc(fin);
133 if (ch == 13)
134 continue;
135 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
136 if (a > 0) {
137 if (ch == '\n')
138 ungetc(ch, fin);
139 break;
140 }
141 if (ch == '\n') {
142 strcpy(word, (char *) "</s>");
143 return;
144 } else
145 continue;
146 }
147 word[a] = ch;
148 a++;
149 if (a >= MAX_STRING - 1)
150 a--; // Truncate words that are too long
151 }
152 word[a] = 0;
153}
154
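// The vocabulary hash table uses open addressing with linear probing:
// GetWordHash gives the initial slot, and SearchVocab / AddWordToVocab step
// forward (wrapping around vocab_hash_size) until the word or an empty slot
// (-1) is found. vocab_hash maps a slot to an index into vocab[].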
155// Returns hash value of a word
156int GetWordHash(char *word) {
157 unsigned long long a, hash = 0;
158 for (a = 0; a < strlen(word); a++)
159 hash = hash * 257 + word[a];
160 hash = hash % vocab_hash_size;
161 return hash;
162}
163
164// Returns position of a word in the vocabulary; if the word is not found, returns -1
165int SearchVocab(char *word) {
166 unsigned int hash = GetWordHash(word);
167 while (1) {
168 if (vocab_hash[hash] == -1)
169 return -1;
170 if (!strcmp(word, vocab[vocab_hash[hash]].word))
171 return vocab_hash[hash];
172 hash = (hash + 1) % vocab_hash_size;
173 }
174 return -1;
175}
176
177// Reads a word and returns its index in the vocabulary
178int ReadWordIndex(FILE *fin) {
179 char word[MAX_STRING];
180 ReadWord(word, fin);
181 if (feof(fin))
182 return -1;
183 return SearchVocab(word);
184}
185
186// Adds a word to the vocabulary
187int AddWordToVocab(char *word) {
188 unsigned int hash, length = strlen(word) + 1;
189 if (length > MAX_STRING)
190 length = MAX_STRING;
191 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
192 strcpy(vocab[vocab_size].word, word);
193 vocab[vocab_size].cn = 0;
194 vocab_size++;
195 // Reallocate memory if needed
196 if (vocab_size + 2 >= vocab_max_size) {
197 vocab_max_size += 1000;
198 vocab = (struct vocab_word *) realloc(vocab,
199 vocab_max_size * sizeof(struct vocab_word));
200 }
201 hash = GetWordHash(word);
202 while (vocab_hash[hash] != -1)
203 hash = (hash + 1) % vocab_hash_size;
204 vocab_hash[hash] = vocab_size - 1;
205 return vocab_size - 1;
206}
207
208// Used later for sorting by word counts
209int VocabCompare(const void *a, const void *b) {
210 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
211}
212
213// Sorts the vocabulary by frequency using word counts
214void SortVocab() {
215 int a, size;
216 unsigned int hash;
217 // Sort the vocabulary and keep </s> at the first position
218 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
219 for (a = 0; a < vocab_hash_size; a++)
220 vocab_hash[a] = -1;
221 size = vocab_size;
222 train_words = 0;
223 for (a = 0; a < size; a++) {
224 // Words occurring less than min_count times will be discarded from the vocab
225 if ((vocab[a].cn < min_count) && (a != 0)) {
226 vocab_size--;
227 free(vocab[a].word);
228 } else {
229 // Hash will be recomputed, as it is no longer valid after the sorting
230 hash = GetWordHash(vocab[a].word);
231 while (vocab_hash[hash] != -1)
232 hash = (hash + 1) % vocab_hash_size;
233 vocab_hash[hash] = a;
234 train_words += vocab[a].cn;
235 }
236 }
237 vocab = (struct vocab_word *) realloc(vocab,
238 (vocab_size + 1) * sizeof(struct vocab_word));
239 // Allocate memory for the binary tree construction
240 for (a = 0; a < vocab_size; a++) {
241 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
242 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
243 }
244}
245
246// Reduces the vocabulary by removing infrequent tokens
247void ReduceVocab() {
248 int a, b = 0;
249 unsigned int hash;
250 for (a = 0; a < vocab_size; a++)
251 if (vocab[a].cn > min_reduce) {
252 vocab[b].cn = vocab[a].cn;
253 vocab[b].word = vocab[a].word;
254 b++;
255 } else
256 free(vocab[a].word);
257 vocab_size = b;
258 for (a = 0; a < vocab_hash_size; a++)
259 vocab_hash[a] = -1;
260 for (a = 0; a < vocab_size; a++) {
261 // Hash will be recomputed, as it is no longer valid
262 hash = GetWordHash(vocab[a].word);
263 while (vocab_hash[hash] != -1)
264 hash = (hash + 1) % vocab_hash_size;
265 vocab_hash[hash] = a;
266 }
267 fflush(stdout);
268 min_reduce++;
269}
270
271// Create binary Huffman tree using the word counts
272// Frequent words will have short unique binary codes
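// Layout used below: count[0..vocab_size-1] are the leaf (word) counts and
// count[vocab_size..2*vocab_size-2] hold the internal nodes created during the
// merge. Because the leaves are sorted by descending frequency, the two
// smallest nodes can always be found with two pointers, pos1 walking down the
// leaves and pos2 walking up the internal nodes. Afterwards vocab[a].code holds
// the 0/1 path bits from the root and vocab[a].point the internal-node indices
// along that path.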
273void CreateBinaryTree() {
274 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
275 char code[MAX_CODE_LENGTH];
276 long long *count = (long long *) calloc(vocab_size * 2 + 1,
277 sizeof(long long));
278 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
279 sizeof(long long));
280 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
281 sizeof(long long));
282 for (a = 0; a < vocab_size; a++)
283 count[a] = vocab[a].cn;
284 for (a = vocab_size; a < vocab_size * 2; a++)
285 count[a] = 1e15;
286 pos1 = vocab_size - 1;
287 pos2 = vocab_size;
288 // The following algorithm constructs the Huffman tree by adding one node at a time
289 for (a = 0; a < vocab_size - 1; a++) {
290 // First, find two smallest nodes 'min1, min2'
291 if (pos1 >= 0) {
292 if (count[pos1] < count[pos2]) {
293 min1i = pos1;
294 pos1--;
295 } else {
296 min1i = pos2;
297 pos2++;
298 }
299 } else {
300 min1i = pos2;
301 pos2++;
302 }
303 if (pos1 >= 0) {
304 if (count[pos1] < count[pos2]) {
305 min2i = pos1;
306 pos1--;
307 } else {
308 min2i = pos2;
309 pos2++;
310 }
311 } else {
312 min2i = pos2;
313 pos2++;
314 }
315 count[vocab_size + a] = count[min1i] + count[min2i];
316 parent_node[min1i] = vocab_size + a;
317 parent_node[min2i] = vocab_size + a;
318 binary[min2i] = 1;
319 }
320 // Now assign binary code to each vocabulary word
321 for (a = 0; a < vocab_size; a++) {
322 b = a;
323 i = 0;
324 while (1) {
325 code[i] = binary[b];
326 point[i] = b;
327 i++;
328 b = parent_node[b];
329 if (b == vocab_size * 2 - 2)
330 break;
331 }
332 vocab[a].codelen = i;
333 vocab[a].point[0] = vocab_size - 2;
334 for (b = 0; b < i; b++) {
335 vocab[a].code[i - b - 1] = code[b];
336 vocab[a].point[i - b] = point[b] - vocab_size;
337 }
338 }
339 free(count);
340 free(binary);
341 free(parent_node);
342}
343
344void LearnVocabFromTrainFile() {
345 char word[MAX_STRING];
346 FILE *fin;
347 long long a, i;
348 for (a = 0; a < vocab_hash_size; a++)
349 vocab_hash[a] = -1;
350 fin = fopen(train_file, "rb");
351 if (fin == NULL) {
352 printf("ERROR: training data file not found!\n");
353 exit(1);
354 }
355 vocab_size = 0;
356 AddWordToVocab((char *) "</s>");
357 while (1) {
358 ReadWord(word, fin);
359 if (feof(fin))
360 break;
361 train_words++;
362 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
363 printf("%lldK%c", train_words / 1000, 13);
364 fflush(stdout);
365 }
366 i = SearchVocab(word);
367 if (i == -1) {
368 a = AddWordToVocab(word);
369 vocab[a].cn = 1;
370 } else
371 vocab[i].cn++;
372 if (vocab_size > vocab_hash_size * 0.7)
373 ReduceVocab();
374 }
375 SortVocab();
376 if (debug_mode > 0) {
377 printf("Vocab size: %lld\n", vocab_size);
378 printf("Words in train file: %lld\n", train_words);
379 }
380 file_size = ftell(fin);
381 fclose(fin);
382}
383
384void SaveVocab() {
385 long long i;
386 FILE *fo = fopen(save_vocab_file, "wb");
387 for (i = 0; i < vocab_size; i++)
388 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
389 fclose(fo);
390}
391
392void ReadVocab() {
393 long long a, i = 0;
394 char c;
395 char word[MAX_STRING];
396 FILE *fin = fopen(read_vocab_file, "rb");
397 if (fin == NULL) {
398 printf("Vocabulary file not found\n");
399 exit(1);
400 }
401 for (a = 0; a < vocab_hash_size; a++)
402 vocab_hash[a] = -1;
403 vocab_size = 0;
404 while (1) {
405 ReadWord(word, fin);
406 if (feof(fin))
407 break;
408 a = AddWordToVocab(word);
409 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
410 i++;
411 }
412 SortVocab();
413 if (debug_mode > 0) {
414 printf("Vocab size: %lld\n", vocab_size);
415 printf("Words in train file: %lld\n", train_words);
416 }
417 fin = fopen(train_file, "rb");
418 if (fin == NULL) {
419 printf("ERROR: training data file not found!\n");
420 exit(1);
421 }
422 fseek(fin, 0, SEEK_END);
423 file_size = ftell(fin);
424 fclose(fin);
425}
426
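// Per-class negative sampling: negative_classes_file appears to contain one
// entry per line with a class label, a word, and a third field that is read
// and ignored. word_to_group maps each vocabulary word to its class, and
// group_to_table holds one count^0.75 sampling table of table_size entries per
// class, so negatives can be drawn from the same class as the current word.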
427void InitClassUnigramTable() {
428 long long a, c;
429 printf("loading class unigrams \n");
430 FILE *fin = fopen(negative_classes_file, "rb");
431 if (fin == NULL) {
432 printf("ERROR: class file not found!\n");
433 exit(1);
434 }
435 word_to_group = (int *) malloc(vocab_size * sizeof(int));
436 for (a = 0; a < vocab_size; a++)
437 word_to_group[a] = -1;
438 char class[MAX_STRING];
439 char prev_class[MAX_STRING];
440 prev_class[0] = 0;
441 char word[MAX_STRING];
442 class_number = -1;
443 while (1) {
444 if (feof(fin))
445 break;
446 ReadWord(class, fin);
447 ReadWord(word, fin);
448 int word_index = SearchVocab(word);
449 if (word_index != -1) {
450 if (strcmp(class, prev_class) != 0) {
451 class_number++;
452 strcpy(prev_class, class);
453 }
454 word_to_group[word_index] = class_number;
455 }
456 ReadWord(word, fin);
457 }
458 class_number++;
459 fclose(fin);
460
461 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
462 long long train_words_pow = 0;
463 real d1, power = 0.75;
464
465 for (c = 0; c < class_number; c++) {
466 long long offset = c * table_size;
467 train_words_pow = 0;
468 for (a = 0; a < vocab_size; a++)
469 if (word_to_group[a] == c)
470 train_words_pow += pow(vocab[a].cn, power);
471 int i = 0;
472 while (i < vocab_size && word_to_group[i] != c)
473 i++;
474 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
475 for (a = 0; a < table_size; a++) {
476 //printf("index %lld , word %d\n", a, i);
477 group_to_table[offset + a] = i;
478 if (a / (real) table_size > d1) {
479 i++;
480 while (i < vocab_size && word_to_group[i] != c)
481 i++;
482 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
483 }
484 if (i >= vocab_size)
485 while (i >= vocab_size || (i >= 0 && word_to_group[i] != c)) // walk back to the last word of class c without reading past the array
486 i--;
487 }
488 }
489}
490
491void SaveNet() {
492 FILE *fnet = fopen(save_net_file, "wb");
493 if (fnet == NULL) {
494 printf("ERROR: net parameter file could not be opened for writing\n");
495 exit(1);
496 }
497 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
498 fwrite(syn_window_hidden, sizeof(real), window_hidden_size * window_layer_size, fnet);
499 fclose(fnet);
500}
501
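// Parameter matrices: syn0 holds the input embeddings (vocab_size x layer1_size).
// Output-side parameters exist per objective (syn1 for hierarchical softmax,
// syn1neg for negative sampling, syn1nce for NCE), each with a *_window variant
// of width window_layer_size = 2 * window * layer1_size for the concatenating
// architectures, plus syn_window_hidden and syn_hidden_word* for the senna-style
// hidden layer. Unless weights are read from read_net_file, syn0 is initialized
// uniformly in roughly [-0.5/layer1_size, 0.5/layer1_size) using a simple
// linear congruential generator.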
502void InitNet() {
503 long long a, b;
504 unsigned long long next_random = 1;
505 window_layer_size = layer1_size * window * 2;
506 a = posix_memalign((void **) &syn0, 128,
507 (long long) vocab_size * layer1_size * sizeof(real));
508 if (syn0 == NULL) {
509 printf("Memory allocation failed\n");
510 exit(1);
511 }
512
513 if (hs) {
514 a = posix_memalign((void **) &syn1, 128,
515 (long long) vocab_size * layer1_size * sizeof(real));
516 if (syn1 == NULL) {
517 printf("Memory allocation failed\n");
518 exit(1);
519 }
520 a = posix_memalign((void **) &syn1_window, 128,
521 (long long) vocab_size * window_layer_size * sizeof(real));
522 if (syn1_window == NULL) {
523 printf("Memory allocation failed\n");
524 exit(1);
525 }
526 a = posix_memalign((void **) &syn_hidden_word, 128,
527 (long long) vocab_size * window_hidden_size * sizeof(real));
528 if (syn_hidden_word == NULL) {
529 printf("Memory allocation failed\n");
530 exit(1);
531 }
532
533 for (a = 0; a < vocab_size; a++)
534 for (b = 0; b < layer1_size; b++)
535 syn1[a * layer1_size + b] = 0;
536 for (a = 0; a < vocab_size; a++)
537 for (b = 0; b < window_layer_size; b++)
538 syn1_window[a * window_layer_size + b] = 0;
539 for (a = 0; a < vocab_size; a++)
540 for (b = 0; b < window_hidden_size; b++)
541 syn_hidden_word[a * window_hidden_size + b] = 0;
542 }
543 if (negative > 0) {
544 a = posix_memalign((void **) &syn1neg, 128,
545 (long long) vocab_size * layer1_size * sizeof(real));
546 if (syn1neg == NULL) {
547 printf("Memory allocation failed\n");
548 exit(1);
549 }
550 a = posix_memalign((void **) &syn1neg_window, 128,
551 (long long) vocab_size * window_layer_size * sizeof(real));
552 if (syn1neg_window == NULL) {
553 printf("Memory allocation failed\n");
554 exit(1);
555 }
556 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
557 (long long) vocab_size * window_hidden_size * sizeof(real));
558 if (syn_hidden_word_neg == NULL) {
559 printf("Memory allocation failed\n");
560 exit(1);
561 }
562
563 for (a = 0; a < vocab_size; a++)
564 for (b = 0; b < layer1_size; b++)
565 syn1neg[a * layer1_size + b] = 0;
566 for (a = 0; a < vocab_size; a++)
567 for (b = 0; b < window_layer_size; b++)
568 syn1neg_window[a * window_layer_size + b] = 0;
569 for (a = 0; a < vocab_size; a++)
570 for (b = 0; b < window_hidden_size; b++)
571 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
572 }
573 if (nce > 0) {
574 a = posix_memalign((void **) &syn1nce, 128,
575 (long long) vocab_size * layer1_size * sizeof(real));
576 if (syn1nce == NULL) {
577 printf("Memory allocation failed\n");
578 exit(1);
579 }
580 a = posix_memalign((void **) &syn1nce_window, 128,
581 (long long) vocab_size * window_layer_size * sizeof(real));
582 if (syn1nce_window == NULL) {
583 printf("Memory allocation failed\n");
584 exit(1);
585 }
586 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
587 (long long) vocab_size * window_hidden_size * sizeof(real));
588 if (syn_hidden_word_nce == NULL) {
589 printf("Memory allocation failed\n");
590 exit(1);
591 }
592
593 for (a = 0; a < vocab_size; a++)
594 for (b = 0; b < layer1_size; b++)
595 syn1nce[a * layer1_size + b] = 0;
596 for (a = 0; a < vocab_size; a++)
597 for (b = 0; b < window_layer_size; b++)
598 syn1nce_window[a * window_layer_size + b] = 0;
599 for (a = 0; a < vocab_size; a++)
600 for (b = 0; b < window_hidden_size; b++)
601 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
602 }
603 if (read_net_file[0] == 0) {
604 for (a = 0; a < vocab_size; a++)
605 for (b = 0; b < layer1_size; b++) {
606 next_random = next_random * (unsigned long long) 25214903917
607 + 11;
608 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
609 / (real) 65536) - 0.5) / layer1_size;
610 }
611
612 a = posix_memalign((void **) &syn_window_hidden, 128,
613 window_hidden_size * window_layer_size * sizeof(real));
614 if (syn_window_hidden == NULL) {
615 printf("Memory allocation failed\n");
616 exit(1);
617 }
618 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
619 next_random = next_random * (unsigned long long) 25214903917 + 11;
620 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
621 - 0.5) / (window_hidden_size * window_layer_size);
622 }
623 }
624 else {
625 FILE *fnet = fopen(read_net_file, "rb");
626 if (fnet == NULL) {
627 printf("Net parameter file not found\n");
628 exit(1);
629 }
630 fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
631 a = posix_memalign((void **) &syn_window_hidden, 128,
632 window_hidden_size * window_layer_size * sizeof(real));
633 if (syn_window_hidden == NULL) {
634 printf("Memory allocation failed\n");
635 exit(1);
636 }
637 fread(syn_window_hidden, sizeof(real), window_hidden_size * window_layer_size, fnet);
638 fclose(fnet);
639 }
640
641 CreateBinaryTree();
642}
643
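// Each training thread works on its own slice of the corpus: thread id seeks to
// byte offset file_size / num_threads * id and ends an iteration after roughly
// train_words / num_threads words. The learning rate alpha decays linearly with
// the global progress counter word_count_actual, but never drops below
// starting_alpha * 1e-4.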
644void *TrainModelThread(void *id) {
645 long long a, b, d, cw, word, last_word, sentence_length = 0,
646 sentence_position = 0;
647 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
648 long long l1, l2, c, target, label, local_iter = iter;
649 unsigned long long next_random = (long long) id;
650 real f, g;
651 clock_t now;
652 int input_len_1 = layer1_size;
653 int window_offset = -1;
654 if (type == 2 || type == 4) {
655 input_len_1 = window_layer_size;
656 }
657 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
658 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
659
660 int input_len_2 = 0;
661 if (type == 4) {
662 input_len_2 = window_hidden_size;
663 }
664 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
665 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
666
667 FILE *fi = fopen(train_file, "rb");
668 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
669 while (1) {
670 if (word_count - last_word_count > 10000) {
671 word_count_actual += word_count - last_word_count;
672 last_word_count = word_count;
673 if ((debug_mode > 1)) {
674 now = clock();
675 printf(
676 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
677 13, alpha,
678 word_count_actual / (real) (iter * train_words + 1)
679 * 100,
680 word_count_actual
681 / ((real) (now - start + 1)
682 / (real) CLOCKS_PER_SEC * 1000));
683 fflush(stdout);
684 }
685 alpha = starting_alpha
686 * (1 - word_count_actual / (real) (iter * train_words + 1));
687 if (alpha < starting_alpha * 0.0001)
688 alpha = starting_alpha * 0.0001;
689 }
690 if (sentence_length == 0) {
691 while (1) {
692 word = ReadWordIndex(fi);
693 if (feof(fi))
694 break;
695 if (word == -1)
696 continue;
697 word_count++;
698 if (word == 0)
699 break;
700 // The subsampling randomly discards frequent words while keeping the ranking the same
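 // With f = cn / train_words and t = sample, the keep probability below
 // works out to ran = sqrt(t / f) + t / f; the word is kept when ran is at
 // least the uniform random number in [0, 1) derived from next_random.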
701 if (sample > 0) {
702 real ran = (sqrt(vocab[word].cn / (sample * train_words))
703 + 1) * (sample * train_words) / vocab[word].cn;
704 next_random = next_random * (unsigned long long) 25214903917
705 + 11;
706 if (ran < (next_random & 0xFFFF) / (real) 65536)
707 continue;
708 }
709 sen[sentence_length] = word;
710 sentence_length++;
711 if (sentence_length >= MAX_SENTENCE_LENGTH)
712 break;
713 }
714 sentence_position = 0;
715 }
716 if (feof(fi) || (word_count > train_words / num_threads)) {
717 word_count_actual += word_count - last_word_count;
718 local_iter--;
719 if (local_iter == 0)
720 break;
721 word_count = 0;
722 last_word_count = 0;
723 sentence_length = 0;
724 fseek(fi, file_size / (long long) num_threads * (long long) id,
725 SEEK_SET);
726 continue;
727 }
728 word = sen[sentence_position];
729 if (word == -1)
730 continue;
731 for (c = 0; c < input_len_1; c++)
732 neu1[c] = 0;
733 for (c = 0; c < input_len_1; c++)
734 neu1e[c] = 0;
735 for (c = 0; c < input_len_2; c++)
736 neu2[c] = 0;
737 for (c = 0; c < input_len_2; c++)
738 neu2e[c] = 0;
739 next_random = next_random * (unsigned long long) 25214903917 + 11;
740 b = next_random % window;
741 if (type == 0) { //train the cbow architecture
742 // in -> hidden
743 cw = 0;
744 for (a = b; a < window * 2 + 1 - b; a++)
745 if (a != window) {
746 c = sentence_position - window + a;
747 if (c < 0)
748 continue;
749 if (c >= sentence_length)
750 continue;
751 last_word = sen[c];
752 if (last_word == -1)
753 continue;
754 for (c = 0; c < layer1_size; c++)
755 neu1[c] += syn0[c + last_word * layer1_size];
756 cw++;
757 }
758 if (cw) {
759 for (c = 0; c < layer1_size; c++)
760 neu1[c] /= cw;
761 if (hs)
762 for (d = 0; d < vocab[word].codelen; d++) {
763 f = 0;
764 l2 = vocab[word].point[d] * layer1_size;
765 // Propagate hidden -> output
766 for (c = 0; c < layer1_size; c++)
767 f += neu1[c] * syn1[c + l2];
768 if (f <= -MAX_EXP)
769 continue;
770 else if (f >= MAX_EXP)
771 continue;
772 else
773 f = expTable[(int) ((f + MAX_EXP)
774 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
775 // 'g' is the gradient multiplied by the learning rate
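 // The target for the d-th tree node is (1 - code[d]) and f is the sigmoid
 // of the inner product (read from expTable), so g is (target - f) * alpha.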
776 g = (1 - vocab[word].code[d] - f) * alpha;
777 // Propagate errors output -> hidden
778 for (c = 0; c < layer1_size; c++)
779 neu1e[c] += g * syn1[c + l2];
780 // Learn weights hidden -> output
781 for (c = 0; c < layer1_size; c++)
782 syn1[c + l2] += g * neu1[c];
783 if (cap == 1)
784 for (c = 0; c < layer1_size; c++)
785 capParam(syn1, c + l2);
786 }
787 // NEGATIVE SAMPLING
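 // d == 0 is the positive example (label 1); the remaining `negative` draws
 // come from the unigram table (or from the class-specific table when
 // word_to_group is set), skipping draws that equal the current word. The
 // sigmoid is taken from expTable, with saturation outside [-MAX_EXP, MAX_EXP].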
788 if (negative > 0)
789 for (d = 0; d < negative + 1; d++) {
790 if (d == 0) {
791 target = word;
792 label = 1;
793 } else {
794 next_random = next_random
795 * (unsigned long long) 25214903917 + 11;
796 if (word_to_group != NULL
797 && word_to_group[word] != -1) {
798 target = word;
799 while (target == word) {
800 target = group_to_table[word_to_group[word]
801 * table_size
802 + (next_random >> 16) % table_size];
803 next_random = next_random
804 * (unsigned long long) 25214903917
805 + 11;
806 }
807 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
808 } else {
809 target =
810 table[(next_random >> 16) % table_size];
811 }
812 if (target == 0)
813 target = next_random % (vocab_size - 1) + 1;
814 if (target == word)
815 continue;
816 label = 0;
817 }
818 l2 = target * layer1_size;
819 f = 0;
820 for (c = 0; c < layer1_size; c++)
821 f += neu1[c] * syn1neg[c + l2];
822 if (f > MAX_EXP)
823 g = (label - 1) * alpha;
824 else if (f < -MAX_EXP)
825 g = (label - 0) * alpha;
826 else
827 g = (label
828 - expTable[(int) ((f + MAX_EXP)
829 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
830 * alpha;
831 for (c = 0; c < layer1_size; c++)
832 neu1e[c] += g * syn1neg[c + l2];
833 for (c = 0; c < layer1_size; c++)
834 syn1neg[c + l2] += g * neu1[c];
835 if (cap == 1)
836 for (c = 0; c < layer1_size; c++)
837 capParam(syn1neg, c + l2);
838 }
839 // Noise Contrastive Estimation
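 // NCE gradient: with k = nce noise samples and Pn = noise_distribution, the
 // update below uses g = (label - f / (k * Pn(target) + f)) * alpha, where
 // f = exp(score) inside the non-saturated range.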
840 if (nce > 0)
841 for (d = 0; d < nce + 1; d++) {
842 if (d == 0) {
843 target = word;
844 label = 1;
845 } else {
846 next_random = next_random
847 * (unsigned long long) 25214903917 + 11;
848 if (word_to_group != NULL
849 && word_to_group[word] != -1) {
850 target = word;
851 while (target == word) {
852 target = group_to_table[word_to_group[word]
853 * table_size
854 + (next_random >> 16) % table_size];
855 next_random = next_random
856 * (unsigned long long) 25214903917
857 + 11;
858 }
859 } else {
860 target =
861 table[(next_random >> 16) % table_size];
862 }
863 if (target == 0)
864 target = next_random % (vocab_size - 1) + 1;
865 if (target == word)
866 continue;
867 label = 0;
868 }
869 l2 = target * layer1_size;
870 f = 0;
871
872 for (c = 0; c < layer1_size; c++)
873 f += neu1[c] * syn1nce[c + l2];
874 if (f > MAX_EXP)
875 g = (label - 1) * alpha;
876 else if (f < -MAX_EXP)
877 g = (label - 0) * alpha;
878 else {
879 f = exp(f);
880 g =
881 (label
882 - f
883 / (noise_distribution[target]
884 * nce + f)) * alpha;
885 }
886 for (c = 0; c < layer1_size; c++)
887 neu1e[c] += g * syn1nce[c + l2];
888 for (c = 0; c < layer1_size; c++)
889 syn1nce[c + l2] += g * neu1[c];
890 if (cap == 1)
891 for (c = 0; c < layer1_size; c++)
892 capParam(syn1nce, c + l2);
893 }
894 // hidden -> in
895 for (a = b; a < window * 2 + 1 - b; a++)
896 if (a != window) {
897 c = sentence_position - window + a;
898 if (c < 0)
899 continue;
900 if (c >= sentence_length)
901 continue;
902 last_word = sen[c];
903 if (last_word == -1)
904 continue;
905 for (c = 0; c < layer1_size; c++)
906 syn0[c + last_word * layer1_size] += neu1e[c];
907 }
908 }
909 } else if (type == 1) { //train skip-gram
910 for (a = b; a < window * 2 + 1 - b; a++)
911 if (a != window) {
912 c = sentence_position - window + a;
913 if (c < 0)
914 continue;
915 if (c >= sentence_length)
916 continue;
917 last_word = sen[c];
918 if (last_word == -1)
919 continue;
920 l1 = last_word * layer1_size;
921 for (c = 0; c < layer1_size; c++)
922 neu1e[c] = 0;
923 // HIERARCHICAL SOFTMAX
924 if (hs)
925 for (d = 0; d < vocab[word].codelen; d++) {
926 f = 0;
927 l2 = vocab[word].point[d] * layer1_size;
928 // Propagate hidden -> output
929 for (c = 0; c < layer1_size; c++)
930 f += syn0[c + l1] * syn1[c + l2];
931 if (f <= -MAX_EXP)
932 continue;
933 else if (f >= MAX_EXP)
934 continue;
935 else
936 f = expTable[(int) ((f + MAX_EXP)
937 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
938 // 'g' is the gradient multiplied by the learning rate
939 g = (1 - vocab[word].code[d] - f) * alpha;
940 // Propagate errors output -> hidden
941 for (c = 0; c < layer1_size; c++)
942 neu1e[c] += g * syn1[c + l2];
943 // Learn weights hidden -> output
944 for (c = 0; c < layer1_size; c++)
945 syn1[c + l2] += g * syn0[c + l1];
946 if (cap == 1)
947 for (c = 0; c < layer1_size; c++)
948 capParam(syn1, c + l2);
949 }
950 // NEGATIVE SAMPLING
951 if (negative > 0)
952 for (d = 0; d < negative + 1; d++) {
953 if (d == 0) {
954 target = word;
955 label = 1;
956 } else {
957 next_random = next_random
958 * (unsigned long long) 25214903917 + 11;
959 if (word_to_group != NULL
960 && word_to_group[word] != -1) {
961 target = word;
962 while (target == word) {
963 target =
964 group_to_table[word_to_group[word]
965 * table_size
966 + (next_random >> 16)
967 % table_size];
968 next_random =
969 next_random
970 * (unsigned long long) 25214903917
971 + 11;
972 }
973 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
974 } else {
975 target = table[(next_random >> 16)
976 % table_size];
977 }
978 if (target == 0)
979 target = next_random % (vocab_size - 1) + 1;
980 if (target == word)
981 continue;
982 label = 0;
983 }
984 l2 = target * layer1_size;
985 f = 0;
986 for (c = 0; c < layer1_size; c++)
987 f += syn0[c + l1] * syn1neg[c + l2];
988 if (f > MAX_EXP)
989 g = (label - 1) * alpha;
990 else if (f < -MAX_EXP)
991 g = (label - 0) * alpha;
992 else
993 g =
994 (label
995 - expTable[(int) ((f + MAX_EXP)
996 * (EXP_TABLE_SIZE
997 / MAX_EXP / 2))])
998 * alpha;
999 for (c = 0; c < layer1_size; c++)
1000 neu1e[c] += g * syn1neg[c + l2];
1001 for (c = 0; c < layer1_size; c++)
1002 syn1neg[c + l2] += g * syn0[c + l1];
1003 if (cap == 1)
1004 for (c = 0; c < layer1_size; c++)
1005 capParam(syn1neg, c + l2);
1006 }
1007 //Noise Contrastive Estimation
1008 if (nce > 0)
1009 for (d = 0; d < nce + 1; d++) {
1010 if (d == 0) {
1011 target = word;
1012 label = 1;
1013 } else {
1014 next_random = next_random
1015 * (unsigned long long) 25214903917 + 11;
1016 if (word_to_group != NULL
1017 && word_to_group[word] != -1) {
1018 target = word;
1019 while (target == word) {
1020 target =
1021 group_to_table[word_to_group[word]
1022 * table_size
1023 + (next_random >> 16)
1024 % table_size];
1025 next_random =
1026 next_random
1027 * (unsigned long long) 25214903917
1028 + 11;
1029 }
1030 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1031 } else {
1032 target = table[(next_random >> 16)
1033 % table_size];
1034 }
1035 if (target == 0)
1036 target = next_random % (vocab_size - 1) + 1;
1037 if (target == word)
1038 continue;
1039 label = 0;
1040 }
1041 l2 = target * layer1_size;
1042 f = 0;
1043 for (c = 0; c < layer1_size; c++)
1044 f += syn0[c + l1] * syn1nce[c + l2];
1045 if (f > MAX_EXP)
1046 g = (label - 1) * alpha;
1047 else if (f < -MAX_EXP)
1048 g = (label - 0) * alpha;
1049 else {
1050 f = exp(f);
1051 g = (label
1052 - f
1053 / (noise_distribution[target]
1054 * nce + f)) * alpha;
1055 }
1056 for (c = 0; c < layer1_size; c++)
1057 neu1e[c] += g * syn1nce[c + l2];
1058 for (c = 0; c < layer1_size; c++)
1059 syn1nce[c + l2] += g * syn0[c + l1];
1060 if (cap == 1)
1061 for (c = 0; c < layer1_size; c++)
1062 capParam(syn1nce, c + l2);
1063 }
1064 // Learn weights input -> hidden
1065 for (c = 0; c < layer1_size; c++)
1066 syn0[c + l1] += neu1e[c];
1067 }
1068 } else if (type == 2) { //train the cwindow architecture
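 // CWindow: the context embeddings are concatenated rather than averaged.
 // neu1 has window_layer_size = 2 * window * layer1_size entries; window
 // position a writes to offset a * layer1_size, shifted down by one slot for
 // positions right of the center word (the center itself gets no slot).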
1069 // in -> hidden
1070 cw = 0;
1071 for (a = 0; a < window * 2 + 1; a++)
1072 if (a != window) {
1073 c = sentence_position - window + a;
1074 if (c < 0)
1075 continue;
1076 if (c >= sentence_length)
1077 continue;
1078 last_word = sen[c];
1079 if (last_word == -1)
1080 continue;
1081 window_offset = a * layer1_size;
1082 if (a > window)
1083 window_offset -= layer1_size;
1084 for (c = 0; c < layer1_size; c++)
1085 neu1[c + window_offset] += syn0[c
1086 + last_word * layer1_size];
1087 cw++;
1088 }
1089 if (cw) {
1090 if (hs)
1091 for (d = 0; d < vocab[word].codelen; d++) {
1092 f = 0;
1093 l2 = vocab[word].point[d] * window_layer_size;
1094 // Propagate hidden -> output
1095 for (c = 0; c < window_layer_size; c++)
1096 f += neu1[c] * syn1_window[c + l2];
1097 if (f <= -MAX_EXP)
1098 continue;
1099 else if (f >= MAX_EXP)
1100 continue;
1101 else
1102 f = expTable[(int) ((f + MAX_EXP)
1103 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1104 // 'g' is the gradient multiplied by the learning rate
1105 g = (1 - vocab[word].code[d] - f) * alpha;
1106 // Propagate errors output -> hidden
1107 for (c = 0; c < window_layer_size; c++)
1108 neu1e[c] += g * syn1_window[c + l2];
1109 // Learn weights hidden -> output
1110 for (c = 0; c < window_layer_size; c++)
1111 syn1_window[c + l2] += g * neu1[c];
1112 if (cap == 1)
1113 for (c = 0; c < window_layer_size; c++)
1114 capParam(syn1_window, c + l2);
1115 }
1116 // NEGATIVE SAMPLING
1117 if (negative > 0)
1118 for (d = 0; d < negative + 1; d++) {
1119 if (d == 0) {
1120 target = word;
1121 label = 1;
1122 } else {
1123 next_random = next_random
1124 * (unsigned long long) 25214903917 + 11;
1125 if (word_to_group != NULL
1126 && word_to_group[word] != -1) {
1127 target = word;
1128 while (target == word) {
1129 target = group_to_table[word_to_group[word]
1130 * table_size
1131 + (next_random >> 16) % table_size];
1132 next_random = next_random
1133 * (unsigned long long) 25214903917
1134 + 11;
1135 }
1136 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1137 } else {
1138 target =
1139 table[(next_random >> 16) % table_size];
1140 }
1141 if (target == 0)
1142 target = next_random % (vocab_size - 1) + 1;
1143 if (target == word)
1144 continue;
1145 label = 0;
1146 }
1147 l2 = target * window_layer_size;
1148 f = 0;
1149 for (c = 0; c < window_layer_size; c++)
1150 f += neu1[c] * syn1neg_window[c + l2];
1151 if (f > MAX_EXP)
1152 g = (label - 1) * alpha;
1153 else if (f < -MAX_EXP)
1154 g = (label - 0) * alpha;
1155 else
1156 g = (label
1157 - expTable[(int) ((f + MAX_EXP)
1158 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1159 * alpha;
1160 for (c = 0; c < window_layer_size; c++)
1161 neu1e[c] += g * syn1neg_window[c + l2];
1162 for (c = 0; c < window_layer_size; c++)
1163 syn1neg_window[c + l2] += g * neu1[c];
1164 if (cap == 1)
1165 for (c = 0; c < window_layer_size; c++)
1166 capParam(syn1neg_window, c + l2);
1167 }
1168 // Noise Contrastive Estimation
1169 if (nce > 0)
1170 for (d = 0; d < nce + 1; d++) {
1171 if (d == 0) {
1172 target = word;
1173 label = 1;
1174 } else {
1175 next_random = next_random
1176 * (unsigned long long) 25214903917 + 11;
1177 if (word_to_group != NULL
1178 && word_to_group[word] != -1) {
1179 target = word;
1180 while (target == word) {
1181 target = group_to_table[word_to_group[word]
1182 * table_size
1183 + (next_random >> 16) % table_size];
1184 next_random = next_random
1185 * (unsigned long long) 25214903917
1186 + 11;
1187 }
1188 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1189 } else {
1190 target =
1191 table[(next_random >> 16) % table_size];
1192 }
1193 if (target == 0)
1194 target = next_random % (vocab_size - 1) + 1;
1195 if (target == word)
1196 continue;
1197 label = 0;
1198 }
1199 l2 = target * window_layer_size;
1200 f = 0;
1201 for (c = 0; c < window_layer_size; c++)
1202 f += neu1[c] * syn1nce_window[c + l2];
1203 if (f > MAX_EXP)
1204 g = (label - 1) * alpha;
1205 else if (f < -MAX_EXP)
1206 g = (label - 0) * alpha;
1207 else {
1208 f = exp(f);
1209 g =
1210 (label
1211 - f
1212 / (noise_distribution[target]
1213 * nce + f)) * alpha;
1214 }
1215 for (c = 0; c < window_layer_size; c++)
1216 neu1e[c] += g * syn1nce_window[c + l2];
1217 for (c = 0; c < window_layer_size; c++)
1218 syn1nce_window[c + l2] += g * neu1[c];
1219 if (cap == 1)
1220 for (c = 0; c < window_layer_size; c++)
1221 capParam(syn1nce_window, c + l2);
1222 }
1223 // hidden -> in
1224 for (a = 0; a < window * 2 + 1; a++)
1225 if (a != window) {
1226 c = sentence_position - window + a;
1227 if (c < 0)
1228 continue;
1229 if (c >= sentence_length)
1230 continue;
1231 last_word = sen[c];
1232 if (last_word == -1)
1233 continue;
1234 window_offset = a * layer1_size;
1235 if (a > window)
1236 window_offset -= layer1_size;
1237 for (c = 0; c < layer1_size; c++)
1238 syn0[c + last_word * layer1_size] += neu1e[c
1239 + window_offset];
1240 }
1241 }
1242 } else if (type == 3) { //train structured skip-gram
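 // Structured skip-gram: like skip-gram, but the output parameters are
 // position dependent. Each context position selects its own layer1_size-wide
 // slice of the *_window matrices via window_offset, so the model learns where
 // a context word occurs relative to the center word, not just that it occurs.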
1243 for (a = 0; a < window * 2 + 1; a++)
1244 if (a != window) {
1245 c = sentence_position - window + a;
1246 if (c < 0)
1247 continue;
1248 if (c >= sentence_length)
1249 continue;
1250 last_word = sen[c];
1251 if (last_word == -1)
1252 continue;
1253 l1 = last_word * layer1_size;
1254 window_offset = a * layer1_size;
1255 if (a > window)
1256 window_offset -= layer1_size;
1257 for (c = 0; c < layer1_size; c++)
1258 neu1e[c] = 0;
1259 // HIERARCHICAL SOFTMAX
1260 if (hs)
1261 for (d = 0; d < vocab[word].codelen; d++) {
1262 f = 0;
1263 l2 = vocab[word].point[d] * window_layer_size;
1264 // Propagate hidden -> output
1265 for (c = 0; c < layer1_size; c++)
1266 f += syn0[c + l1]
1267 * syn1_window[c + l2 + window_offset];
1268 if (f <= -MAX_EXP)
1269 continue;
1270 else if (f >= MAX_EXP)
1271 continue;
1272 else
1273 f = expTable[(int) ((f + MAX_EXP)
1274 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1275 // 'g' is the gradient multiplied by the learning rate
1276 g = (1 - vocab[word].code[d] - f) * alpha;
1277 // Propagate errors output -> hidden
1278 for (c = 0; c < layer1_size; c++)
1279 neu1e[c] += g
1280 * syn1_window[c + l2 + window_offset];
1281 // Learn weights hidden -> output
1282 for (c = 0; c < layer1_size; c++)
1283 syn1_window[c + l2 + window_offset] += g
1284 * syn0[c + l1];
1285 if (cap == 1)
1286 for (c = 0; c < layer1_size; c++)
1287 capParam(syn1_window, c + l2 + window_offset);
1288 }
1289 // NEGATIVE SAMPLING
1290 if (negative > 0)
1291 for (d = 0; d < negative + 1; d++) {
1292 if (d == 0) {
1293 target = word;
1294 label = 1;
1295 } else {
1296 next_random = next_random
1297 * (unsigned long long) 25214903917 + 11;
1298 if (word_to_group != NULL
1299 && word_to_group[word] != -1) {
1300 target = word;
1301 while (target == word) {
1302 target =
1303 group_to_table[word_to_group[word]
1304 * table_size
1305 + (next_random >> 16)
1306 % table_size];
1307 next_random =
1308 next_random
1309 * (unsigned long long) 25214903917
1310 + 11;
1311 }
1312 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1313 } else {
1314 target = table[(next_random >> 16)
1315 % table_size];
1316 }
1317 if (target == 0)
1318 target = next_random % (vocab_size - 1) + 1;
1319 if (target == word)
1320 continue;
1321 label = 0;
1322 }
1323 l2 = target * window_layer_size;
1324 f = 0;
1325 for (c = 0; c < layer1_size; c++)
1326 f +=
1327 syn0[c + l1]
1328 * syn1neg_window[c + l2
1329 + window_offset];
1330 if (f > MAX_EXP)
1331 g = (label - 1) * alpha;
1332 else if (f < -MAX_EXP)
1333 g = (label - 0) * alpha;
1334 else
1335 g =
1336 (label
1337 - expTable[(int) ((f + MAX_EXP)
1338 * (EXP_TABLE_SIZE
1339 / MAX_EXP / 2))])
1340 * alpha;
1341 for (c = 0; c < layer1_size; c++)
1342 neu1e[c] +=
1343 g
1344 * syn1neg_window[c + l2
1345 + window_offset];
1346 for (c = 0; c < layer1_size; c++)
1347 syn1neg_window[c + l2 + window_offset] += g
1348 * syn0[c + l1];
1349 if (cap == 1)
1350 for (c = 0; c < layer1_size; c++)
1351 capParam(syn1neg_window,
1352 c + l2 + window_offset);
1353 }
1354 // Noise Contrastive Estimation
1355 if (nce > 0)
1356 for (d = 0; d < nce + 1; d++) {
1357 if (d == 0) {
1358 target = word;
1359 label = 1;
1360 } else {
1361 next_random = next_random
1362 * (unsigned long long) 25214903917 + 11;
1363 if (word_to_group != NULL
1364 && word_to_group[word] != -1) {
1365 target = word;
1366 while (target == word) {
1367 target =
1368 group_to_table[word_to_group[word]
1369 * table_size
1370 + (next_random >> 16)
1371 % table_size];
1372 next_random =
1373 next_random
1374 * (unsigned long long) 25214903917
1375 + 11;
1376 }
1377 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1378 } else {
1379 target = table[(next_random >> 16)
1380 % table_size];
1381 }
1382 if (target == 0)
1383 target = next_random % (vocab_size - 1) + 1;
1384 if (target == word)
1385 continue;
1386 label = 0;
1387 }
1388 l2 = target * window_layer_size;
1389 f = 0;
1390 for (c = 0; c < layer1_size; c++)
1391 f +=
1392 syn0[c + l1]
1393 * syn1nce_window[c + l2
1394 + window_offset];
1395 if (f > MAX_EXP)
1396 g = (label - 1) * alpha;
1397 else if (f < -MAX_EXP)
1398 g = (label - 0) * alpha;
1399 else {
1400 f = exp(f);
1401 g = (label
1402 - f
1403 / (noise_distribution[target]
1404 * nce + f)) * alpha;
1405 }
1406 for (c = 0; c < layer1_size; c++)
1407 neu1e[c] +=
1408 g
1409 * syn1nce_window[c + l2
1410 + window_offset];
1411 for (c = 0; c < layer1_size; c++)
1412 syn1nce_window[c + l2 + window_offset] += g
1413 * syn0[c + l1];
1414 if (cap == 1)
1415 for (c = 0; c < layer1_size; c++)
1416 capParam(syn1nce_window,
1417 c + l2 + window_offset);
1418 }
1419 // Learn weights input -> hidden
1420 for (c = 0; c < layer1_size; c++) {
1421 syn0[c + l1] += neu1e[c];
1422 if (syn0[c + l1] > 50)
1423 syn0[c + l1] = 50;
1424 if (syn0[c + l1] < -50)
1425 syn0[c + l1] = -50;
1426 }
1427 }
1428 } else if (type == 4) { // train the senna-style architecture
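 // Senna-style architecture: the concatenated context (neu1, as in CWindow)
 // is fed through a hidden layer of window_hidden_size units
 // (syn_window_hidden) with a hardTanh non-linearity, and the word scores are
 // taken against syn_hidden_word*; gradients flow back through dHardTanh into
 // both the hidden weights and the input embeddings.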
1429 // in -> hidden
1430 cw = 0;
1431 for (a = 0; a < window * 2 + 1; a++)
1432 if (a != window) {
1433 c = sentence_position - window + a;
1434 if (c < 0)
1435 continue;
1436 if (c >= sentence_length)
1437 continue;
1438 last_word = sen[c];
1439 if (last_word == -1)
1440 continue;
1441 window_offset = a * layer1_size;
1442 if (a > window)
1443 window_offset -= layer1_size;
1444 for (c = 0; c < layer1_size; c++)
1445 neu1[c + window_offset] += syn0[c
1446 + last_word * layer1_size];
1447 cw++;
1448 }
1449 if (cw) {
1450 for (a = 0; a < window_hidden_size; a++) {
1451 c = a * window_layer_size;
1452 for (b = 0; b < window_layer_size; b++) {
1453 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1454 }
1455 }
1456 if (hs)
1457 for (d = 0; d < vocab[word].codelen; d++) {
1458 f = 0;
1459 l2 = vocab[word].point[d] * window_hidden_size;
1460 // Propagate hidden -> output
1461 for (c = 0; c < window_hidden_size; c++)
1462 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1463 if (f <= -MAX_EXP)
1464 continue;
1465 else if (f >= MAX_EXP)
1466 continue;
1467 else
1468 f = expTable[(int) ((f + MAX_EXP)
1469 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1470 // 'g' is the gradient multiplied by the learning rate
1471 g = (1 - vocab[word].code[d] - f) * alpha;
1472 // Propagate errors output -> hidden
1473 for (c = 0; c < window_hidden_size; c++)
1474 neu2e[c] += dHardTanh(neu2[c], g) * g
1475 * syn_hidden_word[c + l2];
1476 // Learn weights hidden -> output
1477 for (c = 0; c < window_hidden_size; c++)
1478 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1479 * neu2[c];
1480 }
1481 // NEGATIVE SAMPLING
1482 if (negative > 0)
1483 for (d = 0; d < negative + 1; d++) {
1484 if (d == 0) {
1485 target = word;
1486 label = 1;
1487 } else {
1488 next_random = next_random
1489 * (unsigned long long) 25214903917 + 11;
1490 if (word_to_group != NULL
1491 && word_to_group[word] != -1) {
1492 target = word;
1493 while (target == word) {
1494 target = group_to_table[word_to_group[word]
1495 * table_size
1496 + (next_random >> 16) % table_size];
1497 next_random = next_random
1498 * (unsigned long long) 25214903917
1499 + 11;
1500 }
1501 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1502 } else {
1503 target =
1504 table[(next_random >> 16) % table_size];
1505 }
1506 if (target == 0)
1507 target = next_random % (vocab_size - 1) + 1;
1508 if (target == word)
1509 continue;
1510 label = 0;
1511 }
1512 l2 = target * window_hidden_size;
1513 f = 0;
1514 for (c = 0; c < window_hidden_size; c++)
1515 f += hardTanh(neu2[c])
1516 * syn_hidden_word_neg[c + l2];
1517 if (f > MAX_EXP)
1518 g = (label - 1) * alpha / negative;
1519 else if (f < -MAX_EXP)
1520 g = (label - 0) * alpha / negative;
1521 else
1522 g = (label
1523 - expTable[(int) ((f + MAX_EXP)
1524 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1525 * alpha / negative;
1526 for (c = 0; c < window_hidden_size; c++)
1527 neu2e[c] += dHardTanh(neu2[c], g) * g
1528 * syn_hidden_word_neg[c + l2];
1529 for (c = 0; c < window_hidden_size; c++)
1530 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1531 * g * neu2[c];
1532 }
1533 for (a = 0; a < window_hidden_size; a++)
1534 for (b = 0; b < window_layer_size; b++)
1535 neu1e[b] += neu2e[a]
1536 * syn_window_hidden[a * window_layer_size + b];
1537 for (a = 0; a < window_hidden_size; a++)
1538 for (b = 0; b < window_layer_size; b++)
1539 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1540 * neu1[b];
1541 // hidden -> in
1542 for (a = 0; a < window * 2 + 1; a++)
1543 if (a != window) {
1544 c = sentence_position - window + a;
1545 if (c < 0)
1546 continue;
1547 if (c >= sentence_length)
1548 continue;
1549 last_word = sen[c];
1550 if (last_word == -1)
1551 continue;
1552 window_offset = a * layer1_size;
1553 if (a > window)
1554 window_offset -= layer1_size;
1555 for (c = 0; c < layer1_size; c++)
1556 syn0[c + last_word * layer1_size] += neu1e[c
1557 + window_offset];
1558 }
1559 }
1560 } else {
1561 printf("unknown type %i", type);
1562 exit(0);
1563 }
1564 sentence_position++;
1565 if (sentence_position >= sentence_length) {
1566 sentence_length = 0;
1567 continue;
1568 }
1569 }
1570 fclose(fi);
1571 free(neu1);
1572 free(neu1e);
1573 pthread_exit(NULL);
1574}
1575
1576void TrainModel() {
1577 long a, b, c, d;
1578 FILE *fo;
1579 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1580 printf("Starting training using file %s\n", train_file);
1581 starting_alpha = alpha;
1582 if (read_vocab_file[0] != 0)
1583 ReadVocab();
1584 else
1585 LearnVocabFromTrainFile();
1586 if (save_vocab_file[0] != 0)
1587 SaveVocab();
1588 if (output_file[0] == 0)
1589 return;
1590 InitNet();
1591 if (negative > 0 || nce > 0)
1592 InitUnigramTable();
1593 if (negative_classes_file[0] != 0)
1594 InitClassUnigramTable();
1595 start = clock();
1596 for (a = 0; a < num_threads; a++)
1597 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1598 for (a = 0; a < num_threads; a++)
1599 pthread_join(pt[a], NULL);
1600 fo = fopen(output_file, "wb");
1601 if (classes == 0) {
1602 // Save the word vectors
1603 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1604 for (a = 0; a < vocab_size; a++) {
1605 fprintf(fo, "%s ", vocab[a].word);
1606 if (binary)
1607 for (b = 0; b < layer1_size; b++)
1608 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1609 else
1610 for (b = 0; b < layer1_size; b++)
1611 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1612 fprintf(fo, "\n");
1613 }
1614 } else {
1615 // Run K-means on the word vectors
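 // K-means over the learned word vectors: 10 iterations; centroids are
 // averaged, length-normalized, and each word is reassigned to the centroid
 // with the largest dot product (cosine similarity). Only the class id per
 // word is written to the output file.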
1616 int clcn = classes, iter = 10, closeid;
1617 int *centcn = (int *) malloc(classes * sizeof(int));
1618 int *cl = (int *) calloc(vocab_size, sizeof(int));
1619 real closev, x;
1620 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1621 for (a = 0; a < vocab_size; a++)
1622 cl[a] = a % clcn;
1623 for (a = 0; a < iter; a++) {
1624 for (b = 0; b < clcn * layer1_size; b++)
1625 cent[b] = 0;
1626 for (b = 0; b < clcn; b++)
1627 centcn[b] = 1;
1628 for (c = 0; c < vocab_size; c++) {
1629 for (d = 0; d < layer1_size; d++)
1630 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1631 centcn[cl[c]]++;
1632 }
1633 for (b = 0; b < clcn; b++) {
1634 closev = 0;
1635 for (c = 0; c < layer1_size; c++) {
1636 cent[layer1_size * b + c] /= centcn[b];
1637 closev += cent[layer1_size * b + c]
1638 * cent[layer1_size * b + c];
1639 }
1640 closev = sqrt(closev);
1641 for (c = 0; c < layer1_size; c++)
1642 cent[layer1_size * b + c] /= closev;
1643 }
1644 for (c = 0; c < vocab_size; c++) {
1645 closev = -10;
1646 closeid = 0;
1647 for (d = 0; d < clcn; d++) {
1648 x = 0;
1649 for (b = 0; b < layer1_size; b++)
1650 x += cent[layer1_size * d + b]
1651 * syn0[c * layer1_size + b];
1652 if (x > closev) {
1653 closev = x;
1654 closeid = d;
1655 }
1656 }
1657 cl[c] = closeid;
1658 }
1659 }
1660 // Save the K-means classes
1661 for (a = 0; a < vocab_size; a++)
1662 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1663 free(centcn);
1664 free(cent);
1665 free(cl);
1666 }
1667 fclose(fo);
1668 if (save_net_file[0] != 0)
1669 SaveNet();
1670}
1671
1672int ArgPos(char *str, int argc, char **argv) {
1673 int a;
1674 for (a = 1; a < argc; a++)
1675 if (!strcmp(str, argv[a])) {
1676 if (a == argc - 1) {
1677 printf("Argument missing for %s\n", str);
1678 exit(1);
1679 }
1680 return a;
1681 }
1682 return -1;
1683}
1684
1685int main(int argc, char **argv) {
1686 int i;
1687 if (argc == 1) {
1688 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1689 printf("Options:\n");
1690 printf("Parameters for training:\n");
1691 printf("\t-train <file>\n");
1692 printf("\t\tUse text data from <file> to train the model\n");
1693 printf("\t-output <file>\n");
1694 printf(
1695 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1696 printf("\t-size <int>\n");
1697 printf("\t\tSet size of word vectors; default is 100\n");
1698 printf("\t-window <int>\n");
1699 printf("\t\tSet max skip length between words; default is 5\n");
1700 printf("\t-sample <float>\n");
1701 printf(
1702 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1703 printf(
1704 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1705 printf("\t-hs <int>\n");
1706 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1707 printf("\t-negative <int>\n");
1708 printf(
1709 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1710 printf("\t-negative-classes <file>\n");
1711 printf("\t\tNegative classes to sample from\n");
1712 printf("\t-nce <int>\n");
1713 printf(
1714 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1715 printf("\t-threads <int>\n");
1716 printf("\t\tUse <int> threads (default 12)\n");
1717 printf("\t-iter <int>\n");
1718 printf("\t\tRun more training iterations (default 5)\n");
1719 printf("\t-min-count <int>\n");
1720 printf(
1721 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1722 printf("\t-alpha <float>\n");
1723 printf(
1724 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1725 printf("\t-classes <int>\n");
1726 printf(
1727 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1728 printf("\t-debug <int>\n");
1729 printf(
1730 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1731 printf("\t-binary <int>\n");
1732 printf(
1733 "\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1734 printf("\t-save-vocab <file>\n");
1735 printf("\t\tThe vocabulary will be saved to <file>\n");
1736 printf("\t-read-vocab <file>\n");
1737 printf(
1738 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1739 printf("\t-read-net <file>\n");
1740 printf(
1741 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1742 printf("\t-save-net <file>\n");
1743 printf("\t\tThe net parameters will be saved to <file>\n");
1744 printf("\t-type <int>\n");
1745 printf(
1746 "\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
1747 printf("\t-cap <int>\n");
1748 printf(
1749 "\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1750 printf("\nExamples:\n");
1751 printf(
1752 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1753 return 0;
1754 }
1755 output_file[0] = 0;
1756 save_vocab_file[0] = 0;
1757 read_vocab_file[0] = 0;
1758 save_net_file[0] = 0;
1759 read_net_file[0] = 0;
1760 negative_classes_file[0] = 0;
1761 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1762 layer1_size = atoi(argv[i + 1]);
1763 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1764 strcpy(train_file, argv[i + 1]);
1765 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1766 strcpy(save_vocab_file, argv[i + 1]);
1767 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1768 strcpy(read_vocab_file, argv[i + 1]);
1769 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1770 strcpy(save_net_file, argv[i + 1]);
1771 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1772 strcpy(read_net_file, argv[i + 1]);
1773 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1774 debug_mode = atoi(argv[i + 1]);
1775 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1776 binary = atoi(argv[i + 1]);
1777 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1778 type = atoi(argv[i + 1]);
1779 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1780 strcpy(output_file, argv[i + 1]);
1781 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1782 window = atoi(argv[i + 1]);
1783 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1784 sample = atof(argv[i + 1]);
1785 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1786 hs = atoi(argv[i + 1]);
1787 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1788 negative = atoi(argv[i + 1]);
1789 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1790 strcpy(negative_classes_file, argv[i + 1]);
1791 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1792 nce = atoi(argv[i + 1]);
1793 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1794 num_threads = atoi(argv[i + 1]);
1795 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1796 iter = atoi(argv[i + 1]);
1797 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1798 min_count = atoi(argv[i + 1]);
1799 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1800 classes = atoi(argv[i + 1]);
1801 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1802 cap = atoi(argv[i + 1]);
1803 if (type == 0 || type == 2 || type == 4)
1804 alpha = 0.05;
1805 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1806 alpha = atof(argv[i + 1]);
1807 vocab = (struct vocab_word *) calloc(vocab_max_size,
1808 sizeof(struct vocab_word));
1809 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1810 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
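 // Precompute the sigmoid: index i covers x in [-MAX_EXP, MAX_EXP) in
 // EXP_TABLE_SIZE steps, and expTable[i] = exp(x) / (exp(x) + 1). During
 // training a score f is looked up via
 // (int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)).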
1811 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1812 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1813 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1814 }
1815 TrainModel();
1816 return 0;
1817}
1818