// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>   // clock_t, clock() and CLOCKS_PER_SEC used for progress reporting
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
40struct vocab_word *vocab;
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
42 num_threads = 12, min_reduce = 1;
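// The -type flag selects which architecture TrainModelThread() runs (see the
// usage text in main()): 0 = cbow, 1 = skip-gram, 2 = cwindow,
// 3 = structured skip-gram, 4 = senna-style feed-forward model.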
43int *vocab_hash;
44long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
45long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
46 classes = 0;
47real alpha = 0.025, starting_alpha, sample = 1e-3;
48real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
49clock_t start;
50
51real *syn1_window, *syn1neg_window, *syn1nce_window;
52int w_offset, window_layer_size;
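// window_layer_size = layer1_size * window * 2 (set in InitNet()): the cwindow
// and structured skip-gram models keep a separate output vector per context
// position, so their output layers are indexed by a position-dependent offset.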
53
54int window_hidden_size = 500;
55real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
56 *syn_hidden_word_nce;
57
58int hs = 0, negative = 5;
59const int table_size = 1e8;
60int *table;
61
// contrastive negative sampling
63char negative_classes_file[MAX_STRING];
64int *word_to_group;
65int *group_to_table; //group_size*table_size
66int class_number;
67
68//nce
69real* noise_distribution;
70int nce = 0;
71
72//param caps
73real CAP_VALUE = 50;
74int cap = 0;
75
76void capParam(real* array, int index) {
77 if (array[index] > CAP_VALUE)
78 array[index] = CAP_VALUE;
79 else if (array[index] < -CAP_VALUE)
80 array[index] = -CAP_VALUE;
81}
82
83real hardTanh(real x) {
84 if (x >= 1) {
85 return 1;
86 } else if (x <= -1) {
87 return -1;
88 } else {
89 return x;
90 }
91}
92
93real dHardTanh(real x, real g) {
94 if (x > 1 && g > 0) {
95 return 0;
96 }
97 if (x < -1 && g < 0) {
98 return 0;
99 }
100 return 1;
101}
102
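// Builds the unigram table used to draw negative samples: each word receives a
// share of the 1e8 table slots proportional to count^0.75, so drawing a random
// slot samples words from the smoothed unigram distribution (e.g. a word whose
// weight is 2% of train_words_pow occupies roughly 2e6 consecutive slots).
// The same count^0.75 weights are stored in noise_distribution for NCE.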
103void InitUnigramTable() {
104 int a, i;
105 long long train_words_pow = 0;
106 real d1, power = 0.75;
107 table = (int *) malloc(table_size * sizeof(int));
108 for (a = 0; a < vocab_size; a++)
109 train_words_pow += pow(vocab[a].cn, power);
110 i = 0;
111 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
112 for (a = 0; a < table_size; a++) {
113 table[a] = i;
114 if (a / (real) table_size > d1) {
115 i++;
116 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
117 }
118 if (i >= vocab_size)
119 i = vocab_size - 1;
120 }
121
122 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
123 for (a = 0; a < vocab_size; a++)
124 noise_distribution[a] = pow(vocab[a].cn, power)
125 / (real) train_words_pow;
126}
127
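// Note: newline characters are returned as the special token "</s>", which is
// stored at vocabulary index 0 and acts as the end-of-sentence marker in the
// training loop (word == 0 ends the current sentence).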
128// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
129void ReadWord(char *word, FILE *fin) {
130 int a = 0, ch;
131 while (!feof(fin)) {
132 ch = fgetc(fin);
133 if (ch == 13)
134 continue;
135 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
136 if (a > 0) {
137 if (ch == '\n')
138 ungetc(ch, fin);
139 break;
140 }
141 if (ch == '\n') {
142 strcpy(word, (char *) "</s>");
143 return;
144 } else
145 continue;
146 }
147 word[a] = ch;
148 a++;
149 if (a >= MAX_STRING - 1)
			a--; // Truncate words that are too long
151 }
152 word[a] = 0;
153}
154
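// Vocabulary lookup uses a simple polynomial hash (multiplier 257) over a table
// of vocab_hash_size slots with open addressing: on a collision the probe moves
// linearly to the next slot until the word or an empty slot (-1) is found.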
155// Returns hash value of a word
156int GetWordHash(char *word) {
157 unsigned long long a, hash = 0;
158 for (a = 0; a < strlen(word); a++)
159 hash = hash * 257 + word[a];
160 hash = hash % vocab_hash_size;
161 return hash;
162}
163
164// Returns position of a word in the vocabulary; if the word is not found, returns -1
165int SearchVocab(char *word) {
166 unsigned int hash = GetWordHash(word);
167 while (1) {
168 if (vocab_hash[hash] == -1)
169 return -1;
170 if (!strcmp(word, vocab[vocab_hash[hash]].word))
171 return vocab_hash[hash];
172 hash = (hash + 1) % vocab_hash_size;
173 }
174 return -1;
175}
176
177// Reads a word and returns its index in the vocabulary
178int ReadWordIndex(FILE *fin) {
179 char word[MAX_STRING];
180 ReadWord(word, fin);
181 if (feof(fin))
182 return -1;
183 return SearchVocab(word);
184}
185
186// Adds a word to the vocabulary
187int AddWordToVocab(char *word) {
188 unsigned int hash, length = strlen(word) + 1;
189 if (length > MAX_STRING)
190 length = MAX_STRING;
191 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
192 strcpy(vocab[vocab_size].word, word);
193 vocab[vocab_size].cn = 0;
194 vocab_size++;
195 // Reallocate memory if needed
196 if (vocab_size + 2 >= vocab_max_size) {
197 vocab_max_size += 1000;
198 vocab = (struct vocab_word *) realloc(vocab,
199 vocab_max_size * sizeof(struct vocab_word));
200 }
201 hash = GetWordHash(word);
202 while (vocab_hash[hash] != -1)
203 hash = (hash + 1) % vocab_hash_size;
204 vocab_hash[hash] = vocab_size - 1;
205 return vocab_size - 1;
206}
207
208// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
	// Compare via a long long difference to avoid overflow when casting large counts to int
	long long diff = ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
	if (diff > 0) return 1;
	if (diff < 0) return -1;
	return 0;
}
212
213// Sorts the vocabulary by frequency using word counts
214void SortVocab() {
215 int a, size;
216 unsigned int hash;
217 // Sort the vocabulary and keep </s> at the first position
218 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
219 for (a = 0; a < vocab_hash_size; a++)
220 vocab_hash[a] = -1;
221 size = vocab_size;
222 train_words = 0;
223 for (a = 0; a < size; a++) {
		// Words occurring less than min_count times will be discarded from the vocab
225 if ((vocab[a].cn < min_count) && (a != 0)) {
226 vocab_size--;
227 free(vocab[a].word);
228 } else {
			// Hash will be recomputed, since it is no longer valid after sorting
230 hash = GetWordHash(vocab[a].word);
231 while (vocab_hash[hash] != -1)
232 hash = (hash + 1) % vocab_hash_size;
233 vocab_hash[hash] = a;
234 train_words += vocab[a].cn;
235 }
236 }
237 vocab = (struct vocab_word *) realloc(vocab,
238 (vocab_size + 1) * sizeof(struct vocab_word));
239 // Allocate memory for the binary tree construction
240 for (a = 0; a < vocab_size; a++) {
241 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
242 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
243 }
244}
245
246// Reduces the vocabulary by removing infrequent tokens
247void ReduceVocab() {
248 int a, b = 0;
249 unsigned int hash;
250 for (a = 0; a < vocab_size; a++)
251 if (vocab[a].cn > min_reduce) {
252 vocab[b].cn = vocab[a].cn;
253 vocab[b].word = vocab[a].word;
254 b++;
255 } else
256 free(vocab[a].word);
257 vocab_size = b;
258 for (a = 0; a < vocab_hash_size; a++)
259 vocab_hash[a] = -1;
260 for (a = 0; a < vocab_size; a++) {
		// Hash will be recomputed, since it is no longer valid
262 hash = GetWordHash(vocab[a].word);
263 while (vocab_hash[hash] != -1)
264 hash = (hash + 1) % vocab_hash_size;
265 vocab_hash[hash] = a;
266 }
267 fflush(stdout);
268 min_reduce++;
269}
270
// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
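// The construction below relies on the vocabulary being sorted by decreasing
// count: pos1 walks backwards over the sorted leaf counts and pos2 forwards
// over the newly created internal nodes, so the two smallest remaining nodes
// can always be found without a priority queue. The resulting code[] / point[]
// arrays drive the hierarchical softmax in TrainModelThread().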
273void CreateBinaryTree() {
274 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
275 char code[MAX_CODE_LENGTH];
276 long long *count = (long long *) calloc(vocab_size * 2 + 1,
277 sizeof(long long));
278 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
279 sizeof(long long));
280 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
281 sizeof(long long));
282 for (a = 0; a < vocab_size; a++)
283 count[a] = vocab[a].cn;
284 for (a = vocab_size; a < vocab_size * 2; a++)
285 count[a] = 1e15;
286 pos1 = vocab_size - 1;
287 pos2 = vocab_size;
	// The following algorithm constructs the Huffman tree by adding one node at a time
289 for (a = 0; a < vocab_size - 1; a++) {
290 // First, find two smallest nodes 'min1, min2'
291 if (pos1 >= 0) {
292 if (count[pos1] < count[pos2]) {
293 min1i = pos1;
294 pos1--;
295 } else {
296 min1i = pos2;
297 pos2++;
298 }
299 } else {
300 min1i = pos2;
301 pos2++;
302 }
303 if (pos1 >= 0) {
304 if (count[pos1] < count[pos2]) {
305 min2i = pos1;
306 pos1--;
307 } else {
308 min2i = pos2;
309 pos2++;
310 }
311 } else {
312 min2i = pos2;
313 pos2++;
314 }
315 count[vocab_size + a] = count[min1i] + count[min2i];
316 parent_node[min1i] = vocab_size + a;
317 parent_node[min2i] = vocab_size + a;
318 binary[min2i] = 1;
319 }
320 // Now assign binary code to each vocabulary word
321 for (a = 0; a < vocab_size; a++) {
322 b = a;
323 i = 0;
324 while (1) {
325 code[i] = binary[b];
326 point[i] = b;
327 i++;
328 b = parent_node[b];
329 if (b == vocab_size * 2 - 2)
330 break;
331 }
332 vocab[a].codelen = i;
333 vocab[a].point[0] = vocab_size - 2;
334 for (b = 0; b < i; b++) {
335 vocab[a].code[i - b - 1] = code[b];
336 vocab[a].point[i - b] = point[b] - vocab_size;
337 }
338 }
339 free(count);
340 free(binary);
341 free(parent_node);
342}
343
344void LearnVocabFromTrainFile() {
345 char word[MAX_STRING];
346 FILE *fin;
347 long long a, i;
348 for (a = 0; a < vocab_hash_size; a++)
349 vocab_hash[a] = -1;
350 fin = fopen(train_file, "rb");
351 if (fin == NULL) {
352 printf("ERROR: training data file not found!\n");
353 exit(1);
354 }
355 vocab_size = 0;
356 AddWordToVocab((char *) "</s>");
357 while (1) {
358 ReadWord(word, fin);
359 if (feof(fin))
360 break;
361 train_words++;
362 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
363 printf("%lldK%c", train_words / 1000, 13);
364 fflush(stdout);
365 }
366 i = SearchVocab(word);
367 if (i == -1) {
368 a = AddWordToVocab(word);
369 vocab[a].cn = 1;
370 } else
371 vocab[i].cn++;
372 if (vocab_size > vocab_hash_size * 0.7)
373 ReduceVocab();
374 }
375 SortVocab();
376 if (debug_mode > 0) {
377 printf("Vocab size: %lld\n", vocab_size);
378 printf("Words in train file: %lld\n", train_words);
379 }
380 file_size = ftell(fin);
381 fclose(fin);
382}
383
384void SaveVocab() {
385 long long i;
386 FILE *fo = fopen(save_vocab_file, "wb");
387 for (i = 0; i < vocab_size; i++)
388 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
389 fclose(fo);
390}
391
392void ReadVocab() {
393 long long a, i = 0;
394 char c;
395 char word[MAX_STRING];
396 FILE *fin = fopen(read_vocab_file, "rb");
397 if (fin == NULL) {
398 printf("Vocabulary file not found\n");
399 exit(1);
400 }
401 for (a = 0; a < vocab_hash_size; a++)
402 vocab_hash[a] = -1;
403 vocab_size = 0;
404 while (1) {
405 ReadWord(word, fin);
406 if (feof(fin))
407 break;
408 a = AddWordToVocab(word);
409 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
410 i++;
411 }
412 SortVocab();
413 if (debug_mode > 0) {
414 printf("Vocab size: %lld\n", vocab_size);
415 printf("Words in train file: %lld\n", train_words);
416 }
417 fin = fopen(train_file, "rb");
418 if (fin == NULL) {
419 printf("ERROR: training data file not found!\n");
420 exit(1);
421 }
422 fseek(fin, 0, SEEK_END);
423 file_size = ftell(fin);
424 fclose(fin);
425}
426
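// Reads the -negative-classes file and builds one unigram table per class, so
// that negative samples for a word can be drawn from words of the same class.
// Each iteration of the read loop consumes three whitespace-separated tokens:
// a class label, a word, and one further token that is read and discarded.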
427void InitClassUnigramTable() {
428 long long a, c;
429 printf("loading class unigrams \n");
430 FILE *fin = fopen(negative_classes_file, "rb");
431 if (fin == NULL) {
432 printf("ERROR: class file not found!\n");
433 exit(1);
434 }
435 word_to_group = (int *) malloc(vocab_size * sizeof(int));
436 for (a = 0; a < vocab_size; a++)
437 word_to_group[a] = -1;
438 char class[MAX_STRING];
439 char prev_class[MAX_STRING];
440 prev_class[0] = 0;
441 char word[MAX_STRING];
442 class_number = -1;
443 while (1) {
444 if (feof(fin))
445 break;
446 ReadWord(class, fin);
447 ReadWord(word, fin);
448 int word_index = SearchVocab(word);
449 if (word_index != -1) {
450 if (strcmp(class, prev_class) != 0) {
451 class_number++;
452 strcpy(prev_class, class);
453 }
454 word_to_group[word_index] = class_number;
455 }
456 ReadWord(word, fin);
457 }
458 class_number++;
459 fclose(fin);
460
461 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
462 long long train_words_pow = 0;
463 real d1, power = 0.75;
464
465 for (c = 0; c < class_number; c++) {
466 long long offset = c * table_size;
467 train_words_pow = 0;
468 for (a = 0; a < vocab_size; a++)
469 if (word_to_group[a] == c)
470 train_words_pow += pow(vocab[a].cn, power);
471 int i = 0;
		while (i < vocab_size && word_to_group[i] != c)
			i++;
474 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
475 for (a = 0; a < table_size; a++) {
476 //printf("index %lld , word %d\n", a, i);
477 group_to_table[offset + a] = i;
478 if (a / (real) table_size > d1) {
479 i++;
				while (i < vocab_size && word_to_group[i] != c)
					i++;
482 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
483 }
			if (i >= vocab_size) {
				i = vocab_size - 1;
				while (i >= 0 && word_to_group[i] != c)
					i--;
			}
487 }
488 }
489}
490
491void SaveNet() {
492 long long a, b;
493 FILE *fnet = fopen(save_net_file, "wb");
494 if (fnet == NULL) {
495 printf("Net parameter file not found\n");
496 exit(1);
497 }
498 for (a = 0; a < vocab_size; a++)
499 for (b = 0; b < layer1_size; b++) {
500 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fnet);
501 }
502 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
503 fwrite(&syn_window_hidden[a],sizeof(real),1,fnet);
504 }
505 fclose(fnet);
506}
507
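// Allocates the model parameters. syn0 (the input embeddings, vocab_size x
// layer1_size) is either read from -read-net or initialized uniformly in
// (-0.5/layer1_size, 0.5/layer1_size) using the same linear congruential
// generator as the training threads; all output-side matrices start at zero.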
508void InitNet() {
509 long long a, b;
510 unsigned long long next_random = 1;
511 window_layer_size = layer1_size * window * 2;
512 a = posix_memalign((void **) &syn0, 128,
513 (long long) vocab_size * layer1_size * sizeof(real));
514 if (syn0 == NULL) {
515 printf("Memory allocation failed\n");
516 exit(1);
517 }
518
519 if (hs) {
520 a = posix_memalign((void **) &syn1, 128,
521 (long long) vocab_size * layer1_size * sizeof(real));
522 if (syn1 == NULL) {
523 printf("Memory allocation failed\n");
524 exit(1);
525 }
526 a = posix_memalign((void **) &syn1_window, 128,
527 (long long) vocab_size * window_layer_size * sizeof(real));
528 if (syn1_window == NULL) {
529 printf("Memory allocation failed\n");
530 exit(1);
531 }
532 a = posix_memalign((void **) &syn_hidden_word, 128,
533 (long long) vocab_size * window_hidden_size * sizeof(real));
534 if (syn_hidden_word == NULL) {
535 printf("Memory allocation failed\n");
536 exit(1);
537 }
538
539 for (a = 0; a < vocab_size; a++)
540 for (b = 0; b < layer1_size; b++)
541 syn1[a * layer1_size + b] = 0;
542 for (a = 0; a < vocab_size; a++)
543 for (b = 0; b < window_layer_size; b++)
544 syn1_window[a * window_layer_size + b] = 0;
545 for (a = 0; a < vocab_size; a++)
546 for (b = 0; b < window_hidden_size; b++)
547 syn_hidden_word[a * window_hidden_size + b] = 0;
548 }
549 if (negative > 0) {
550 a = posix_memalign((void **) &syn1neg, 128,
551 (long long) vocab_size * layer1_size * sizeof(real));
552 if (syn1neg == NULL) {
553 printf("Memory allocation failed\n");
554 exit(1);
555 }
556 a = posix_memalign((void **) &syn1neg_window, 128,
557 (long long) vocab_size * window_layer_size * sizeof(real));
558 if (syn1neg_window == NULL) {
559 printf("Memory allocation failed\n");
560 exit(1);
561 }
562 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
563 (long long) vocab_size * window_hidden_size * sizeof(real));
564 if (syn_hidden_word_neg == NULL) {
565 printf("Memory allocation failed\n");
566 exit(1);
567 }
568
569 for (a = 0; a < vocab_size; a++)
570 for (b = 0; b < layer1_size; b++)
571 syn1neg[a * layer1_size + b] = 0;
572 for (a = 0; a < vocab_size; a++)
573 for (b = 0; b < window_layer_size; b++)
574 syn1neg_window[a * window_layer_size + b] = 0;
575 for (a = 0; a < vocab_size; a++)
576 for (b = 0; b < window_hidden_size; b++)
577 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
578 }
579 if (nce > 0) {
580 a = posix_memalign((void **) &syn1nce, 128,
581 (long long) vocab_size * layer1_size * sizeof(real));
582 if (syn1nce == NULL) {
583 printf("Memory allocation failed\n");
584 exit(1);
585 }
586 a = posix_memalign((void **) &syn1nce_window, 128,
587 (long long) vocab_size * window_layer_size * sizeof(real));
588 if (syn1nce_window == NULL) {
589 printf("Memory allocation failed\n");
590 exit(1);
591 }
592 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
593 (long long) vocab_size * window_hidden_size * sizeof(real));
594 if (syn_hidden_word_nce == NULL) {
595 printf("Memory allocation failed\n");
596 exit(1);
597 }
598
599 for (a = 0; a < vocab_size; a++)
600 for (b = 0; b < layer1_size; b++)
601 syn1nce[a * layer1_size + b] = 0;
602 for (a = 0; a < vocab_size; a++)
603 for (b = 0; b < window_layer_size; b++)
604 syn1nce_window[a * window_layer_size + b] = 0;
605 for (a = 0; a < vocab_size; a++)
606 for (b = 0; b < window_hidden_size; b++)
607 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
608 }
609 if (read_net_file[0] == 0) {
610 for (a = 0; a < vocab_size; a++)
611 for (b = 0; b < layer1_size; b++) {
612 next_random = next_random * (unsigned long long) 25214903917
613 + 11;
614 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
615 / (real) 65536) - 0.5) / layer1_size;
616 }
617
618 a = posix_memalign((void **) &syn_window_hidden, 128,
619 window_hidden_size * window_layer_size * sizeof(real));
620 if (syn_window_hidden == NULL) {
621 printf("Memory allocation failed\n");
622 exit(1);
623 }
624 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
625 next_random = next_random * (unsigned long long) 25214903917 + 11;
626 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
627 - 0.5) / (window_hidden_size * window_layer_size);
628 }
629 }
630 else {
631 FILE *fnet = fopen(read_net_file, "rb");
632 if (fnet == NULL) {
633 printf("Net parameter file not found\n");
634 exit(1);
635 }
636 for (a = 0; a < vocab_size; a++)
637 for (b = 0; b < layer1_size; b++) {
638 fread(&syn0[a * layer1_size + b], sizeof(real), 1, fnet);
639 }
640
641 a = posix_memalign((void **) &syn_window_hidden, 128,
642 window_hidden_size * window_layer_size * sizeof(real));
643 if (syn_window_hidden == NULL) {
644 printf("Memory allocation failed\n");
645 exit(1);
646 }
647 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
648 fread(&syn_window_hidden[a],sizeof(real),1,fnet);
649 }
650 fclose(fnet);
651 }
652
653 CreateBinaryTree();
654}
655
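// One worker thread: it seeks to its own file_size / num_threads slice of the
// training file, repeatedly fills a sentence buffer (applying subsampling), and
// performs local_iter = iter passes over its slice while the shared learning
// rate alpha decays linearly with overall progress.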
656void *TrainModelThread(void *id) {
657 long long a, b, d, cw, word, last_word, sentence_length = 0,
658 sentence_position = 0;
659 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
660 long long l1, l2, c, target, label, local_iter = iter;
661 unsigned long long next_random = (long long) id;
662 real f, g;
663 clock_t now;
664 int input_len_1 = layer1_size;
665 int window_offset = -1;
666 if (type == 2 || type == 4) {
667 input_len_1 = window_layer_size;
668 }
669 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
670 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
671
672 int input_len_2 = 0;
673 if (type == 4) {
674 input_len_2 = window_hidden_size;
675 }
676 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
677 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
678
679 FILE *fi = fopen(train_file, "rb");
680 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
681 while (1) {
682 if (word_count - last_word_count > 10000) {
683 word_count_actual += word_count - last_word_count;
684 last_word_count = word_count;
685 if ((debug_mode > 1)) {
686 now = clock();
687 printf(
688 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
689 13, alpha,
690 word_count_actual / (real) (iter * train_words + 1)
691 * 100,
692 word_count_actual
693 / ((real) (now - start + 1)
694 / (real) CLOCKS_PER_SEC * 1000));
695 fflush(stdout);
696 }
697 alpha = starting_alpha
698 * (1 - word_count_actual / (real) (iter * train_words + 1));
699 if (alpha < starting_alpha * 0.0001)
700 alpha = starting_alpha * 0.0001;
701 }
702 if (sentence_length == 0) {
703 while (1) {
704 word = ReadWordIndex(fi);
705 if (feof(fi))
706 break;
707 if (word == -1)
708 continue;
709 word_count++;
710 if (word == 0)
711 break;
				// Subsampling randomly discards frequent words while keeping the ranking the same
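				// With f = cn / train_words and t = sample, the keep probability
				// implemented below is min(1, sqrt(t/f) + t/f): ran = (sqrt(f/t) + 1) * (t/f)
				// is compared against a uniform random number in [0, 1).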
713 if (sample > 0) {
714 real ran = (sqrt(vocab[word].cn / (sample * train_words))
715 + 1) * (sample * train_words) / vocab[word].cn;
716 next_random = next_random * (unsigned long long) 25214903917
717 + 11;
718 if (ran < (next_random & 0xFFFF) / (real) 65536)
719 continue;
720 }
721 sen[sentence_length] = word;
722 sentence_length++;
723 if (sentence_length >= MAX_SENTENCE_LENGTH)
724 break;
725 }
726 sentence_position = 0;
727 }
728 if (feof(fi) || (word_count > train_words / num_threads)) {
729 word_count_actual += word_count - last_word_count;
730 local_iter--;
731 if (local_iter == 0)
732 break;
733 word_count = 0;
734 last_word_count = 0;
735 sentence_length = 0;
736 fseek(fi, file_size / (long long) num_threads * (long long) id,
737 SEEK_SET);
738 continue;
739 }
740 word = sen[sentence_position];
741 if (word == -1)
742 continue;
743 for (c = 0; c < input_len_1; c++)
744 neu1[c] = 0;
745 for (c = 0; c < input_len_1; c++)
746 neu1e[c] = 0;
747 for (c = 0; c < input_len_2; c++)
748 neu2[c] = 0;
749 for (c = 0; c < input_len_2; c++)
750 neu2e[c] = 0;
751 next_random = next_random * (unsigned long long) 25214903917 + 11;
752 b = next_random % window;
753 if (type == 0) { //train the cbow architecture
754 // in -> hidden
755 cw = 0;
756 for (a = b; a < window * 2 + 1 - b; a++)
757 if (a != window) {
758 c = sentence_position - window + a;
759 if (c < 0)
760 continue;
761 if (c >= sentence_length)
762 continue;
763 last_word = sen[c];
764 if (last_word == -1)
765 continue;
766 for (c = 0; c < layer1_size; c++)
767 neu1[c] += syn0[c + last_word * layer1_size];
768 cw++;
769 }
770 if (cw) {
771 for (c = 0; c < layer1_size; c++)
772 neu1[c] /= cw;
773 if (hs)
774 for (d = 0; d < vocab[word].codelen; d++) {
775 f = 0;
776 l2 = vocab[word].point[d] * layer1_size;
777 // Propagate hidden -> output
778 for (c = 0; c < layer1_size; c++)
779 f += neu1[c] * syn1[c + l2];
780 if (f <= -MAX_EXP)
781 continue;
782 else if (f >= MAX_EXP)
783 continue;
784 else
785 f = expTable[(int) ((f + MAX_EXP)
786 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
787 // 'g' is the gradient multiplied by the learning rate
788 g = (1 - vocab[word].code[d] - f) * alpha;
789 // Propagate errors output -> hidden
790 for (c = 0; c < layer1_size; c++)
791 neu1e[c] += g * syn1[c + l2];
792 // Learn weights hidden -> output
793 for (c = 0; c < layer1_size; c++)
794 syn1[c + l2] += g * neu1[c];
795 if (cap == 1)
796 for (c = 0; c < layer1_size; c++)
797 capParam(syn1, c + l2);
798 }
799 // NEGATIVE SAMPLING
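				// d == 0 is the positive example (target = current word, label = 1);
				// the remaining `negative` draws come from the unigram table (or from the
				// word's class table when -negative-classes is used) with label = 0.
				// The update g = (label - sigmoid(f)) * alpha is applied to syn1neg and
				// accumulated into the input gradient neu1e.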
800 if (negative > 0)
801 for (d = 0; d < negative + 1; d++) {
802 if (d == 0) {
803 target = word;
804 label = 1;
805 } else {
806 next_random = next_random
807 * (unsigned long long) 25214903917 + 11;
808 if (word_to_group != NULL
809 && word_to_group[word] != -1) {
810 target = word;
811 while (target == word) {
812 target = group_to_table[word_to_group[word]
813 * table_size
814 + (next_random >> 16) % table_size];
815 next_random = next_random
816 * (unsigned long long) 25214903917
817 + 11;
818 }
819 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
820 } else {
821 target =
822 table[(next_random >> 16) % table_size];
823 }
824 if (target == 0)
825 target = next_random % (vocab_size - 1) + 1;
826 if (target == word)
827 continue;
828 label = 0;
829 }
830 l2 = target * layer1_size;
831 f = 0;
832 for (c = 0; c < layer1_size; c++)
833 f += neu1[c] * syn1neg[c + l2];
834 if (f > MAX_EXP)
835 g = (label - 1) * alpha;
836 else if (f < -MAX_EXP)
837 g = (label - 0) * alpha;
838 else
839 g = (label
840 - expTable[(int) ((f + MAX_EXP)
841 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
842 * alpha;
843 for (c = 0; c < layer1_size; c++)
844 neu1e[c] += g * syn1neg[c + l2];
845 for (c = 0; c < layer1_size; c++)
846 syn1neg[c + l2] += g * neu1[c];
847 if (cap == 1)
848 for (c = 0; c < layer1_size; c++)
849 capParam(syn1neg, c + l2);
850 }
851 // Noise Contrastive Estimation
852 if (nce > 0)
853 for (d = 0; d < nce + 1; d++) {
854 if (d == 0) {
855 target = word;
856 label = 1;
857 } else {
858 next_random = next_random
859 * (unsigned long long) 25214903917 + 11;
860 if (word_to_group != NULL
861 && word_to_group[word] != -1) {
862 target = word;
863 while (target == word) {
864 target = group_to_table[word_to_group[word]
865 * table_size
866 + (next_random >> 16) % table_size];
867 next_random = next_random
868 * (unsigned long long) 25214903917
869 + 11;
870 }
871 } else {
872 target =
873 table[(next_random >> 16) % table_size];
874 }
875 if (target == 0)
876 target = next_random % (vocab_size - 1) + 1;
877 if (target == word)
878 continue;
879 label = 0;
880 }
881 l2 = target * layer1_size;
882 f = 0;
883
884 for (c = 0; c < layer1_size; c++)
885 f += neu1[c] * syn1nce[c + l2];
886 if (f > MAX_EXP)
887 g = (label - 1) * alpha;
888 else if (f < -MAX_EXP)
889 g = (label - 0) * alpha;
890 else {
891 f = exp(f);
892 g =
893 (label
894 - f
895 / (noise_distribution[target]
896 * nce + f)) * alpha;
897 }
898 for (c = 0; c < layer1_size; c++)
899 neu1e[c] += g * syn1nce[c + l2];
900 for (c = 0; c < layer1_size; c++)
901 syn1nce[c + l2] += g * neu1[c];
902 if (cap == 1)
903 for (c = 0; c < layer1_size; c++)
904 capParam(syn1nce, c + l2);
905 }
906 // hidden -> in
907 for (a = b; a < window * 2 + 1 - b; a++)
908 if (a != window) {
909 c = sentence_position - window + a;
910 if (c < 0)
911 continue;
912 if (c >= sentence_length)
913 continue;
914 last_word = sen[c];
915 if (last_word == -1)
916 continue;
917 for (c = 0; c < layer1_size; c++)
918 syn0[c + last_word * layer1_size] += neu1e[c];
919 }
920 }
921 } else if (type == 1) { //train skip-gram
922 for (a = b; a < window * 2 + 1 - b; a++)
923 if (a != window) {
924 c = sentence_position - window + a;
925 if (c < 0)
926 continue;
927 if (c >= sentence_length)
928 continue;
929 last_word = sen[c];
930 if (last_word == -1)
931 continue;
932 l1 = last_word * layer1_size;
933 for (c = 0; c < layer1_size; c++)
934 neu1e[c] = 0;
935 // HIERARCHICAL SOFTMAX
936 if (hs)
937 for (d = 0; d < vocab[word].codelen; d++) {
938 f = 0;
939 l2 = vocab[word].point[d] * layer1_size;
940 // Propagate hidden -> output
941 for (c = 0; c < layer1_size; c++)
942 f += syn0[c + l1] * syn1[c + l2];
943 if (f <= -MAX_EXP)
944 continue;
945 else if (f >= MAX_EXP)
946 continue;
947 else
948 f = expTable[(int) ((f + MAX_EXP)
949 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
950 // 'g' is the gradient multiplied by the learning rate
951 g = (1 - vocab[word].code[d] - f) * alpha;
952 // Propagate errors output -> hidden
953 for (c = 0; c < layer1_size; c++)
954 neu1e[c] += g * syn1[c + l2];
955 // Learn weights hidden -> output
956 for (c = 0; c < layer1_size; c++)
957 syn1[c + l2] += g * syn0[c + l1];
958 if (cap == 1)
959 for (c = 0; c < layer1_size; c++)
960 capParam(syn1, c + l2);
961 }
962 // NEGATIVE SAMPLING
963 if (negative > 0)
964 for (d = 0; d < negative + 1; d++) {
965 if (d == 0) {
966 target = word;
967 label = 1;
968 } else {
969 next_random = next_random
970 * (unsigned long long) 25214903917 + 11;
971 if (word_to_group != NULL
972 && word_to_group[word] != -1) {
973 target = word;
974 while (target == word) {
975 target =
976 group_to_table[word_to_group[word]
977 * table_size
978 + (next_random >> 16)
979 % table_size];
980 next_random =
981 next_random
982 * (unsigned long long) 25214903917
983 + 11;
984 }
985 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
986 } else {
987 target = table[(next_random >> 16)
988 % table_size];
989 }
990 if (target == 0)
991 target = next_random % (vocab_size - 1) + 1;
992 if (target == word)
993 continue;
994 label = 0;
995 }
996 l2 = target * layer1_size;
997 f = 0;
998 for (c = 0; c < layer1_size; c++)
999 f += syn0[c + l1] * syn1neg[c + l2];
1000 if (f > MAX_EXP)
1001 g = (label - 1) * alpha;
1002 else if (f < -MAX_EXP)
1003 g = (label - 0) * alpha;
1004 else
1005 g =
1006 (label
1007 - expTable[(int) ((f + MAX_EXP)
1008 * (EXP_TABLE_SIZE
1009 / MAX_EXP / 2))])
1010 * alpha;
1011 for (c = 0; c < layer1_size; c++)
1012 neu1e[c] += g * syn1neg[c + l2];
1013 for (c = 0; c < layer1_size; c++)
1014 syn1neg[c + l2] += g * syn0[c + l1];
1015 if (cap == 1)
1016 for (c = 0; c < layer1_size; c++)
1017 capParam(syn1neg, c + l2);
1018 }
1019 //Noise Contrastive Estimation
1020 if (nce > 0)
1021 for (d = 0; d < nce + 1; d++) {
1022 if (d == 0) {
1023 target = word;
1024 label = 1;
1025 } else {
1026 next_random = next_random
1027 * (unsigned long long) 25214903917 + 11;
1028 if (word_to_group != NULL
1029 && word_to_group[word] != -1) {
1030 target = word;
1031 while (target == word) {
1032 target =
1033 group_to_table[word_to_group[word]
1034 * table_size
1035 + (next_random >> 16)
1036 % table_size];
1037 next_random =
1038 next_random
1039 * (unsigned long long) 25214903917
1040 + 11;
1041 }
1042 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1043 } else {
1044 target = table[(next_random >> 16)
1045 % table_size];
1046 }
1047 if (target == 0)
1048 target = next_random % (vocab_size - 1) + 1;
1049 if (target == word)
1050 continue;
1051 label = 0;
1052 }
1053 l2 = target * layer1_size;
1054 f = 0;
1055 for (c = 0; c < layer1_size; c++)
1056 f += syn0[c + l1] * syn1nce[c + l2];
1057 if (f > MAX_EXP)
1058 g = (label - 1) * alpha;
1059 else if (f < -MAX_EXP)
1060 g = (label - 0) * alpha;
1061 else {
1062 f = exp(f);
1063 g = (label
1064 - f
1065 / (noise_distribution[target]
1066 * nce + f)) * alpha;
1067 }
1068 for (c = 0; c < layer1_size; c++)
1069 neu1e[c] += g * syn1nce[c + l2];
1070 for (c = 0; c < layer1_size; c++)
1071 syn1nce[c + l2] += g * syn0[c + l1];
1072 if (cap == 1)
1073 for (c = 0; c < layer1_size; c++)
1074 capParam(syn1nce, c + l2);
1075 }
1076 // Learn weights input -> hidden
1077 for (c = 0; c < layer1_size; c++)
1078 syn0[c + l1] += neu1e[c];
1079 }
1080 } else if (type == 2) { //train the cwindow architecture
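			// cwindow: instead of averaging the context vectors (cbow), the 2*window
			// context embeddings are concatenated into neu1 at position-dependent
			// offsets, and the output layer (syn1_window / syn1neg_window) is
			// window_layer_size wide.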
1081 // in -> hidden
1082 cw = 0;
1083 for (a = 0; a < window * 2 + 1; a++)
1084 if (a != window) {
1085 c = sentence_position - window + a;
1086 if (c < 0)
1087 continue;
1088 if (c >= sentence_length)
1089 continue;
1090 last_word = sen[c];
1091 if (last_word == -1)
1092 continue;
1093 window_offset = a * layer1_size;
1094 if (a > window)
1095 window_offset -= layer1_size;
1096 for (c = 0; c < layer1_size; c++)
1097 neu1[c + window_offset] += syn0[c
1098 + last_word * layer1_size];
1099 cw++;
1100 }
1101 if (cw) {
1102 if (hs)
1103 for (d = 0; d < vocab[word].codelen; d++) {
1104 f = 0;
1105 l2 = vocab[word].point[d] * window_layer_size;
1106 // Propagate hidden -> output
1107 for (c = 0; c < window_layer_size; c++)
1108 f += neu1[c] * syn1_window[c + l2];
1109 if (f <= -MAX_EXP)
1110 continue;
1111 else if (f >= MAX_EXP)
1112 continue;
1113 else
1114 f = expTable[(int) ((f + MAX_EXP)
1115 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1116 // 'g' is the gradient multiplied by the learning rate
1117 g = (1 - vocab[word].code[d] - f) * alpha;
1118 // Propagate errors output -> hidden
1119 for (c = 0; c < window_layer_size; c++)
1120 neu1e[c] += g * syn1_window[c + l2];
1121 // Learn weights hidden -> output
1122 for (c = 0; c < window_layer_size; c++)
1123 syn1_window[c + l2] += g * neu1[c];
1124 if (cap == 1)
1125 for (c = 0; c < window_layer_size; c++)
1126 capParam(syn1_window, c + l2);
1127 }
1128 // NEGATIVE SAMPLING
1129 if (negative > 0)
1130 for (d = 0; d < negative + 1; d++) {
1131 if (d == 0) {
1132 target = word;
1133 label = 1;
1134 } else {
1135 next_random = next_random
1136 * (unsigned long long) 25214903917 + 11;
1137 if (word_to_group != NULL
1138 && word_to_group[word] != -1) {
1139 target = word;
1140 while (target == word) {
1141 target = group_to_table[word_to_group[word]
1142 * table_size
1143 + (next_random >> 16) % table_size];
1144 next_random = next_random
1145 * (unsigned long long) 25214903917
1146 + 11;
1147 }
1148 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1149 } else {
1150 target =
1151 table[(next_random >> 16) % table_size];
1152 }
1153 if (target == 0)
1154 target = next_random % (vocab_size - 1) + 1;
1155 if (target == word)
1156 continue;
1157 label = 0;
1158 }
1159 l2 = target * window_layer_size;
1160 f = 0;
1161 for (c = 0; c < window_layer_size; c++)
1162 f += neu1[c] * syn1neg_window[c + l2];
1163 if (f > MAX_EXP)
1164 g = (label - 1) * alpha;
1165 else if (f < -MAX_EXP)
1166 g = (label - 0) * alpha;
1167 else
1168 g = (label
1169 - expTable[(int) ((f + MAX_EXP)
1170 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1171 * alpha;
1172 for (c = 0; c < window_layer_size; c++)
1173 neu1e[c] += g * syn1neg_window[c + l2];
1174 for (c = 0; c < window_layer_size; c++)
1175 syn1neg_window[c + l2] += g * neu1[c];
1176 if (cap == 1)
1177 for (c = 0; c < window_layer_size; c++)
1178 capParam(syn1neg_window, c + l2);
1179 }
1180 // Noise Contrastive Estimation
1181 if (nce > 0)
1182 for (d = 0; d < nce + 1; d++) {
1183 if (d == 0) {
1184 target = word;
1185 label = 1;
1186 } else {
1187 next_random = next_random
1188 * (unsigned long long) 25214903917 + 11;
1189 if (word_to_group != NULL
1190 && word_to_group[word] != -1) {
1191 target = word;
1192 while (target == word) {
1193 target = group_to_table[word_to_group[word]
1194 * table_size
1195 + (next_random >> 16) % table_size];
1196 next_random = next_random
1197 * (unsigned long long) 25214903917
1198 + 11;
1199 }
1200 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1201 } else {
1202 target =
1203 table[(next_random >> 16) % table_size];
1204 }
1205 if (target == 0)
1206 target = next_random % (vocab_size - 1) + 1;
1207 if (target == word)
1208 continue;
1209 label = 0;
1210 }
1211 l2 = target * window_layer_size;
1212 f = 0;
1213 for (c = 0; c < window_layer_size; c++)
1214 f += neu1[c] * syn1nce_window[c + l2];
1215 if (f > MAX_EXP)
1216 g = (label - 1) * alpha;
1217 else if (f < -MAX_EXP)
1218 g = (label - 0) * alpha;
1219 else {
1220 f = exp(f);
1221 g =
1222 (label
1223 - f
1224 / (noise_distribution[target]
1225 * nce + f)) * alpha;
1226 }
1227 for (c = 0; c < window_layer_size; c++)
1228 neu1e[c] += g * syn1nce_window[c + l2];
1229 for (c = 0; c < window_layer_size; c++)
1230 syn1nce_window[c + l2] += g * neu1[c];
1231 if (cap == 1)
1232 for (c = 0; c < window_layer_size; c++)
1233 capParam(syn1nce_window, c + l2);
1234 }
1235 // hidden -> in
1236 for (a = 0; a < window * 2 + 1; a++)
1237 if (a != window) {
1238 c = sentence_position - window + a;
1239 if (c < 0)
1240 continue;
1241 if (c >= sentence_length)
1242 continue;
1243 last_word = sen[c];
1244 if (last_word == -1)
1245 continue;
1246 window_offset = a * layer1_size;
1247 if (a > window)
1248 window_offset -= layer1_size;
1249 for (c = 0; c < layer1_size; c++)
1250 syn0[c + last_word * layer1_size] += neu1e[c
1251 + window_offset];
1252 }
1253 }
1254 } else if (type == 3) { //train structured skip-gram
1255 for (a = 0; a < window * 2 + 1; a++)
1256 if (a != window) {
1257 c = sentence_position - window + a;
1258 if (c < 0)
1259 continue;
1260 if (c >= sentence_length)
1261 continue;
1262 last_word = sen[c];
1263 if (last_word == -1)
1264 continue;
1265 l1 = last_word * layer1_size;
1266 window_offset = a * layer1_size;
1267 if (a > window)
1268 window_offset -= layer1_size;
1269 for (c = 0; c < layer1_size; c++)
1270 neu1e[c] = 0;
1271 // HIERARCHICAL SOFTMAX
1272 if (hs)
1273 for (d = 0; d < vocab[word].codelen; d++) {
1274 f = 0;
1275 l2 = vocab[word].point[d] * window_layer_size;
1276 // Propagate hidden -> output
1277 for (c = 0; c < layer1_size; c++)
1278 f += syn0[c + l1]
1279 * syn1_window[c + l2 + window_offset];
1280 if (f <= -MAX_EXP)
1281 continue;
1282 else if (f >= MAX_EXP)
1283 continue;
1284 else
1285 f = expTable[(int) ((f + MAX_EXP)
1286 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1287 // 'g' is the gradient multiplied by the learning rate
1288 g = (1 - vocab[word].code[d] - f) * alpha;
1289 // Propagate errors output -> hidden
1290 for (c = 0; c < layer1_size; c++)
1291 neu1e[c] += g
1292 * syn1_window[c + l2 + window_offset];
1293 // Learn weights hidden -> output
1294 for (c = 0; c < layer1_size; c++)
1295 syn1[c + l2 + window_offset] += g
1296 * syn0[c + l1];
1297 if (cap == 1)
1298 for (c = 0; c < layer1_size; c++)
1299 capParam(syn1, c + l2 + window_offset);
1300 }
1301 // NEGATIVE SAMPLING
1302 if (negative > 0)
1303 for (d = 0; d < negative + 1; d++) {
1304 if (d == 0) {
1305 target = word;
1306 label = 1;
1307 } else {
1308 next_random = next_random
1309 * (unsigned long long) 25214903917 + 11;
1310 if (word_to_group != NULL
1311 && word_to_group[word] != -1) {
1312 target = word;
1313 while (target == word) {
1314 target =
1315 group_to_table[word_to_group[word]
1316 * table_size
1317 + (next_random >> 16)
1318 % table_size];
1319 next_random =
1320 next_random
1321 * (unsigned long long) 25214903917
1322 + 11;
1323 }
1324 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1325 } else {
1326 target = table[(next_random >> 16)
1327 % table_size];
1328 }
1329 if (target == 0)
1330 target = next_random % (vocab_size - 1) + 1;
1331 if (target == word)
1332 continue;
1333 label = 0;
1334 }
1335 l2 = target * window_layer_size;
1336 f = 0;
1337 for (c = 0; c < layer1_size; c++)
1338 f +=
1339 syn0[c + l1]
1340 * syn1neg_window[c + l2
1341 + window_offset];
1342 if (f > MAX_EXP)
1343 g = (label - 1) * alpha;
1344 else if (f < -MAX_EXP)
1345 g = (label - 0) * alpha;
1346 else
1347 g =
1348 (label
1349 - expTable[(int) ((f + MAX_EXP)
1350 * (EXP_TABLE_SIZE
1351 / MAX_EXP / 2))])
1352 * alpha;
1353 for (c = 0; c < layer1_size; c++)
1354 neu1e[c] +=
1355 g
1356 * syn1neg_window[c + l2
1357 + window_offset];
1358 for (c = 0; c < layer1_size; c++)
1359 syn1neg_window[c + l2 + window_offset] += g
1360 * syn0[c + l1];
1361 if (cap == 1)
1362 for (c = 0; c < layer1_size; c++)
1363 capParam(syn1neg_window,
1364 c + l2 + window_offset);
1365 }
					// Noise Contrastive Estimation
1367 if (nce > 0)
1368 for (d = 0; d < nce + 1; d++) {
1369 if (d == 0) {
1370 target = word;
1371 label = 1;
1372 } else {
1373 next_random = next_random
1374 * (unsigned long long) 25214903917 + 11;
1375 if (word_to_group != NULL
1376 && word_to_group[word] != -1) {
1377 target = word;
1378 while (target == word) {
1379 target =
1380 group_to_table[word_to_group[word]
1381 * table_size
1382 + (next_random >> 16)
1383 % table_size];
1384 next_random =
1385 next_random
1386 * (unsigned long long) 25214903917
1387 + 11;
1388 }
1389 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1390 } else {
1391 target = table[(next_random >> 16)
1392 % table_size];
1393 }
1394 if (target == 0)
1395 target = next_random % (vocab_size - 1) + 1;
1396 if (target == word)
1397 continue;
1398 label = 0;
1399 }
1400 l2 = target * window_layer_size;
1401 f = 0;
1402 for (c = 0; c < layer1_size; c++)
1403 f +=
1404 syn0[c + l1]
1405 * syn1nce_window[c + l2
1406 + window_offset];
1407 if (f > MAX_EXP)
1408 g = (label - 1) * alpha;
1409 else if (f < -MAX_EXP)
1410 g = (label - 0) * alpha;
1411 else {
1412 f = exp(f);
1413 g = (label
1414 - f
1415 / (noise_distribution[target]
1416 * nce + f)) * alpha;
1417 }
1418 for (c = 0; c < layer1_size; c++)
1419 neu1e[c] +=
1420 g
1421 * syn1nce_window[c + l2
1422 + window_offset];
1423 for (c = 0; c < layer1_size; c++)
1424 syn1nce_window[c + l2 + window_offset] += g
1425 * syn0[c + l1];
1426 if (cap == 1)
1427 for (c = 0; c < layer1_size; c++)
1428 capParam(syn1nce_window,
1429 c + l2 + window_offset);
1430 }
1431 // Learn weights input -> hidden
1432 for (c = 0; c < layer1_size; c++) {
1433 syn0[c + l1] += neu1e[c];
1434 if (syn0[c + l1] > 50)
1435 syn0[c + l1] = 50;
1436 if (syn0[c + l1] < -50)
1437 syn0[c + l1] = -50;
1438 }
1439 }
		} else if (type == 4) { // train the senna architecture
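			// senna-style model: the concatenated context (neu1, as in cwindow) is fed
			// through a hidden layer of window_hidden_size units (syn_window_hidden)
			// with a hard-tanh non-linearity before the hierarchical-softmax /
			// negative-sampling output layer.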
1441 // in -> hidden
1442 cw = 0;
1443 for (a = 0; a < window * 2 + 1; a++)
1444 if (a != window) {
1445 c = sentence_position - window + a;
1446 if (c < 0)
1447 continue;
1448 if (c >= sentence_length)
1449 continue;
1450 last_word = sen[c];
1451 if (last_word == -1)
1452 continue;
1453 window_offset = a * layer1_size;
1454 if (a > window)
1455 window_offset -= layer1_size;
1456 for (c = 0; c < layer1_size; c++)
1457 neu1[c + window_offset] += syn0[c
1458 + last_word * layer1_size];
1459 cw++;
1460 }
1461 if (cw) {
1462 for (a = 0; a < window_hidden_size; a++) {
1463 c = a * window_layer_size;
1464 for (b = 0; b < window_layer_size; b++) {
1465 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1466 }
1467 }
1468 if (hs)
1469 for (d = 0; d < vocab[word].codelen; d++) {
1470 f = 0;
1471 l2 = vocab[word].point[d] * window_hidden_size;
1472 // Propagate hidden -> output
1473 for (c = 0; c < window_hidden_size; c++)
1474 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1475 if (f <= -MAX_EXP)
1476 continue;
1477 else if (f >= MAX_EXP)
1478 continue;
1479 else
1480 f = expTable[(int) ((f + MAX_EXP)
1481 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1482 // 'g' is the gradient multiplied by the learning rate
1483 g = (1 - vocab[word].code[d] - f) * alpha;
1484 // Propagate errors output -> hidden
1485 for (c = 0; c < window_hidden_size; c++)
1486 neu2e[c] += dHardTanh(neu2[c], g) * g
1487 * syn_hidden_word[c + l2];
1488 // Learn weights hidden -> output
1489 for (c = 0; c < window_hidden_size; c++)
1490 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1491 * neu2[c];
1492 }
1493 // NEGATIVE SAMPLING
1494 if (negative > 0)
1495 for (d = 0; d < negative + 1; d++) {
1496 if (d == 0) {
1497 target = word;
1498 label = 1;
1499 } else {
1500 next_random = next_random
1501 * (unsigned long long) 25214903917 + 11;
1502 if (word_to_group != NULL
1503 && word_to_group[word] != -1) {
1504 target = word;
1505 while (target == word) {
1506 target = group_to_table[word_to_group[word]
1507 * table_size
1508 + (next_random >> 16) % table_size];
1509 next_random = next_random
1510 * (unsigned long long) 25214903917
1511 + 11;
1512 }
1513 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1514 } else {
1515 target =
1516 table[(next_random >> 16) % table_size];
1517 }
1518 if (target == 0)
1519 target = next_random % (vocab_size - 1) + 1;
1520 if (target == word)
1521 continue;
1522 label = 0;
1523 }
1524 l2 = target * window_hidden_size;
1525 f = 0;
1526 for (c = 0; c < window_hidden_size; c++)
1527 f += hardTanh(neu2[c])
1528 * syn_hidden_word_neg[c + l2];
1529 if (f > MAX_EXP)
1530 g = (label - 1) * alpha / negative;
1531 else if (f < -MAX_EXP)
1532 g = (label - 0) * alpha / negative;
1533 else
1534 g = (label
1535 - expTable[(int) ((f + MAX_EXP)
1536 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1537 * alpha / negative;
1538 for (c = 0; c < window_hidden_size; c++)
1539 neu2e[c] += dHardTanh(neu2[c], g) * g
1540 * syn_hidden_word_neg[c + l2];
1541 for (c = 0; c < window_hidden_size; c++)
1542 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1543 * g * neu2[c];
1544 }
1545 for (a = 0; a < window_hidden_size; a++)
1546 for (b = 0; b < window_layer_size; b++)
1547 neu1e[b] += neu2e[a]
1548 * syn_window_hidden[a * window_layer_size + b];
1549 for (a = 0; a < window_hidden_size; a++)
1550 for (b = 0; b < window_layer_size; b++)
1551 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1552 * neu1[b];
1553 // hidden -> in
1554 for (a = 0; a < window * 2 + 1; a++)
1555 if (a != window) {
1556 c = sentence_position - window + a;
1557 if (c < 0)
1558 continue;
1559 if (c >= sentence_length)
1560 continue;
1561 last_word = sen[c];
1562 if (last_word == -1)
1563 continue;
1564 window_offset = a * layer1_size;
1565 if (a > window)
1566 window_offset -= layer1_size;
1567 for (c = 0; c < layer1_size; c++)
1568 syn0[c + last_word * layer1_size] += neu1e[c
1569 + window_offset];
1570 }
1571 }
1572 } else {
1573 printf("unknown type %i", type);
1574 exit(0);
1575 }
1576 sentence_position++;
1577 if (sentence_position >= sentence_length) {
1578 sentence_length = 0;
1579 continue;
1580 }
1581 }
1582 fclose(fi);
	free(neu1);
	free(neu1e);
	free(neu2);
	free(neu2e);
1585 pthread_exit(NULL);
1586}
1587
1588void TrainModel() {
1589 long a, b, c, d;
1590 FILE *fo;
1591 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1592 printf("Starting training using file %s\n", train_file);
1593 starting_alpha = alpha;
1594 if (read_vocab_file[0] != 0)
1595 ReadVocab();
1596 else
1597 LearnVocabFromTrainFile();
1598 if (save_vocab_file[0] != 0)
1599 SaveVocab();
1600 if (output_file[0] == 0)
1601 return;
1602 InitNet();
1603 if (negative > 0 || nce > 0)
1604 InitUnigramTable();
1605 if (negative_classes_file[0] != 0)
1606 InitClassUnigramTable();
1607 start = clock();
1608 for (a = 0; a < num_threads; a++)
1609 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1610 for (a = 0; a < num_threads; a++)
1611 pthread_join(pt[a], NULL);
1612 fo = fopen(output_file, "wb");
1613 if (classes == 0) {
1614 // Save the word vectors
1615 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1616 for (a = 0; a < vocab_size; a++) {
1617 fprintf(fo, "%s ", vocab[a].word);
1618 if (binary)
1619 for (b = 0; b < layer1_size; b++)
1620 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1621 else
1622 for (b = 0; b < layer1_size; b++)
1623 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1624 fprintf(fo, "\n");
1625 }
1626 } else {
1627 // Run K-means on the word vectors
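		// 10 iterations of K-means: centroids are the L2-normalized means of the
		// vectors currently assigned to each class, and each word is then
		// re-assigned to the centroid with the largest dot product.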
1628 int clcn = classes, iter = 10, closeid;
1629 int *centcn = (int *) malloc(classes * sizeof(int));
1630 int *cl = (int *) calloc(vocab_size, sizeof(int));
1631 real closev, x;
1632 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1633 for (a = 0; a < vocab_size; a++)
1634 cl[a] = a % clcn;
1635 for (a = 0; a < iter; a++) {
1636 for (b = 0; b < clcn * layer1_size; b++)
1637 cent[b] = 0;
1638 for (b = 0; b < clcn; b++)
1639 centcn[b] = 1;
1640 for (c = 0; c < vocab_size; c++) {
1641 for (d = 0; d < layer1_size; d++)
1642 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1643 centcn[cl[c]]++;
1644 }
1645 for (b = 0; b < clcn; b++) {
1646 closev = 0;
1647 for (c = 0; c < layer1_size; c++) {
1648 cent[layer1_size * b + c] /= centcn[b];
1649 closev += cent[layer1_size * b + c]
1650 * cent[layer1_size * b + c];
1651 }
1652 closev = sqrt(closev);
1653 for (c = 0; c < layer1_size; c++)
1654 cent[layer1_size * b + c] /= closev;
1655 }
1656 for (c = 0; c < vocab_size; c++) {
1657 closev = -10;
1658 closeid = 0;
1659 for (d = 0; d < clcn; d++) {
1660 x = 0;
1661 for (b = 0; b < layer1_size; b++)
1662 x += cent[layer1_size * d + b]
1663 * syn0[c * layer1_size + b];
1664 if (x > closev) {
1665 closev = x;
1666 closeid = d;
1667 }
1668 }
1669 cl[c] = closeid;
1670 }
1671 }
1672 // Save the K-means classes
1673 for (a = 0; a < vocab_size; a++)
1674 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1675 free(centcn);
1676 free(cent);
1677 free(cl);
1678 }
1679 fclose(fo);
1680 if (save_net_file[0] != 0)
1681 SaveNet();
1682}
1683
1684int ArgPos(char *str, int argc, char **argv) {
1685 int a;
1686 for (a = 1; a < argc; a++)
1687 if (!strcmp(str, argv[a])) {
1688 if (a == argc - 1) {
1689 printf("Argument missing for %s\n", str);
1690 exit(1);
1691 }
1692 return a;
1693 }
1694 return -1;
1695}
1696
1697int main(int argc, char **argv) {
1698 int i;
1699 if (argc == 1) {
1700 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1701 printf("Options:\n");
1702 printf("Parameters for training:\n");
1703 printf("\t-train <file>\n");
1704 printf("\t\tUse text data from <file> to train the model\n");
1705 printf("\t-output <file>\n");
1706 printf(
1707 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1708 printf("\t-size <int>\n");
1709 printf("\t\tSet size of word vectors; default is 100\n");
1710 printf("\t-window <int>\n");
1711 printf("\t\tSet max skip length between words; default is 5\n");
1712 printf("\t-sample <float>\n");
1713 printf(
1714 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1715 printf(
1716 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1717 printf("\t-hs <int>\n");
1718 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1719 printf("\t-negative <int>\n");
1720 printf(
1721 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1722 printf("\t-negative-classes <file>\n");
1723 printf("\t\tNegative classes to sample from\n");
1724 printf("\t-nce <int>\n");
1725 printf(
1726 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1727 printf("\t-threads <int>\n");
1728 printf("\t\tUse <int> threads (default 12)\n");
1729 printf("\t-iter <int>\n");
1730 printf("\t\tRun more training iterations (default 5)\n");
1731 printf("\t-min-count <int>\n");
1732 printf(
1733 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1734 printf("\t-alpha <float>\n");
1735 printf(
1736 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1737 printf("\t-classes <int>\n");
1738 printf(
1739 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1740 printf("\t-debug <int>\n");
1741 printf(
1742 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1743 printf("\t-binary <int>\n");
1744 printf(
1745 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1746 printf("\t-save-vocab <file>\n");
1747 printf("\t\tThe vocabulary will be saved to <file>\n");
1748 printf("\t-read-vocab <file>\n");
1749 printf(
1750 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1751 printf("\t-read-net <file>\n");
1752 printf(
1753 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1754 printf("\t-save-net <file>\n");
1755 printf("\t\tThe net parameters will be saved to <file>\n");
1756 printf("\t-type <int>\n");
1757 printf(
1758 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1759 printf("\t-cap <int>\n");
1760 printf(
1761 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1762 printf("\nExamples:\n");
1763 printf(
1764 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1765 return 0;
1766 }
1767 output_file[0] = 0;
1768 save_vocab_file[0] = 0;
1769 read_vocab_file[0] = 0;
1770 save_net_file[0] = 0;
1771 read_net_file[0] = 0;
1772 negative_classes_file[0] = 0;
1773 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1774 layer1_size = atoi(argv[i + 1]);
1775 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1776 strcpy(train_file, argv[i + 1]);
1777 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1778 strcpy(save_vocab_file, argv[i + 1]);
1779 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1780 strcpy(read_vocab_file, argv[i + 1]);
1781 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1782 strcpy(save_net_file, argv[i + 1]);
1783 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1784 strcpy(read_net_file, argv[i + 1]);
1785 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1786 debug_mode = atoi(argv[i + 1]);
1787 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1788 binary = atoi(argv[i + 1]);
1789 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1790 type = atoi(argv[i + 1]);
1791 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1792 strcpy(output_file, argv[i + 1]);
1793 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1794 window = atoi(argv[i + 1]);
1795 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1796 sample = atof(argv[i + 1]);
1797 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1798 hs = atoi(argv[i + 1]);
1799 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1800 negative = atoi(argv[i + 1]);
1801 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1802 strcpy(negative_classes_file, argv[i + 1]);
1803 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1804 nce = atoi(argv[i + 1]);
1805 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1806 num_threads = atoi(argv[i + 1]);
1807 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1808 iter = atoi(argv[i + 1]);
1809 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1810 min_count = atoi(argv[i + 1]);
1811 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1812 classes = atoi(argv[i + 1]);
1813 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1814 cap = atoi(argv[i + 1]);
1815 if (type == 0 || type == 2 || type == 4)
1816 alpha = 0.05;
1817 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1818 alpha = atof(argv[i + 1]);
1819 vocab = (struct vocab_word *) calloc(vocab_max_size,
1820 sizeof(struct vocab_word));
1821 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1822 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
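	// Precomputed sigmoid table: expTable[i] = sigmoid(x) for x spanning
	// [-MAX_EXP, MAX_EXP) in EXP_TABLE_SIZE steps. During training sigmoid(f) is
	// looked up as expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
	// whenever |f| < MAX_EXP.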
1823 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1824 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1825 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1826 }
1827 TrainModel();
1828 return 0;
1829}