1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
27const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
40struct vocab_word *vocab;
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
42 num_threads = 12, min_reduce = 1;
43int *vocab_hash;
44long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
45long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
46 classes = 0;
47real alpha = 0.025, starting_alpha, sample = 1e-3;
48real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
49clock_t start;
50
51real *syn1_window, *syn1neg_window, *syn1nce_window;
52int w_offset, window_layer_size;
53
54int window_hidden_size = 500;
55real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
56 *syn_hidden_word_nce;
57
58int hs = 0, negative = 5;
59const int table_size = 1e8;
60int *table;
61
62//contrastive negative sampling
63char negative_classes_file[MAX_STRING];
64int *word_to_group;
65int *group_to_table; //group_size*table_size
66int class_number;
67
68//nce
69real* noise_distribution;
70int nce = 0;
71
72//param caps
73real CAP_VALUE = 50;
74int cap = 0;
75
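// Parameter capping (enabled with -cap 1): after a gradient update the
// touched output-layer weights are clipped to [-CAP_VALUE, CAP_VALUE].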
76void capParam(real* array, int index) {
77 if (array[index] > CAP_VALUE)
78 array[index] = CAP_VALUE;
79 else if (array[index] < -CAP_VALUE)
80 array[index] = -CAP_VALUE;
81}
82
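// Hard tanh activation for the senna-style model (type 4): hardTanh clips a
// hidden unit to [-1, 1], and dHardTanh zeroes the gradient whenever the unit
// is already saturated in the direction the gradient would push it further.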
83real hardTanh(real x) {
84 if (x >= 1) {
85 return 1;
86 } else if (x <= -1) {
87 return -1;
88 } else {
89 return x;
90 }
91}
92
93real dHardTanh(real x, real g) {
94 if (x > 1 && g > 0) {
95 return 0;
96 }
97 if (x < -1 && g < 0) {
98 return 0;
99 }
100 return 1;
101}
102
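// Builds the negative-sampling table: each word occupies a share of the table
// proportional to count^0.75, so indexing it with a uniform random number
// samples negatives from the smoothed unigram distribution. The same
// count^0.75 weights are kept in noise_distribution for NCE.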
103void InitUnigramTable() {
104 int a, i;
105 long long train_words_pow = 0;
106 real d1, power = 0.75;
107 table = (int *) malloc(table_size * sizeof(int));
108 for (a = 0; a < vocab_size; a++)
109 train_words_pow += pow(vocab[a].cn, power);
110 i = 0;
111 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
112 for (a = 0; a < table_size; a++) {
113 table[a] = i;
114 if (a / (real) table_size > d1) {
115 i++;
116 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
117 }
118 if (i >= vocab_size)
119 i = vocab_size - 1;
120 }
121
122 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
123 for (a = 0; a < vocab_size; a++)
124 noise_distribution[a] = pow(vocab[a].cn, power)
125 / (real) train_words_pow;
126}
127
128// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
129void ReadWord(char *word, FILE *fin) {
130 int a = 0, ch;
131 while (!feof(fin)) {
132 ch = fgetc(fin);
133 if (ch == 13)
134 continue;
135 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
136 if (a > 0) {
137 if (ch == '\n')
138 ungetc(ch, fin);
139 break;
140 }
141 if (ch == '\n') {
142 strcpy(word, (char *) "</s>");
143 return;
144 } else
145 continue;
146 }
147 word[a] = ch;
148 a++;
149 if (a >= MAX_STRING - 1)
150 a--; // Truncate too long words
151 }
152 word[a] = 0;
153}
154
155// Returns hash value of a word
156int GetWordHash(char *word) {
157 unsigned long long a, hash = 0;
158 for (a = 0; a < strlen(word); a++)
159 hash = hash * 257 + word[a];
160 hash = hash % vocab_hash_size;
161 return hash;
162}
163
164// Returns position of a word in the vocabulary; if the word is not found, returns -1
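// vocab_hash is an open-addressing table with linear probing: on a collision
// the search moves to the next slot, and an empty slot (-1) ends the lookup.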
165int SearchVocab(char *word) {
166 unsigned int hash = GetWordHash(word);
167 while (1) {
168 if (vocab_hash[hash] == -1)
169 return -1;
170 if (!strcmp(word, vocab[vocab_hash[hash]].word))
171 return vocab_hash[hash];
172 hash = (hash + 1) % vocab_hash_size;
173 }
174 return -1;
175}
176
177// Reads a word and returns its index in the vocabulary
178int ReadWordIndex(FILE *fin) {
179 char word[MAX_STRING];
180 ReadWord(word, fin);
181 if (feof(fin))
182 return -1;
183 return SearchVocab(word);
184}
185
186// Adds a word to the vocabulary
187int AddWordToVocab(char *word) {
188 unsigned int hash, length = strlen(word) + 1;
189 if (length > MAX_STRING)
190 length = MAX_STRING;
191 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
192 strcpy(vocab[vocab_size].word, word);
193 vocab[vocab_size].cn = 0;
194 vocab_size++;
195 // Reallocate memory if needed
196 if (vocab_size + 2 >= vocab_max_size) {
197 vocab_max_size += 1000;
198 vocab = (struct vocab_word *) realloc(vocab,
199 vocab_max_size * sizeof(struct vocab_word));
200 }
201 hash = GetWordHash(word);
202 while (vocab_hash[hash] != -1)
203 hash = (hash + 1) % vocab_hash_size;
204 vocab_hash[hash] = vocab_size - 1;
205 return vocab_size - 1;
206}
207
208// Used later for sorting by word counts
209int VocabCompare(const void *a, const void *b) {
210 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
211}
212
213// Sorts the vocabulary by frequency using word counts
214void SortVocab() {
215 int a, size;
216 unsigned int hash;
217 // Sort the vocabulary and keep </s> at the first position
218 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
219 for (a = 0; a < vocab_hash_size; a++)
220 vocab_hash[a] = -1;
221 size = vocab_size;
222 train_words = 0;
223 for (a = 0; a < size; a++) {
224		// Words occurring less than min_count times will be discarded from the vocab
225 if ((vocab[a].cn < min_count) && (a != 0)) {
226 vocab_size--;
227 free(vocab[a].word);
228 } else {
229			// Hash will be re-computed, as it is no longer valid after sorting
230 hash = GetWordHash(vocab[a].word);
231 while (vocab_hash[hash] != -1)
232 hash = (hash + 1) % vocab_hash_size;
233 vocab_hash[hash] = a;
234 train_words += vocab[a].cn;
235 }
236 }
237 vocab = (struct vocab_word *) realloc(vocab,
238 (vocab_size + 1) * sizeof(struct vocab_word));
239 // Allocate memory for the binary tree construction
240 for (a = 0; a < vocab_size; a++) {
241 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
242 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
243 }
244}
245
246// Reduces the vocabulary by removing infrequent tokens
247void ReduceVocab() {
248 int a, b = 0;
249 unsigned int hash;
250 for (a = 0; a < vocab_size; a++)
251 if (vocab[a].cn > min_reduce) {
252 vocab[b].cn = vocab[a].cn;
253 vocab[b].word = vocab[a].word;
254 b++;
255 } else
256 free(vocab[a].word);
257 vocab_size = b;
258 for (a = 0; a < vocab_hash_size; a++)
259 vocab_hash[a] = -1;
260 for (a = 0; a < vocab_size; a++) {
261		// Hash will be re-computed, as it is no longer valid after the removal
262 hash = GetWordHash(vocab[a].word);
263 while (vocab_hash[hash] != -1)
264 hash = (hash + 1) % vocab_hash_size;
265 vocab_hash[hash] = a;
266 }
267 fflush(stdout);
268 min_reduce++;
269}
270
271// Create binary Huffman tree using the word counts
272// Frequent words will have short unique binary codes
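// For each word, code[] stores the Huffman bits from the root to its leaf and
// point[] the indices of the inner nodes on that path; hierarchical softmax
// walks these vocab[word].codelen entries during training.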
273void CreateBinaryTree() {
274 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
275 char code[MAX_CODE_LENGTH];
276 long long *count = (long long *) calloc(vocab_size * 2 + 1,
277 sizeof(long long));
278 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
279 sizeof(long long));
280 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
281 sizeof(long long));
282 for (a = 0; a < vocab_size; a++)
283 count[a] = vocab[a].cn;
284 for (a = vocab_size; a < vocab_size * 2; a++)
285 count[a] = 1e15;
286 pos1 = vocab_size - 1;
287 pos2 = vocab_size;
288 // Following algorithm constructs the Huffman tree by adding one node at a time
289 for (a = 0; a < vocab_size - 1; a++) {
290 // First, find two smallest nodes 'min1, min2'
291 if (pos1 >= 0) {
292 if (count[pos1] < count[pos2]) {
293 min1i = pos1;
294 pos1--;
295 } else {
296 min1i = pos2;
297 pos2++;
298 }
299 } else {
300 min1i = pos2;
301 pos2++;
302 }
303 if (pos1 >= 0) {
304 if (count[pos1] < count[pos2]) {
305 min2i = pos1;
306 pos1--;
307 } else {
308 min2i = pos2;
309 pos2++;
310 }
311 } else {
312 min2i = pos2;
313 pos2++;
314 }
315 count[vocab_size + a] = count[min1i] + count[min2i];
316 parent_node[min1i] = vocab_size + a;
317 parent_node[min2i] = vocab_size + a;
318 binary[min2i] = 1;
319 }
320 // Now assign binary code to each vocabulary word
321 for (a = 0; a < vocab_size; a++) {
322 b = a;
323 i = 0;
324 while (1) {
325 code[i] = binary[b];
326 point[i] = b;
327 i++;
328 b = parent_node[b];
329 if (b == vocab_size * 2 - 2)
330 break;
331 }
332 vocab[a].codelen = i;
333 vocab[a].point[0] = vocab_size - 2;
334 for (b = 0; b < i; b++) {
335 vocab[a].code[i - b - 1] = code[b];
336 vocab[a].point[i - b] = point[b] - vocab_size;
337 }
338 }
339 free(count);
340 free(binary);
341 free(parent_node);
342}
343
344void LearnVocabFromTrainFile() {
345 char word[MAX_STRING];
346 FILE *fin;
347 long long a, i;
348 for (a = 0; a < vocab_hash_size; a++)
349 vocab_hash[a] = -1;
350 fin = fopen(train_file, "rb");
351 if (fin == NULL) {
352 printf("ERROR: training data file not found!\n");
353 exit(1);
354 }
355 vocab_size = 0;
356 AddWordToVocab((char *) "</s>");
357 while (1) {
358 ReadWord(word, fin);
359 if (feof(fin))
360 break;
361 train_words++;
362 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
363 printf("%lldK%c", train_words / 1000, 13);
364 fflush(stdout);
365 }
366 i = SearchVocab(word);
367 if (i == -1) {
368 a = AddWordToVocab(word);
369 vocab[a].cn = 1;
370 } else
371 vocab[i].cn++;
372 if (vocab_size > vocab_hash_size * 0.7)
373 ReduceVocab();
374 }
375 SortVocab();
376 if (debug_mode > 0) {
377 printf("Vocab size: %lld\n", vocab_size);
378 printf("Words in train file: %lld\n", train_words);
379 }
380 file_size = ftell(fin);
381 fclose(fin);
382}
383
384void SaveVocab() {
385 long long i;
386 FILE *fo = fopen(save_vocab_file, "wb");
387 for (i = 0; i < vocab_size; i++)
388 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
389 fclose(fo);
390}
391
392void ReadVocab() {
393 long long a, i = 0;
394 char c;
395 char word[MAX_STRING];
396 FILE *fin = fopen(read_vocab_file, "rb");
397 if (fin == NULL) {
398 printf("Vocabulary file not found\n");
399 exit(1);
400 }
401 for (a = 0; a < vocab_hash_size; a++)
402 vocab_hash[a] = -1;
403 vocab_size = 0;
404 while (1) {
405 ReadWord(word, fin);
406 if (feof(fin))
407 break;
408 a = AddWordToVocab(word);
409 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
410 i++;
411 }
412 SortVocab();
413 if (debug_mode > 0) {
414 printf("Vocab size: %lld\n", vocab_size);
415 printf("Words in train file: %lld\n", train_words);
416 }
417 fin = fopen(train_file, "rb");
418 if (fin == NULL) {
419 printf("ERROR: training data file not found!\n");
420 exit(1);
421 }
422 fseek(fin, 0, SEEK_END);
423 file_size = ftell(fin);
424 fclose(fin);
425}
426
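// Class-restricted negative sampling: negative_classes_file assigns words to
// groups. word_to_group maps each vocabulary index to its group (-1 if
// unassigned) and group_to_table holds one count^0.75 sampling table of
// table_size entries per group, so negatives are drawn from the same class as
// the current word.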
427void InitClassUnigramTable() {
428 long long a, c;
429 printf("loading class unigrams \n");
430 FILE *fin = fopen(negative_classes_file, "rb");
431 if (fin == NULL) {
432 printf("ERROR: class file not found!\n");
433 exit(1);
434 }
435 word_to_group = (int *) malloc(vocab_size * sizeof(int));
436 for (a = 0; a < vocab_size; a++)
437 word_to_group[a] = -1;
438 char class[MAX_STRING];
439 char prev_class[MAX_STRING];
440 prev_class[0] = 0;
441 char word[MAX_STRING];
442 class_number = -1;
443 while (1) {
444 if (feof(fin))
445 break;
446 ReadWord(class, fin);
447 ReadWord(word, fin);
448 int word_index = SearchVocab(word);
449 if (word_index != -1) {
450 if (strcmp(class, prev_class) != 0) {
451 class_number++;
452 strcpy(prev_class, class);
453 }
454 word_to_group[word_index] = class_number;
455 }
456 ReadWord(word, fin);
457 }
458 class_number++;
459 fclose(fin);
460
461 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
462 long long train_words_pow = 0;
463 real d1, power = 0.75;
464
465 for (c = 0; c < class_number; c++) {
466 long long offset = c * table_size;
467 train_words_pow = 0;
468 for (a = 0; a < vocab_size; a++)
469 if (word_to_group[a] == c)
470 train_words_pow += pow(vocab[a].cn, power);
471 int i = 0;
472		while (i < vocab_size && word_to_group[i] != c)
473 i++;
474 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
475 for (a = 0; a < table_size; a++) {
476 //printf("index %lld , word %d\n", a, i);
477 group_to_table[offset + a] = i;
478 if (a / (real) table_size > d1) {
479 i++;
480				while (i < vocab_size && word_to_group[i] != c)
481 i++;
482 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
483 }
484 if (i >= vocab_size)
485				while (i >= vocab_size || (i >= 0 && word_to_group[i] != c))
486 i--;
487 }
488 }
489}
490
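// SaveNet/InitNet exchange a raw binary dump: syn0 (vocab_size * layer1_size
// reals) followed by syn1neg_window (vocab_size * window_layer_size reals).
// Only structured skip-gram (type 3) with negative sampling is supported.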
491void SaveNet() {
492	if(type != 3 || negative <= 0) {
493 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
494 return;
495 }
496
497	FILE *fnet = fopen(save_net_file, "wb");
498 if (fnet == NULL) {
499		printf("ERROR: cannot open net parameter file for writing\n");
500 exit(1);
501 }
502	fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
503	fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
504	fclose(fnet);
505}
506
507void InitNet() {
508 long long a, b;
509 unsigned long long next_random = 1;
510 window_layer_size = layer1_size * window * 2;
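	// Position-dependent output layers (types 2 and 3) keep a separate
	// layer1_size block per context position, 2 * window positions in total.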
511 a = posix_memalign((void **) &syn0, 128,
512 (long long) vocab_size * layer1_size * sizeof(real));
513 if (syn0 == NULL) {
514 printf("Memory allocation failed\n");
515 exit(1);
516 }
517
518 if (hs) {
519 a = posix_memalign((void **) &syn1, 128,
520 (long long) vocab_size * layer1_size * sizeof(real));
521 if (syn1 == NULL) {
522 printf("Memory allocation failed\n");
523 exit(1);
524 }
525 a = posix_memalign((void **) &syn1_window, 128,
526 (long long) vocab_size * window_layer_size * sizeof(real));
527 if (syn1_window == NULL) {
528 printf("Memory allocation failed\n");
529 exit(1);
530 }
531 a = posix_memalign((void **) &syn_hidden_word, 128,
532 (long long) vocab_size * window_hidden_size * sizeof(real));
533 if (syn_hidden_word == NULL) {
534 printf("Memory allocation failed\n");
535 exit(1);
536 }
537
538 for (a = 0; a < vocab_size; a++)
539 for (b = 0; b < layer1_size; b++)
540 syn1[a * layer1_size + b] = 0;
541 for (a = 0; a < vocab_size; a++)
542 for (b = 0; b < window_layer_size; b++)
543 syn1_window[a * window_layer_size + b] = 0;
544 for (a = 0; a < vocab_size; a++)
545 for (b = 0; b < window_hidden_size; b++)
546 syn_hidden_word[a * window_hidden_size + b] = 0;
547 }
548 if (negative > 0) {
549		if(type == 0) {
550 a = posix_memalign((void **) &syn1neg, 128,
551 (long long) vocab_size * layer1_size * sizeof(real));
552 if (syn1neg == NULL) {
553 printf("Memory allocation failed\n");
554 exit(1);
555 }
556 for (a = 0; a < vocab_size; a++)
557 for (b = 0; b < layer1_size; b++)
558 syn1neg[a * layer1_size + b] = 0;
559 } else if (type == 3) {
560 a = posix_memalign((void **) &syn1neg_window, 128,
561 (long long) vocab_size * window_layer_size * sizeof(real));
562 if (syn1neg_window == NULL) {
563 printf("Memory allocation failed\n");
564 exit(1);
565 }
566 for (a = 0; a < vocab_size; a++)
567 for (b = 0; b < window_layer_size; b++)
568 syn1neg_window[a * window_layer_size + b] = 0;
569 } else if (type == 4) {
570 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
571 (long long) vocab_size * window_hidden_size * sizeof(real));
572 if (syn_hidden_word_neg == NULL) {
573 printf("Memory allocation failed\n");
574 exit(1);
575 }
576 for (a = 0; a < vocab_size; a++)
577 for (b = 0; b < window_hidden_size; b++)
578 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
579		}
580	}
581 if (nce > 0) {
582 a = posix_memalign((void **) &syn1nce, 128,
583 (long long) vocab_size * layer1_size * sizeof(real));
584 if (syn1nce == NULL) {
585 printf("Memory allocation failed\n");
586 exit(1);
587 }
588 a = posix_memalign((void **) &syn1nce_window, 128,
589 (long long) vocab_size * window_layer_size * sizeof(real));
590 if (syn1nce_window == NULL) {
591 printf("Memory allocation failed\n");
592 exit(1);
593 }
594 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
595 (long long) vocab_size * window_hidden_size * sizeof(real));
596 if (syn_hidden_word_nce == NULL) {
597 printf("Memory allocation failed\n");
598 exit(1);
599 }
600
601 for (a = 0; a < vocab_size; a++)
602 for (b = 0; b < layer1_size; b++)
603 syn1nce[a * layer1_size + b] = 0;
604 for (a = 0; a < vocab_size; a++)
605 for (b = 0; b < window_layer_size; b++)
606 syn1nce_window[a * window_layer_size + b] = 0;
607 for (a = 0; a < vocab_size; a++)
608 for (b = 0; b < window_hidden_size; b++)
609 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
610 }
611
612	if(type == 4) {
613		a = posix_memalign((void **) &syn_window_hidden, 128,
614 window_hidden_size * window_layer_size * sizeof(real));
615 if (syn_window_hidden == NULL) {
616 printf("Memory allocation failed\n");
617 exit(1);
618 }
619 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
620 next_random = next_random * (unsigned long long) 25214903917 + 11;
621 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
622 - 0.5) / (window_hidden_size * window_layer_size);
623 }
624 }
625
626 if (read_net_file[0] == 0) {
627 for (a = 0; a < vocab_size; a++)
628 for (b = 0; b < layer1_size; b++) {
629 next_random = next_random * (unsigned long long) 25214903917
630 + 11;
631 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
632 / (real) 65536) - 0.5) / layer1_size;
633 }
634	} else if(type == 3 && negative > 0) {
635		FILE *fnet = fopen(read_net_file, "rb");
636 if (fnet == NULL) {
637 printf("Net parameter file not found\n");
638 exit(1);
639 }
640		fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
641		fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
642		fclose(fnet);
643	} else {
644 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
645 exit(-1);
646	}
647
648 CreateBinaryTree();
649}
650
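// One training thread: it seeks to its own slice of the training file, reads
// sentences (subsampling frequent words when sample > 0), and makes iter
// passes over the slice. The branch on 'type' selects cbow (0), skip-gram (1),
// cwindow (2), structured skip-gram (3) or the senna-style model (4).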
651void *TrainModelThread(void *id) {
652 long long a, b, d, cw, word, last_word, sentence_length = 0,
653 sentence_position = 0;
654 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
655 long long l1, l2, c, target, label, local_iter = iter;
656 unsigned long long next_random = (long long) id;
657 real f, g;
658 clock_t now;
659 int input_len_1 = layer1_size;
660 int window_offset = -1;
661 if (type == 2 || type == 4) {
662 input_len_1 = window_layer_size;
663 }
664 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
665 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
666
667 int input_len_2 = 0;
668 if (type == 4) {
669 input_len_2 = window_hidden_size;
670 }
671 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
672 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
673
674 FILE *fi = fopen(train_file, "rb");
675 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
676 while (1) {
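		// Roughly every 10k words per thread: update the shared progress
		// counter, optionally report it, and decay alpha linearly down to
		// 0.0001 * starting_alpha.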
677 if (word_count - last_word_count > 10000) {
678 word_count_actual += word_count - last_word_count;
679 last_word_count = word_count;
680 if ((debug_mode > 1)) {
681 now = clock();
682 printf(
683 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
684 13, alpha,
685 word_count_actual / (real) (iter * train_words + 1)
686 * 100,
687 word_count_actual
688 / ((real) (now - start + 1)
689 / (real) CLOCKS_PER_SEC * 1000));
690 fflush(stdout);
691 }
692 alpha = starting_alpha
693 * (1 - word_count_actual / (real) (iter * train_words + 1));
694 if (alpha < starting_alpha * 0.0001)
695 alpha = starting_alpha * 0.0001;
696 }
697 if (sentence_length == 0) {
698 while (1) {
699 word = ReadWordIndex(fi);
700 if (feof(fi))
701 break;
702 if (word == -1)
703 continue;
704 word_count++;
705 if (word == 0)
706 break;
707 // The subsampling randomly discards frequent words while keeping the ranking same
708 if (sample > 0) {
709 real ran = (sqrt(vocab[word].cn / (sample * train_words))
710 + 1) * (sample * train_words) / vocab[word].cn;
711 next_random = next_random * (unsigned long long) 25214903917
712 + 11;
713 if (ran < (next_random & 0xFFFF) / (real) 65536)
714 continue;
715 }
716 sen[sentence_length] = word;
717 sentence_length++;
718 if (sentence_length >= MAX_SENTENCE_LENGTH)
719 break;
720 }
721 sentence_position = 0;
722 }
723 if (feof(fi) || (word_count > train_words / num_threads)) {
724 word_count_actual += word_count - last_word_count;
725 local_iter--;
726 if (local_iter == 0)
727 break;
728 word_count = 0;
729 last_word_count = 0;
730 sentence_length = 0;
731 fseek(fi, file_size / (long long) num_threads * (long long) id,
732 SEEK_SET);
733 continue;
734 }
735 word = sen[sentence_position];
736 if (word == -1)
737 continue;
738 for (c = 0; c < input_len_1; c++)
739 neu1[c] = 0;
740 for (c = 0; c < input_len_1; c++)
741 neu1e[c] = 0;
742 for (c = 0; c < input_len_2; c++)
743 neu2[c] = 0;
744 for (c = 0; c < input_len_2; c++)
745 neu2e[c] = 0;
746 next_random = next_random * (unsigned long long) 25214903917 + 11;
747 b = next_random % window;
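		// b shrinks the context window at random: for types 0 and 1 the
		// effective window size is window - b, i.e. uniform in [1, window];
		// types 2-4 below always use the full fixed window.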
748 if (type == 0) { //train the cbow architecture
749 // in -> hidden
750 cw = 0;
751 for (a = b; a < window * 2 + 1 - b; a++)
752 if (a != window) {
753 c = sentence_position - window + a;
754 if (c < 0)
755 continue;
756 if (c >= sentence_length)
757 continue;
758 last_word = sen[c];
759 if (last_word == -1)
760 continue;
761 for (c = 0; c < layer1_size; c++)
762 neu1[c] += syn0[c + last_word * layer1_size];
763 cw++;
764 }
765 if (cw) {
766 for (c = 0; c < layer1_size; c++)
767 neu1[c] /= cw;
768 if (hs)
769 for (d = 0; d < vocab[word].codelen; d++) {
770 f = 0;
771 l2 = vocab[word].point[d] * layer1_size;
772 // Propagate hidden -> output
773 for (c = 0; c < layer1_size; c++)
774 f += neu1[c] * syn1[c + l2];
775 if (f <= -MAX_EXP)
776 continue;
777 else if (f >= MAX_EXP)
778 continue;
779 else
780 f = expTable[(int) ((f + MAX_EXP)
781 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
782 // 'g' is the gradient multiplied by the learning rate
783 g = (1 - vocab[word].code[d] - f) * alpha;
784 // Propagate errors output -> hidden
785 for (c = 0; c < layer1_size; c++)
786 neu1e[c] += g * syn1[c + l2];
787 // Learn weights hidden -> output
788 for (c = 0; c < layer1_size; c++)
789 syn1[c + l2] += g * neu1[c];
790 if (cap == 1)
791 for (c = 0; c < layer1_size; c++)
792 capParam(syn1, c + l2);
793 }
794 // NEGATIVE SAMPLING
795 if (negative > 0)
796 for (d = 0; d < negative + 1; d++) {
797 if (d == 0) {
798 target = word;
799 label = 1;
800 } else {
801 next_random = next_random
802 * (unsigned long long) 25214903917 + 11;
803 if (word_to_group != NULL
804 && word_to_group[word] != -1) {
805 target = word;
806 while (target == word) {
807 target = group_to_table[word_to_group[word]
808 * table_size
809 + (next_random >> 16) % table_size];
810 next_random = next_random
811 * (unsigned long long) 25214903917
812 + 11;
813 }
814 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
815 } else {
816 target =
817 table[(next_random >> 16) % table_size];
818 }
819 if (target == 0)
820 target = next_random % (vocab_size - 1) + 1;
821 if (target == word)
822 continue;
823 label = 0;
824 }
825 l2 = target * layer1_size;
826 f = 0;
827 for (c = 0; c < layer1_size; c++)
828 f += neu1[c] * syn1neg[c + l2];
829 if (f > MAX_EXP)
830 g = (label - 1) * alpha;
831 else if (f < -MAX_EXP)
832 g = (label - 0) * alpha;
833 else
834 g = (label
835 - expTable[(int) ((f + MAX_EXP)
836 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
837 * alpha;
838 for (c = 0; c < layer1_size; c++)
839 neu1e[c] += g * syn1neg[c + l2];
840 for (c = 0; c < layer1_size; c++)
841 syn1neg[c + l2] += g * neu1[c];
842 if (cap == 1)
843 for (c = 0; c < layer1_size; c++)
844 capParam(syn1neg, c + l2);
845 }
846 // Noise Contrastive Estimation
847 if (nce > 0)
848 for (d = 0; d < nce + 1; d++) {
849 if (d == 0) {
850 target = word;
851 label = 1;
852 } else {
853 next_random = next_random
854 * (unsigned long long) 25214903917 + 11;
855 if (word_to_group != NULL
856 && word_to_group[word] != -1) {
857 target = word;
858 while (target == word) {
859 target = group_to_table[word_to_group[word]
860 * table_size
861 + (next_random >> 16) % table_size];
862 next_random = next_random
863 * (unsigned long long) 25214903917
864 + 11;
865 }
866 } else {
867 target =
868 table[(next_random >> 16) % table_size];
869 }
870 if (target == 0)
871 target = next_random % (vocab_size - 1) + 1;
872 if (target == word)
873 continue;
874 label = 0;
875 }
876 l2 = target * layer1_size;
877 f = 0;
878
879 for (c = 0; c < layer1_size; c++)
880 f += neu1[c] * syn1nce[c + l2];
881 if (f > MAX_EXP)
882 g = (label - 1) * alpha;
883 else if (f < -MAX_EXP)
884 g = (label - 0) * alpha;
885 else {
886 f = exp(f);
887 g =
888 (label
889 - f
890 / (noise_distribution[target]
891 * nce + f)) * alpha;
892 }
893 for (c = 0; c < layer1_size; c++)
894 neu1e[c] += g * syn1nce[c + l2];
895 for (c = 0; c < layer1_size; c++)
896 syn1nce[c + l2] += g * neu1[c];
897 if (cap == 1)
898 for (c = 0; c < layer1_size; c++)
899 capParam(syn1nce, c + l2);
900 }
901 // hidden -> in
902 for (a = b; a < window * 2 + 1 - b; a++)
903 if (a != window) {
904 c = sentence_position - window + a;
905 if (c < 0)
906 continue;
907 if (c >= sentence_length)
908 continue;
909 last_word = sen[c];
910 if (last_word == -1)
911 continue;
912 for (c = 0; c < layer1_size; c++)
913 syn0[c + last_word * layer1_size] += neu1e[c];
914 }
915 }
916 } else if (type == 1) { //train skip-gram
917 for (a = b; a < window * 2 + 1 - b; a++)
918 if (a != window) {
919 c = sentence_position - window + a;
920 if (c < 0)
921 continue;
922 if (c >= sentence_length)
923 continue;
924 last_word = sen[c];
925 if (last_word == -1)
926 continue;
927 l1 = last_word * layer1_size;
928 for (c = 0; c < layer1_size; c++)
929 neu1e[c] = 0;
930 // HIERARCHICAL SOFTMAX
931 if (hs)
932 for (d = 0; d < vocab[word].codelen; d++) {
933 f = 0;
934 l2 = vocab[word].point[d] * layer1_size;
935 // Propagate hidden -> output
936 for (c = 0; c < layer1_size; c++)
937 f += syn0[c + l1] * syn1[c + l2];
938 if (f <= -MAX_EXP)
939 continue;
940 else if (f >= MAX_EXP)
941 continue;
942 else
943 f = expTable[(int) ((f + MAX_EXP)
944 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
945 // 'g' is the gradient multiplied by the learning rate
946 g = (1 - vocab[word].code[d] - f) * alpha;
947 // Propagate errors output -> hidden
948 for (c = 0; c < layer1_size; c++)
949 neu1e[c] += g * syn1[c + l2];
950 // Learn weights hidden -> output
951 for (c = 0; c < layer1_size; c++)
952 syn1[c + l2] += g * syn0[c + l1];
953 if (cap == 1)
954 for (c = 0; c < layer1_size; c++)
955 capParam(syn1, c + l2);
956 }
957 // NEGATIVE SAMPLING
958 if (negative > 0)
959 for (d = 0; d < negative + 1; d++) {
960 if (d == 0) {
961 target = word;
962 label = 1;
963 } else {
964 next_random = next_random
965 * (unsigned long long) 25214903917 + 11;
966 if (word_to_group != NULL
967 && word_to_group[word] != -1) {
968 target = word;
969 while (target == word) {
970 target =
971 group_to_table[word_to_group[word]
972 * table_size
973 + (next_random >> 16)
974 % table_size];
975 next_random =
976 next_random
977 * (unsigned long long) 25214903917
978 + 11;
979 }
980 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
981 } else {
982 target = table[(next_random >> 16)
983 % table_size];
984 }
985 if (target == 0)
986 target = next_random % (vocab_size - 1) + 1;
987 if (target == word)
988 continue;
989 label = 0;
990 }
991 l2 = target * layer1_size;
992 f = 0;
993 for (c = 0; c < layer1_size; c++)
994 f += syn0[c + l1] * syn1neg[c + l2];
995 if (f > MAX_EXP)
996 g = (label - 1) * alpha;
997 else if (f < -MAX_EXP)
998 g = (label - 0) * alpha;
999 else
1000 g =
1001 (label
1002 - expTable[(int) ((f + MAX_EXP)
1003 * (EXP_TABLE_SIZE
1004 / MAX_EXP / 2))])
1005 * alpha;
1006 for (c = 0; c < layer1_size; c++)
1007 neu1e[c] += g * syn1neg[c + l2];
1008 for (c = 0; c < layer1_size; c++)
1009 syn1neg[c + l2] += g * syn0[c + l1];
1010 if (cap == 1)
1011 for (c = 0; c < layer1_size; c++)
1012 capParam(syn1neg, c + l2);
1013 }
1014 //Noise Contrastive Estimation
1015 if (nce > 0)
1016 for (d = 0; d < nce + 1; d++) {
1017 if (d == 0) {
1018 target = word;
1019 label = 1;
1020 } else {
1021 next_random = next_random
1022 * (unsigned long long) 25214903917 + 11;
1023 if (word_to_group != NULL
1024 && word_to_group[word] != -1) {
1025 target = word;
1026 while (target == word) {
1027 target =
1028 group_to_table[word_to_group[word]
1029 * table_size
1030 + (next_random >> 16)
1031 % table_size];
1032 next_random =
1033 next_random
1034 * (unsigned long long) 25214903917
1035 + 11;
1036 }
1037 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1038 } else {
1039 target = table[(next_random >> 16)
1040 % table_size];
1041 }
1042 if (target == 0)
1043 target = next_random % (vocab_size - 1) + 1;
1044 if (target == word)
1045 continue;
1046 label = 0;
1047 }
1048 l2 = target * layer1_size;
1049 f = 0;
1050 for (c = 0; c < layer1_size; c++)
1051 f += syn0[c + l1] * syn1nce[c + l2];
1052 if (f > MAX_EXP)
1053 g = (label - 1) * alpha;
1054 else if (f < -MAX_EXP)
1055 g = (label - 0) * alpha;
1056 else {
1057 f = exp(f);
1058 g = (label
1059 - f
1060 / (noise_distribution[target]
1061 * nce + f)) * alpha;
1062 }
1063 for (c = 0; c < layer1_size; c++)
1064 neu1e[c] += g * syn1nce[c + l2];
1065 for (c = 0; c < layer1_size; c++)
1066 syn1nce[c + l2] += g * syn0[c + l1];
1067 if (cap == 1)
1068 for (c = 0; c < layer1_size; c++)
1069 capParam(syn1nce, c + l2);
1070 }
1071 // Learn weights input -> hidden
1072 for (c = 0; c < layer1_size; c++)
1073 syn0[c + l1] += neu1e[c];
1074 }
1075 } else if (type == 2) { //train the cwindow architecture
1076 // in -> hidden
1077 cw = 0;
1078 for (a = 0; a < window * 2 + 1; a++)
1079 if (a != window) {
1080 c = sentence_position - window + a;
1081 if (c < 0)
1082 continue;
1083 if (c >= sentence_length)
1084 continue;
1085 last_word = sen[c];
1086 if (last_word == -1)
1087 continue;
1088 window_offset = a * layer1_size;
1089 if (a > window)
1090 window_offset -= layer1_size;
1091 for (c = 0; c < layer1_size; c++)
1092 neu1[c + window_offset] += syn0[c
1093 + last_word * layer1_size];
1094 cw++;
1095 }
1096 if (cw) {
1097 if (hs)
1098 for (d = 0; d < vocab[word].codelen; d++) {
1099 f = 0;
1100 l2 = vocab[word].point[d] * window_layer_size;
1101 // Propagate hidden -> output
1102 for (c = 0; c < window_layer_size; c++)
1103 f += neu1[c] * syn1_window[c + l2];
1104 if (f <= -MAX_EXP)
1105 continue;
1106 else if (f >= MAX_EXP)
1107 continue;
1108 else
1109 f = expTable[(int) ((f + MAX_EXP)
1110 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1111 // 'g' is the gradient multiplied by the learning rate
1112 g = (1 - vocab[word].code[d] - f) * alpha;
1113 // Propagate errors output -> hidden
1114 for (c = 0; c < window_layer_size; c++)
1115 neu1e[c] += g * syn1_window[c + l2];
1116 // Learn weights hidden -> output
1117 for (c = 0; c < window_layer_size; c++)
1118 syn1_window[c + l2] += g * neu1[c];
1119 if (cap == 1)
1120 for (c = 0; c < window_layer_size; c++)
1121 capParam(syn1_window, c + l2);
1122 }
1123 // NEGATIVE SAMPLING
1124 if (negative > 0)
1125 for (d = 0; d < negative + 1; d++) {
1126 if (d == 0) {
1127 target = word;
1128 label = 1;
1129 } else {
1130 next_random = next_random
1131 * (unsigned long long) 25214903917 + 11;
1132 if (word_to_group != NULL
1133 && word_to_group[word] != -1) {
1134 target = word;
1135 while (target == word) {
1136 target = group_to_table[word_to_group[word]
1137 * table_size
1138 + (next_random >> 16) % table_size];
1139 next_random = next_random
1140 * (unsigned long long) 25214903917
1141 + 11;
1142 }
1143 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1144 } else {
1145 target =
1146 table[(next_random >> 16) % table_size];
1147 }
1148 if (target == 0)
1149 target = next_random % (vocab_size - 1) + 1;
1150 if (target == word)
1151 continue;
1152 label = 0;
1153 }
1154 l2 = target * window_layer_size;
1155 f = 0;
1156 for (c = 0; c < window_layer_size; c++)
1157 f += neu1[c] * syn1neg_window[c + l2];
1158 if (f > MAX_EXP)
1159 g = (label - 1) * alpha;
1160 else if (f < -MAX_EXP)
1161 g = (label - 0) * alpha;
1162 else
1163 g = (label
1164 - expTable[(int) ((f + MAX_EXP)
1165 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1166 * alpha;
1167 for (c = 0; c < window_layer_size; c++)
1168 neu1e[c] += g * syn1neg_window[c + l2];
1169 for (c = 0; c < window_layer_size; c++)
1170 syn1neg_window[c + l2] += g * neu1[c];
1171 if (cap == 1)
1172 for (c = 0; c < window_layer_size; c++)
1173 capParam(syn1neg_window, c + l2);
1174 }
1175 // Noise Contrastive Estimation
1176 if (nce > 0)
1177 for (d = 0; d < nce + 1; d++) {
1178 if (d == 0) {
1179 target = word;
1180 label = 1;
1181 } else {
1182 next_random = next_random
1183 * (unsigned long long) 25214903917 + 11;
1184 if (word_to_group != NULL
1185 && word_to_group[word] != -1) {
1186 target = word;
1187 while (target == word) {
1188 target = group_to_table[word_to_group[word]
1189 * table_size
1190 + (next_random >> 16) % table_size];
1191 next_random = next_random
1192 * (unsigned long long) 25214903917
1193 + 11;
1194 }
1195 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1196 } else {
1197 target =
1198 table[(next_random >> 16) % table_size];
1199 }
1200 if (target == 0)
1201 target = next_random % (vocab_size - 1) + 1;
1202 if (target == word)
1203 continue;
1204 label = 0;
1205 }
1206 l2 = target * window_layer_size;
1207 f = 0;
1208 for (c = 0; c < window_layer_size; c++)
1209 f += neu1[c] * syn1nce_window[c + l2];
1210 if (f > MAX_EXP)
1211 g = (label - 1) * alpha;
1212 else if (f < -MAX_EXP)
1213 g = (label - 0) * alpha;
1214 else {
1215 f = exp(f);
1216 g =
1217 (label
1218 - f
1219 / (noise_distribution[target]
1220 * nce + f)) * alpha;
1221 }
1222 for (c = 0; c < window_layer_size; c++)
1223 neu1e[c] += g * syn1nce_window[c + l2];
1224 for (c = 0; c < window_layer_size; c++)
1225 syn1nce_window[c + l2] += g * neu1[c];
1226 if (cap == 1)
1227 for (c = 0; c < window_layer_size; c++)
1228 capParam(syn1nce_window, c + l2);
1229 }
1230 // hidden -> in
1231 for (a = 0; a < window * 2 + 1; a++)
1232 if (a != window) {
1233 c = sentence_position - window + a;
1234 if (c < 0)
1235 continue;
1236 if (c >= sentence_length)
1237 continue;
1238 last_word = sen[c];
1239 if (last_word == -1)
1240 continue;
1241 window_offset = a * layer1_size;
1242 if (a > window)
1243 window_offset -= layer1_size;
1244 for (c = 0; c < layer1_size; c++)
1245 syn0[c + last_word * layer1_size] += neu1e[c
1246 + window_offset];
1247 }
1248 }
1249 } else if (type == 3) { //train structured skip-gram
1250 for (a = 0; a < window * 2 + 1; a++)
1251 if (a != window) {
1252 c = sentence_position - window + a;
1253 if (c < 0)
1254 continue;
1255 if (c >= sentence_length)
1256 continue;
1257 last_word = sen[c];
1258 if (last_word == -1)
1259 continue;
1260 l1 = last_word * layer1_size;
1261 window_offset = a * layer1_size;
1262 if (a > window)
1263 window_offset -= layer1_size;
1264 for (c = 0; c < layer1_size; c++)
1265 neu1e[c] = 0;
1266 // HIERARCHICAL SOFTMAX
1267 if (hs)
1268 for (d = 0; d < vocab[word].codelen; d++) {
1269 f = 0;
1270 l2 = vocab[word].point[d] * window_layer_size;
1271 // Propagate hidden -> output
1272 for (c = 0; c < layer1_size; c++)
1273 f += syn0[c + l1]
1274 * syn1_window[c + l2 + window_offset];
1275 if (f <= -MAX_EXP)
1276 continue;
1277 else if (f >= MAX_EXP)
1278 continue;
1279 else
1280 f = expTable[(int) ((f + MAX_EXP)
1281 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1282 // 'g' is the gradient multiplied by the learning rate
1283 g = (1 - vocab[word].code[d] - f) * alpha;
1284 // Propagate errors output -> hidden
1285 for (c = 0; c < layer1_size; c++)
1286 neu1e[c] += g
1287 * syn1_window[c + l2 + window_offset];
1288 // Learn weights hidden -> output
1289 for (c = 0; c < layer1_size; c++)
1290						syn1_window[c + l2 + window_offset] += g
1291								* syn0[c + l1];
1292 if (cap == 1)
1293 for (c = 0; c < layer1_size; c++)
1294							capParam(syn1_window, c + l2 + window_offset);
1295 }
1296 // NEGATIVE SAMPLING
1297 if (negative > 0)
1298 for (d = 0; d < negative + 1; d++) {
1299 if (d == 0) {
1300 target = word;
1301 label = 1;
1302 } else {
1303 next_random = next_random
1304 * (unsigned long long) 25214903917 + 11;
1305 if (word_to_group != NULL
1306 && word_to_group[word] != -1) {
1307 target = word;
1308 while (target == word) {
1309 target =
1310 group_to_table[word_to_group[word]
1311 * table_size
1312 + (next_random >> 16)
1313 % table_size];
1314 next_random =
1315 next_random
1316 * (unsigned long long) 25214903917
1317 + 11;
1318 }
1319 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1320 } else {
1321 target = table[(next_random >> 16)
1322 % table_size];
1323 }
1324 if (target == 0)
1325 target = next_random % (vocab_size - 1) + 1;
1326 if (target == word)
1327 continue;
1328 label = 0;
1329 }
1330 l2 = target * window_layer_size;
1331 f = 0;
1332 for (c = 0; c < layer1_size; c++)
1333 f +=
1334 syn0[c + l1]
1335 * syn1neg_window[c + l2
1336 + window_offset];
1337 if (f > MAX_EXP)
1338 g = (label - 1) * alpha;
1339 else if (f < -MAX_EXP)
1340 g = (label - 0) * alpha;
1341 else
1342 g =
1343 (label
1344 - expTable[(int) ((f + MAX_EXP)
1345 * (EXP_TABLE_SIZE
1346 / MAX_EXP / 2))])
1347 * alpha;
1348 for (c = 0; c < layer1_size; c++)
1349 neu1e[c] +=
1350 g
1351 * syn1neg_window[c + l2
1352 + window_offset];
1353 for (c = 0; c < layer1_size; c++)
1354 syn1neg_window[c + l2 + window_offset] += g
1355 * syn0[c + l1];
1356 if (cap == 1)
1357 for (c = 0; c < layer1_size; c++)
1358 capParam(syn1neg_window,
1359 c + l2 + window_offset);
1360 }
1361					// Noise Contrastive Estimation
1362 if (nce > 0)
1363 for (d = 0; d < nce + 1; d++) {
1364 if (d == 0) {
1365 target = word;
1366 label = 1;
1367 } else {
1368 next_random = next_random
1369 * (unsigned long long) 25214903917 + 11;
1370 if (word_to_group != NULL
1371 && word_to_group[word] != -1) {
1372 target = word;
1373 while (target == word) {
1374 target =
1375 group_to_table[word_to_group[word]
1376 * table_size
1377 + (next_random >> 16)
1378 % table_size];
1379 next_random =
1380 next_random
1381 * (unsigned long long) 25214903917
1382 + 11;
1383 }
1384 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1385 } else {
1386 target = table[(next_random >> 16)
1387 % table_size];
1388 }
1389 if (target == 0)
1390 target = next_random % (vocab_size - 1) + 1;
1391 if (target == word)
1392 continue;
1393 label = 0;
1394 }
1395 l2 = target * window_layer_size;
1396 f = 0;
1397 for (c = 0; c < layer1_size; c++)
1398 f +=
1399 syn0[c + l1]
1400 * syn1nce_window[c + l2
1401 + window_offset];
1402 if (f > MAX_EXP)
1403 g = (label - 1) * alpha;
1404 else if (f < -MAX_EXP)
1405 g = (label - 0) * alpha;
1406 else {
1407 f = exp(f);
1408 g = (label
1409 - f
1410 / (noise_distribution[target]
1411 * nce + f)) * alpha;
1412 }
1413 for (c = 0; c < layer1_size; c++)
1414 neu1e[c] +=
1415 g
1416 * syn1nce_window[c + l2
1417 + window_offset];
1418 for (c = 0; c < layer1_size; c++)
1419 syn1nce_window[c + l2 + window_offset] += g
1420 * syn0[c + l1];
1421 if (cap == 1)
1422 for (c = 0; c < layer1_size; c++)
1423 capParam(syn1nce_window,
1424 c + l2 + window_offset);
1425 }
1426 // Learn weights input -> hidden
1427 for (c = 0; c < layer1_size; c++) {
1428 syn0[c + l1] += neu1e[c];
1429 if (syn0[c + l1] > 50)
1430 syn0[c + l1] = 50;
1431 if (syn0[c + l1] < -50)
1432 syn0[c + l1] = -50;
1433 }
1434 }
1435		} else if (type == 4) { //train the senna-style architecture
1436 // in -> hidden
1437 cw = 0;
1438 for (a = 0; a < window * 2 + 1; a++)
1439 if (a != window) {
1440 c = sentence_position - window + a;
1441 if (c < 0)
1442 continue;
1443 if (c >= sentence_length)
1444 continue;
1445 last_word = sen[c];
1446 if (last_word == -1)
1447 continue;
1448 window_offset = a * layer1_size;
1449 if (a > window)
1450 window_offset -= layer1_size;
1451 for (c = 0; c < layer1_size; c++)
1452 neu1[c + window_offset] += syn0[c
1453 + last_word * layer1_size];
1454 cw++;
1455 }
1456 if (cw) {
1457 for (a = 0; a < window_hidden_size; a++) {
1458 c = a * window_layer_size;
1459 for (b = 0; b < window_layer_size; b++) {
1460 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1461 }
1462 }
1463 if (hs)
1464 for (d = 0; d < vocab[word].codelen; d++) {
1465 f = 0;
1466 l2 = vocab[word].point[d] * window_hidden_size;
1467 // Propagate hidden -> output
1468 for (c = 0; c < window_hidden_size; c++)
1469 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1470 if (f <= -MAX_EXP)
1471 continue;
1472 else if (f >= MAX_EXP)
1473 continue;
1474 else
1475 f = expTable[(int) ((f + MAX_EXP)
1476 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1477 // 'g' is the gradient multiplied by the learning rate
1478 g = (1 - vocab[word].code[d] - f) * alpha;
1479 // Propagate errors output -> hidden
1480 for (c = 0; c < window_hidden_size; c++)
1481 neu2e[c] += dHardTanh(neu2[c], g) * g
1482 * syn_hidden_word[c + l2];
1483 // Learn weights hidden -> output
1484 for (c = 0; c < window_hidden_size; c++)
1485 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1486 * neu2[c];
1487 }
1488 // NEGATIVE SAMPLING
1489 if (negative > 0)
1490 for (d = 0; d < negative + 1; d++) {
1491 if (d == 0) {
1492 target = word;
1493 label = 1;
1494 } else {
1495 next_random = next_random
1496 * (unsigned long long) 25214903917 + 11;
1497 if (word_to_group != NULL
1498 && word_to_group[word] != -1) {
1499 target = word;
1500 while (target == word) {
1501 target = group_to_table[word_to_group[word]
1502 * table_size
1503 + (next_random >> 16) % table_size];
1504 next_random = next_random
1505 * (unsigned long long) 25214903917
1506 + 11;
1507 }
1508 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1509 } else {
1510 target =
1511 table[(next_random >> 16) % table_size];
1512 }
1513 if (target == 0)
1514 target = next_random % (vocab_size - 1) + 1;
1515 if (target == word)
1516 continue;
1517 label = 0;
1518 }
1519 l2 = target * window_hidden_size;
1520 f = 0;
1521 for (c = 0; c < window_hidden_size; c++)
1522 f += hardTanh(neu2[c])
1523 * syn_hidden_word_neg[c + l2];
1524 if (f > MAX_EXP)
1525 g = (label - 1) * alpha / negative;
1526 else if (f < -MAX_EXP)
1527 g = (label - 0) * alpha / negative;
1528 else
1529 g = (label
1530 - expTable[(int) ((f + MAX_EXP)
1531 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1532 * alpha / negative;
1533 for (c = 0; c < window_hidden_size; c++)
1534 neu2e[c] += dHardTanh(neu2[c], g) * g
1535 * syn_hidden_word_neg[c + l2];
1536 for (c = 0; c < window_hidden_size; c++)
1537 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1538 * g * neu2[c];
1539 }
1540 for (a = 0; a < window_hidden_size; a++)
1541 for (b = 0; b < window_layer_size; b++)
1542 neu1e[b] += neu2e[a]
1543 * syn_window_hidden[a * window_layer_size + b];
1544 for (a = 0; a < window_hidden_size; a++)
1545 for (b = 0; b < window_layer_size; b++)
1546 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1547 * neu1[b];
1548 // hidden -> in
1549 for (a = 0; a < window * 2 + 1; a++)
1550 if (a != window) {
1551 c = sentence_position - window + a;
1552 if (c < 0)
1553 continue;
1554 if (c >= sentence_length)
1555 continue;
1556 last_word = sen[c];
1557 if (last_word == -1)
1558 continue;
1559 window_offset = a * layer1_size;
1560 if (a > window)
1561 window_offset -= layer1_size;
1562 for (c = 0; c < layer1_size; c++)
1563 syn0[c + last_word * layer1_size] += neu1e[c
1564 + window_offset];
1565 }
1566 }
1567 } else {
1568			printf("unknown type %i\n", type);
1569			exit(1);
1570 }
1571 sentence_position++;
1572 if (sentence_position >= sentence_length) {
1573 sentence_length = 0;
1574 continue;
1575 }
1576 }
1577 fclose(fi);
1578 free(neu1);
1579 free(neu1e);
1580 pthread_exit(NULL);
1581}
1582
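// TrainModel builds or loads the vocabulary, initializes the network, runs
// num_threads training threads, and then writes either the word vectors
// (text or binary, depending on -binary) or, when -classes > 0, K-means
// class assignments computed from those vectors.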
1583void TrainModel() {
1584 long a, b, c, d;
1585 FILE *fo;
1586 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1587 printf("Starting training using file %s\n", train_file);
1588 starting_alpha = alpha;
1589 if (read_vocab_file[0] != 0)
1590 ReadVocab();
1591 else
1592 LearnVocabFromTrainFile();
1593 if (save_vocab_file[0] != 0)
1594 SaveVocab();
1595 if (output_file[0] == 0)
1596 return;
1597 InitNet();
1598 if (negative > 0 || nce > 0)
1599 InitUnigramTable();
1600 if (negative_classes_file[0] != 0)
1601 InitClassUnigramTable();
1602 start = clock();
1603 for (a = 0; a < num_threads; a++)
1604 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1605 for (a = 0; a < num_threads; a++)
1606 pthread_join(pt[a], NULL);
1607 fo = fopen(output_file, "wb");
1608 if (classes == 0) {
1609 // Save the word vectors
1610 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1611 for (a = 0; a < vocab_size; a++) {
1612 fprintf(fo, "%s ", vocab[a].word);
1613 if (binary)
1614 for (b = 0; b < layer1_size; b++)
1615 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1616 else
1617 for (b = 0; b < layer1_size; b++)
1618 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1619 fprintf(fo, "\n");
1620 }
1621 } else {
1622 // Run K-means on the word vectors
1623 int clcn = classes, iter = 10, closeid;
1624 int *centcn = (int *) malloc(classes * sizeof(int));
1625 int *cl = (int *) calloc(vocab_size, sizeof(int));
1626 real closev, x;
1627 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1628 for (a = 0; a < vocab_size; a++)
1629 cl[a] = a % clcn;
1630 for (a = 0; a < iter; a++) {
1631 for (b = 0; b < clcn * layer1_size; b++)
1632 cent[b] = 0;
1633 for (b = 0; b < clcn; b++)
1634 centcn[b] = 1;
1635 for (c = 0; c < vocab_size; c++) {
1636 for (d = 0; d < layer1_size; d++)
1637 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1638 centcn[cl[c]]++;
1639 }
1640 for (b = 0; b < clcn; b++) {
1641 closev = 0;
1642 for (c = 0; c < layer1_size; c++) {
1643 cent[layer1_size * b + c] /= centcn[b];
1644 closev += cent[layer1_size * b + c]
1645 * cent[layer1_size * b + c];
1646 }
1647 closev = sqrt(closev);
1648 for (c = 0; c < layer1_size; c++)
1649 cent[layer1_size * b + c] /= closev;
1650 }
1651 for (c = 0; c < vocab_size; c++) {
1652 closev = -10;
1653 closeid = 0;
1654 for (d = 0; d < clcn; d++) {
1655 x = 0;
1656 for (b = 0; b < layer1_size; b++)
1657 x += cent[layer1_size * d + b]
1658 * syn0[c * layer1_size + b];
1659 if (x > closev) {
1660 closev = x;
1661 closeid = d;
1662 }
1663 }
1664 cl[c] = closeid;
1665 }
1666 }
1667 // Save the K-means classes
1668 for (a = 0; a < vocab_size; a++)
1669 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1670 free(centcn);
1671 free(cent);
1672 free(cl);
1673 }
1674 fclose(fo);
1675 if (save_net_file[0] != 0)
1676 SaveNet();
1677}
1678
1679int ArgPos(char *str, int argc, char **argv) {
1680 int a;
1681 for (a = 1; a < argc; a++)
1682 if (!strcmp(str, argv[a])) {
1683 if (a == argc - 1) {
1684 printf("Argument missing for %s\n", str);
1685 exit(1);
1686 }
1687 return a;
1688 }
1689 return -1;
1690}
1691
1692int main(int argc, char **argv) {
1693 int i;
1694 if (argc == 1) {
1695 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1696 printf("Options:\n");
1697 printf("Parameters for training:\n");
1698 printf("\t-train <file>\n");
1699 printf("\t\tUse text data from <file> to train the model\n");
1700 printf("\t-output <file>\n");
1701 printf(
1702 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1703 printf("\t-size <int>\n");
1704 printf("\t\tSet size of word vectors; default is 100\n");
1705 printf("\t-window <int>\n");
1706 printf("\t\tSet max skip length between words; default is 5\n");
1707 printf("\t-sample <float>\n");
1708 printf(
1709 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1710 printf(
1711 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1712 printf("\t-hs <int>\n");
1713 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1714 printf("\t-negative <int>\n");
1715 printf(
1716 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1717 printf("\t-negative-classes <file>\n");
1718 printf("\t\tNegative classes to sample from\n");
1719 printf("\t-nce <int>\n");
1720 printf(
1721 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1722 printf("\t-threads <int>\n");
1723 printf("\t\tUse <int> threads (default 12)\n");
1724 printf("\t-iter <int>\n");
1725 printf("\t\tRun more training iterations (default 5)\n");
1726 printf("\t-min-count <int>\n");
1727 printf(
1728 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1729 printf("\t-alpha <float>\n");
1730 printf(
1731 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1732 printf("\t-classes <int>\n");
1733 printf(
1734 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1735 printf("\t-debug <int>\n");
1736 printf(
1737 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1738 printf("\t-binary <int>\n");
1739 printf(
1740				"\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1741 printf("\t-save-vocab <file>\n");
1742 printf("\t\tThe vocabulary will be saved to <file>\n");
1743 printf("\t-read-vocab <file>\n");
1744 printf(
1745 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1746 printf("\t-read-net <file>\n");
1747 printf(
1748 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1749 printf("\t-save-net <file>\n");
1750 printf("\t\tThe net parameters will be saved to <file>\n");
1751 printf("\t-type <int>\n");
1752 printf(
1753				"\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
1754 printf("\t-cap <int>\n");
1755 printf(
1756				"\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1757 printf("\nExamples:\n");
1758 printf(
1759 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1760 return 0;
1761 }
1762 output_file[0] = 0;
1763 save_vocab_file[0] = 0;
1764 read_vocab_file[0] = 0;
1765 save_net_file[0] = 0;
1766 read_net_file[0] = 0;
1767 negative_classes_file[0] = 0;
1768 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1769 layer1_size = atoi(argv[i + 1]);
1770 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1771 strcpy(train_file, argv[i + 1]);
1772 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1773 strcpy(save_vocab_file, argv[i + 1]);
1774 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1775 strcpy(read_vocab_file, argv[i + 1]);
1776 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1777 strcpy(save_net_file, argv[i + 1]);
1778 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1779 strcpy(read_net_file, argv[i + 1]);
1780 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1781 debug_mode = atoi(argv[i + 1]);
1782 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1783 binary = atoi(argv[i + 1]);
1784 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1785 type = atoi(argv[i + 1]);
1786 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1787 strcpy(output_file, argv[i + 1]);
1788 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1789 window = atoi(argv[i + 1]);
1790 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1791 sample = atof(argv[i + 1]);
1792 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1793 hs = atoi(argv[i + 1]);
1794 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1795 negative = atoi(argv[i + 1]);
1796 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1797 strcpy(negative_classes_file, argv[i + 1]);
1798 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1799 nce = atoi(argv[i + 1]);
1800 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1801 num_threads = atoi(argv[i + 1]);
1802 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1803 iter = atoi(argv[i + 1]);
1804 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1805 min_count = atoi(argv[i + 1]);
1806 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1807 classes = atoi(argv[i + 1]);
1808 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1809 cap = atoi(argv[i + 1]);
1810 if (type == 0 || type == 2 || type == 4)
1811 alpha = 0.05;
1812 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1813 alpha = atof(argv[i + 1]);
1814 vocab = (struct vocab_word *) calloc(vocab_max_size,
1815 sizeof(struct vocab_word));
1816 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1817 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1818 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1819 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1820 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1821 }
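	// expTable[i] holds sigmoid(x) for x = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP,
	// i.e. EXP_TABLE_SIZE samples of the logistic function on [-MAX_EXP, MAX_EXP];
	// the training code looks it up with index (f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2).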
1822 TrainModel();
1823 return 0;
1824}
1825