1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
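/*
 * Build/usage sketch (illustrative, not part of the original file; it assumes this
 * translation unit is saved as word2vec.c and a POSIX toolchain with pthreads):
 *   gcc word2vec.c -o word2vec -lm -pthread -O3
 *   ./word2vec -train data.txt -output vec.txt -size 200 -window 5 -negative 5 -type 3 -threads 12
 * See the usage text printed by main() for the full option list.
 */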
27const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
40struct vocab_word *vocab;
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
42 num_threads = 12, min_reduce = 1;
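// 'type' selects the training architecture: 0 = CBOW, 1 = skip-gram (default),
// 2 = CWINDOW, 3 = structured skip-gram, 4 = SENNA-style (see -type in the usage text).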
43int *vocab_hash;
44long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
45long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
46 classes = 0;
47real alpha = 0.025, starting_alpha, sample = 1e-3;
48real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
49clock_t start;
50
51real *syn1_window, *syn1neg_window, *syn1nce_window;
52int w_offset, window_layer_size;
53
54int window_hidden_size = 500;
55real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
56 *syn_hidden_word_nce;
57
58int hs = 0, negative = 5;
59const int table_size = 1e8;
60int *table;
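// Unigram table for negative sampling: table_size precomputed word indices,
// filled in InitUnigramTable().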
61
62// contrastive negative sampling
63char negative_classes_file[MAX_STRING];
64int *word_to_group;
65int *group_to_table; //group_size*table_size
66int class_number;
67
68//nce
69real* noise_distribution;
70int nce = 0;
71
72//param caps
73real CAP_VALUE = 50;
74int cap = 0;
75
76void capParam(real* array, int index) {
77 if (array[index] > CAP_VALUE)
78 array[index] = CAP_VALUE;
79 else if (array[index] < -CAP_VALUE)
80 array[index] = -CAP_VALUE;
81}
82
83real hardTanh(real x) {
84 if (x >= 1) {
85 return 1;
86 } else if (x <= -1) {
87 return -1;
88 } else {
89 return x;
90 }
91}
92
93real dHardTanh(real x, real g) {
94 if (x > 1 && g > 0) {
95 return 0;
96 }
97 if (x < -1 && g < 0) {
98 return 0;
99 }
100 return 1;
101}
102
103void InitUnigramTable() {
104 int a, i;
105 long long train_words_pow = 0;
106 real d1, power = 0.75;
107 table = (int *) malloc(table_size * sizeof(int));
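 // Fill the table so that word a occupies a share of slots proportional to
 // cn(a)^0.75; sampling a uniform slot then draws negatives from this
 // smoothed unigram distribution.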
108 for (a = 0; a < vocab_size; a++)
109 train_words_pow += pow(vocab[a].cn, power);
110 i = 0;
111 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
112 for (a = 0; a < table_size; a++) {
113 table[a] = i;
114 if (a / (real) table_size > d1) {
115 i++;
116 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
117 }
118 if (i >= vocab_size)
119 i = vocab_size - 1;
120 }
121
122 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
123 for (a = 0; a < vocab_size; a++)
124 noise_distribution[a] = pow(vocab[a].cn, power)
125 / (real) train_words_pow;
126}
127
128// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
129void ReadWord(char *word, FILE *fin) {
130 int a = 0, ch;
131 while ((ch = fgetc(fin)) != EOF) {
133 if (ch == 13)
134 continue;
135 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
136 if (a > 0) {
137 if (ch == '\n')
138 ungetc(ch, fin);
139 break;
140 }
141 if (ch == '\n') {
142 strcpy(word, (char *) "</s>");
143 return;
144 } else
145 continue;
146 }
147 word[a] = ch;
148 a++;
149 if (a >= MAX_STRING - 1)
150 a--; // Truncate too long words
151 }
152 word[a] = 0;
153}
154
155// Returns hash value of a word
156int GetWordHash(char *word) {
157 unsigned long long a, hash = 0;
158 for (a = 0; a < strlen(word); a++)
159 hash = hash * 257 + word[a];
160 hash = hash % vocab_hash_size;
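 // Polynomial rolling hash (base 257) reduced modulo vocab_hash_size; collisions
 // are resolved by linear probing in SearchVocab() and AddWordToVocab().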
161 return hash;
162}
163
164// Returns position of a word in the vocabulary; if the word is not found, returns -1
165int SearchVocab(char *word) {
166 unsigned int hash = GetWordHash(word);
167 while (1) {
168 if (vocab_hash[hash] == -1)
169 return -1;
170 if (!strcmp(word, vocab[vocab_hash[hash]].word))
171 return vocab_hash[hash];
172 hash = (hash + 1) % vocab_hash_size;
173 }
174 return -1;
175}
176
177// Reads a word and returns its index in the vocabulary
178int ReadWordIndex(FILE *fin) {
179 char word[MAX_STRING];
180 ReadWord(word, fin);
181 if (feof(fin))
182 return -1;
183 return SearchVocab(word);
184}
185
186// Adds a word to the vocabulary
187int AddWordToVocab(char *word) {
188 unsigned int hash, length = strlen(word) + 1;
189 if (length > MAX_STRING)
190 length = MAX_STRING;
191 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
192 strcpy(vocab[vocab_size].word, word);
193 vocab[vocab_size].cn = 0;
194 vocab_size++;
195 // Reallocate memory if needed
196 if (vocab_size + 2 >= vocab_max_size) {
197 vocab_max_size += 1000;
198 vocab = (struct vocab_word *) realloc(vocab,
199 vocab_max_size * sizeof(struct vocab_word));
200 }
201 hash = GetWordHash(word);
202 while (vocab_hash[hash] != -1)
203 hash = (hash + 1) % vocab_hash_size;
204 vocab_hash[hash] = vocab_size - 1;
205 return vocab_size - 1;
206}
207
208// Used later for sorting by word counts
209int VocabCompare(const void *a, const void *b) {
210 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
211}
212
213// Sorts the vocabulary by frequency using word counts
214void SortVocab() {
215 int a, size;
216 unsigned int hash;
217 // Sort the vocabulary and keep </s> at the first position
218 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
219 for (a = 0; a < vocab_hash_size; a++)
220 vocab_hash[a] = -1;
221 size = vocab_size;
222 train_words = 0;
223 for (a = 0; a < size; a++) {
224 // Words occurring less than min_count times will be discarded from the vocab
225 if ((vocab[a].cn < min_count) && (a != 0)) {
226 vocab_size--;
227 free(vocab[a].word);
228 } else {
229 // The hash has to be recomputed, since it is no longer valid after sorting
230 hash = GetWordHash(vocab[a].word);
231 while (vocab_hash[hash] != -1)
232 hash = (hash + 1) % vocab_hash_size;
233 vocab_hash[hash] = a;
234 train_words += vocab[a].cn;
235 }
236 }
237 vocab = (struct vocab_word *) realloc(vocab,
238 (vocab_size + 1) * sizeof(struct vocab_word));
239 // Allocate memory for the binary tree construction
240 for (a = 0; a < vocab_size; a++) {
241 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
242 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
243 }
244}
245
246// Reduces the vocabulary by removing infrequent tokens
247void ReduceVocab() {
248 int a, b = 0;
249 unsigned int hash;
250 for (a = 0; a < vocab_size; a++)
251 if (vocab[a].cn > min_reduce) {
252 vocab[b].cn = vocab[a].cn;
253 vocab[b].word = vocab[a].word;
254 b++;
255 } else
256 free(vocab[a].word);
257 vocab_size = b;
258 for (a = 0; a < vocab_hash_size; a++)
259 vocab_hash[a] = -1;
260 for (a = 0; a < vocab_size; a++) {
261 // The hash has to be recomputed, since it is no longer valid
262 hash = GetWordHash(vocab[a].word);
263 while (vocab_hash[hash] != -1)
264 hash = (hash + 1) % vocab_hash_size;
265 vocab_hash[hash] = a;
266 }
267 fflush(stdout);
268 min_reduce++;
269}
270
271// Create binary Huffman tree using the word counts
272// Frequent words will have short unique binary codes
273void CreateBinaryTree() {
274 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
275 char code[MAX_CODE_LENGTH];
276 long long *count = (long long *) calloc(vocab_size * 2 + 1,
277 sizeof(long long));
278 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
279 sizeof(long long));
280 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
281 sizeof(long long));
282 for (a = 0; a < vocab_size; a++)
283 count[a] = vocab[a].cn;
284 for (a = vocab_size; a < vocab_size * 2; a++)
285 count[a] = 1e15;
286 pos1 = vocab_size - 1;
287 pos2 = vocab_size;
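 // count[0..vocab_size-1] holds the leaf (word) counts, sorted in decreasing order;
 // slots from vocab_size on hold internal-node counts, initialised to 1e15 so that
 // unused slots are never picked as minima. pos1 walks the leaves from the least
 // frequent word upward, pos2 walks the internal nodes as they are created.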
288 // The following algorithm constructs the Huffman tree by adding one node at a time
289 for (a = 0; a < vocab_size - 1; a++) {
290 // First, find two smallest nodes 'min1, min2'
291 if (pos1 >= 0) {
292 if (count[pos1] < count[pos2]) {
293 min1i = pos1;
294 pos1--;
295 } else {
296 min1i = pos2;
297 pos2++;
298 }
299 } else {
300 min1i = pos2;
301 pos2++;
302 }
303 if (pos1 >= 0) {
304 if (count[pos1] < count[pos2]) {
305 min2i = pos1;
306 pos1--;
307 } else {
308 min2i = pos2;
309 pos2++;
310 }
311 } else {
312 min2i = pos2;
313 pos2++;
314 }
315 count[vocab_size + a] = count[min1i] + count[min2i];
316 parent_node[min1i] = vocab_size + a;
317 parent_node[min2i] = vocab_size + a;
318 binary[min2i] = 1;
319 }
320 // Now assign binary code to each vocabulary word
321 for (a = 0; a < vocab_size; a++) {
322 b = a;
323 i = 0;
324 while (1) {
325 code[i] = binary[b];
326 point[i] = b;
327 i++;
328 b = parent_node[b];
329 if (b == vocab_size * 2 - 2)
330 break;
331 }
332 vocab[a].codelen = i;
333 vocab[a].point[0] = vocab_size - 2;
334 for (b = 0; b < i; b++) {
335 vocab[a].code[i - b - 1] = code[b];
336 vocab[a].point[i - b] = point[b] - vocab_size;
337 }
338 }
339 free(count);
340 free(binary);
341 free(parent_node);
342}
343
344void LearnVocabFromTrainFile() {
345 char word[MAX_STRING];
346 FILE *fin;
347 long long a, i;
348 for (a = 0; a < vocab_hash_size; a++)
349 vocab_hash[a] = -1;
350 fin = fopen(train_file, "rb");
351 if (fin == NULL) {
352 printf("ERROR: training data file not found!\n");
353 exit(1);
354 }
355 vocab_size = 0;
356 AddWordToVocab((char *) "</s>");
357 while (1) {
358 ReadWord(word, fin);
359 if (feof(fin))
360 break;
361 train_words++;
362 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
363 printf("%lldK%c", train_words / 1000, 13);
364 fflush(stdout);
365 }
366 i = SearchVocab(word);
367 if (i == -1) {
368 a = AddWordToVocab(word);
369 vocab[a].cn = 1;
370 } else
371 vocab[i].cn++;
372 if (vocab_size > vocab_hash_size * 0.7)
373 ReduceVocab();
374 }
375 SortVocab();
376 if (debug_mode > 0) {
377 printf("Vocab size: %lld\n", vocab_size);
378 printf("Words in train file: %lld\n", train_words);
379 }
380 file_size = ftell(fin);
381 fclose(fin);
382}
383
384void SaveVocab() {
385 long long i;
386 FILE *fo = fopen(save_vocab_file, "wb");
387 for (i = 0; i < vocab_size; i++)
388 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
389 fclose(fo);
390}
391
392void ReadVocab() {
393 long long a, i = 0;
394 char c;
395 char word[MAX_STRING];
396 FILE *fin = fopen(read_vocab_file, "rb");
397 if (fin == NULL) {
398 printf("Vocabulary file not found\n");
399 exit(1);
400 }
401 for (a = 0; a < vocab_hash_size; a++)
402 vocab_hash[a] = -1;
403 vocab_size = 0;
404 while (1) {
405 ReadWord(word, fin);
406 if (feof(fin))
407 break;
408 a = AddWordToVocab(word);
409 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
410 i++;
411 }
412 SortVocab();
413 if (debug_mode > 0) {
414 printf("Vocab size: %lld\n", vocab_size);
415 printf("Words in train file: %lld\n", train_words);
416 }
417 fin = fopen(train_file, "rb");
418 if (fin == NULL) {
419 printf("ERROR: training data file not found!\n");
420 exit(1);
421 }
422 fseek(fin, 0, SEEK_END);
423 file_size = ftell(fin);
424 fclose(fin);
425}
426
427void InitClassUnigramTable() {
428 long long a, c;
429 printf("loading class unigrams \n");
430 FILE *fin = fopen(negative_classes_file, "rb");
431 if (fin == NULL) {
432 printf("ERROR: class file not found!\n");
433 exit(1);
434 }
435 word_to_group = (int *) malloc(vocab_size * sizeof(int));
436 for (a = 0; a < vocab_size; a++)
437 word_to_group[a] = -1;
438 char class[MAX_STRING];
439 char prev_class[MAX_STRING];
440 prev_class[0] = 0;
441 char word[MAX_STRING];
442 class_number = -1;
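 // The class file is consumed three whitespace-separated tokens at a time:
 // a class label, a word, and a third token that is read but ignored.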
443 while (1) {
444 if (feof(fin))
445 break;
446 ReadWord(class, fin);
447 ReadWord(word, fin);
448 int word_index = SearchVocab(word);
449 if (word_index != -1) {
450 if (strcmp(class, prev_class) != 0) {
451 class_number++;
452 strcpy(prev_class, class);
453 }
454 word_to_group[word_index] = class_number;
455 }
456 ReadWord(word, fin);
457 }
458 class_number++;
459 fclose(fin);
460
461 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
462 long long train_words_pow = 0;
463 real d1, power = 0.75;
464
465 for (c = 0; c < class_number; c++) {
466 long long offset = c * table_size;
467 train_words_pow = 0;
468 for (a = 0; a < vocab_size; a++)
469 if (word_to_group[a] == c)
470 train_words_pow += pow(vocab[a].cn, power);
471 int i = 0;
472 while (i < vocab_size && word_to_group[i] != c)
473 i++;
474 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
475 for (a = 0; a < table_size; a++) {
476 //printf("index %lld , word %d\n", a, i);
477 group_to_table[offset + a] = i;
478 if (a / (real) table_size > d1) {
479 i++;
480 while (i < vocab_size && word_to_group[i] != c)
481 i++;
482 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
483 }
484 if (i >= vocab_size) {
485 i = vocab_size - 1;
486 while (i >= 0 && word_to_group[i] != c)
 i--;
 }
487 }
488 }
489}
490
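// Writes the input embeddings (syn0) and the window->hidden weights to save_net_file.
// Note that syn_window_hidden is only allocated for -type 4 (or when a net was read),
// so -save-net assumes that configuration.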
491void SaveNet() {
492 FILE *fnet = fopen(save_net_file, "wb");
493 if (fnet == NULL) {
494 printf("ERROR: net parameter file could not be opened for writing\n");
495 exit(1);
496 }
497 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
498 fwrite(syn_window_hidden, sizeof(real), window_hidden_size * window_layer_size, fnet);
499 fclose(fnet);
500}
501
502void InitNet() {
503 long long a, b;
504 unsigned long long next_random = 1;
505 window_layer_size = layer1_size * window * 2;
506 a = posix_memalign((void **) &syn0, 128,
507 (long long) vocab_size * layer1_size * sizeof(real));
508 if (syn0 == NULL) {
509 printf("Memory allocation failed\n");
510 exit(1);
511 }
512
513 if (hs) {
514 a = posix_memalign((void **) &syn1, 128,
515 (long long) vocab_size * layer1_size * sizeof(real));
516 if (syn1 == NULL) {
517 printf("Memory allocation failed\n");
518 exit(1);
519 }
520 a = posix_memalign((void **) &syn1_window, 128,
521 (long long) vocab_size * window_layer_size * sizeof(real));
522 if (syn1_window == NULL) {
523 printf("Memory allocation failed\n");
524 exit(1);
525 }
526 a = posix_memalign((void **) &syn_hidden_word, 128,
527 (long long) vocab_size * window_hidden_size * sizeof(real));
528 if (syn_hidden_word == NULL) {
529 printf("Memory allocation failed\n");
530 exit(1);
531 }
532
533 for (a = 0; a < vocab_size; a++)
534 for (b = 0; b < layer1_size; b++)
535 syn1[a * layer1_size + b] = 0;
536 for (a = 0; a < vocab_size; a++)
537 for (b = 0; b < window_layer_size; b++)
538 syn1_window[a * window_layer_size + b] = 0;
539 for (a = 0; a < vocab_size; a++)
540 for (b = 0; b < window_hidden_size; b++)
541 syn_hidden_word[a * window_hidden_size + b] = 0;
542 }
543 if (negative > 0) {
544 if (type == 0) {
545 a = posix_memalign((void **) &syn1neg, 128,
546 (long long) vocab_size * layer1_size * sizeof(real));
547 if (syn1neg == NULL) {
548 printf("Memory allocation failed\n");
549 exit(1);
550 }
551 for (a = 0; a < vocab_size; a++)
552 for (b = 0; b < layer1_size; b++)
553 syn1neg[a * layer1_size + b] = 0;
554 } else if (type == 3) {
555 a = posix_memalign((void **) &syn1neg_window, 128,
556 (long long) vocab_size * window_layer_size * sizeof(real));
557 if (syn1neg_window == NULL) {
558 printf("Memory allocation failed\n");
559 exit(1);
560 }
561 for (a = 0; a < vocab_size; a++)
562 for (b = 0; b < window_layer_size; b++)
563 syn1neg_window[a * window_layer_size + b] = 0;
564 } else if (type == 4) {
565 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
566 (long long) vocab_size * window_hidden_size * sizeof(real));
567 if (syn_hidden_word_neg == NULL) {
568 printf("Memory allocation failed\n");
569 exit(1);
570 }
571 for (a = 0; a < vocab_size; a++)
572 for (b = 0; b < window_hidden_size; b++)
573 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
574 }
575 }
576 if (nce > 0) {
577 a = posix_memalign((void **) &syn1nce, 128,
578 (long long) vocab_size * layer1_size * sizeof(real));
579 if (syn1nce == NULL) {
580 printf("Memory allocation failed\n");
581 exit(1);
582 }
583 a = posix_memalign((void **) &syn1nce_window, 128,
584 (long long) vocab_size * window_layer_size * sizeof(real));
585 if (syn1nce_window == NULL) {
586 printf("Memory allocation failed\n");
587 exit(1);
588 }
589 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
590 (long long) vocab_size * window_hidden_size * sizeof(real));
591 if (syn_hidden_word_nce == NULL) {
592 printf("Memory allocation failed\n");
593 exit(1);
594 }
595
596 for (a = 0; a < vocab_size; a++)
597 for (b = 0; b < layer1_size; b++)
598 syn1nce[a * layer1_size + b] = 0;
599 for (a = 0; a < vocab_size; a++)
600 for (b = 0; b < window_layer_size; b++)
601 syn1nce_window[a * window_layer_size + b] = 0;
602 for (a = 0; a < vocab_size; a++)
603 for (b = 0; b < window_hidden_size; b++)
604 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
605 }
606
607 if (type == 4) {
608 a = posix_memalign((void **) &syn_window_hidden, 128,
609 window_hidden_size * window_layer_size * sizeof(real));
610 if (syn_window_hidden == NULL) {
611 printf("Memory allocation failed\n");
612 exit(1);
613 }
614 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
615 next_random = next_random * (unsigned long long) 25214903917 + 11;
616 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
617 - 0.5) / (window_hidden_size * window_layer_size);
618 }
619 }
620
621 if (read_net_file[0] == 0) {
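 // No net file to read: initialise the input embeddings with small random values
 // in (-0.5, 0.5) / layer1_size; the output-side weight matrices allocated above
 // were zeroed explicitly.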
622 for (a = 0; a < vocab_size; a++)
623 for (b = 0; b < layer1_size; b++) {
624 next_random = next_random * (unsigned long long) 25214903917
625 + 11;
626 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
627 / (real) 65536) - 0.5) / layer1_size;
628 }
629 } else {
630 FILE *fnet = fopen(read_net_file, "rb");
631 if (fnet == NULL) {
632 printf("Net parameter file not found\n");
633 exit(1);
634 }
635 fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
636 a = posix_memalign((void **) &syn_window_hidden, 128,
637 window_hidden_size * window_layer_size * sizeof(real));
638 if (syn_window_hidden == NULL) {
639 printf("Memory allocation failed\n");
640 exit(1);
641 }
642 fread(syn_window_hidden, sizeof(real), window_hidden_size * window_layer_size, fnet);
643 fclose(fnet);
644 }
645
646 CreateBinaryTree();
647}
648
649void *TrainModelThread(void *id) {
650 long long a, b, d, cw, word, last_word, sentence_length = 0,
651 sentence_position = 0;
652 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
653 long long l1, l2, c, target, label, local_iter = iter;
654 unsigned long long next_random = (long long) id;
655 real f, g;
656 clock_t now;
657 int input_len_1 = layer1_size;
658 int window_offset = -1;
659 if (type == 2 || type == 4) {
660 input_len_1 = window_layer_size;
661 }
662 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
663 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
664
665 int input_len_2 = 0;
666 if (type == 4) {
667 input_len_2 = window_hidden_size;
668 }
669 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
670 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
671
672 FILE *fi = fopen(train_file, "rb");
673 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
674 while (1) {
675 if (word_count - last_word_count > 10000) {
676 word_count_actual += word_count - last_word_count;
677 last_word_count = word_count;
678 if ((debug_mode > 1)) {
679 now = clock();
680 printf(
681 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
682 13, alpha,
683 word_count_actual / (real) (iter * train_words + 1)
684 * 100,
685 word_count_actual
686 / ((real) (now - start + 1)
687 / (real) CLOCKS_PER_SEC * 1000));
688 fflush(stdout);
689 }
690 alpha = starting_alpha
691 * (1 - word_count_actual / (real) (iter * train_words + 1));
692 if (alpha < starting_alpha * 0.0001)
693 alpha = starting_alpha * 0.0001;
694 }
695 if (sentence_length == 0) {
696 while (1) {
697 word = ReadWordIndex(fi);
698 if (feof(fi))
699 break;
700 if (word == -1)
701 continue;
702 word_count++;
703 if (word == 0)
704 break;
705 // The subsampling randomly discards frequent words while keeping the ranking the same
706 if (sample > 0) {
707 real ran = (sqrt(vocab[word].cn / (sample * train_words))
708 + 1) * (sample * train_words) / vocab[word].cn;
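 // ran = sqrt(t/f) + t/f, with f = cn/train_words (word frequency) and t = sample;
 // the word is kept with probability min(1, ran), so very frequent words are
 // aggressively down-sampled.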
709 next_random = next_random * (unsigned long long) 25214903917
710 + 11;
711 if (ran < (next_random & 0xFFFF) / (real) 65536)
712 continue;
713 }
714 sen[sentence_length] = word;
715 sentence_length++;
716 if (sentence_length >= MAX_SENTENCE_LENGTH)
717 break;
718 }
719 sentence_position = 0;
720 }
721 if (feof(fi) || (word_count > train_words / num_threads)) {
722 word_count_actual += word_count - last_word_count;
723 local_iter--;
724 if (local_iter == 0)
725 break;
726 word_count = 0;
727 last_word_count = 0;
728 sentence_length = 0;
729 fseek(fi, file_size / (long long) num_threads * (long long) id,
730 SEEK_SET);
731 continue;
732 }
733 word = sen[sentence_position];
734 if (word == -1)
735 continue;
736 for (c = 0; c < input_len_1; c++)
737 neu1[c] = 0;
738 for (c = 0; c < input_len_1; c++)
739 neu1e[c] = 0;
740 for (c = 0; c < input_len_2; c++)
741 neu2[c] = 0;
742 for (c = 0; c < input_len_2; c++)
743 neu2e[c] = 0;
744 next_random = next_random * (unsigned long long) 25214903917 + 11;
745 b = next_random % window;
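 // next_random is a simple linear congruential generator; b randomly narrows the
 // context used by the CBOW and skip-gram loops below, so the effective window
 // varies between 1 and 'window' per target word (types 2-4 always use the full window).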
746 if (type == 0) { //train the cbow architecture
747 // in -> hidden
748 cw = 0;
749 for (a = b; a < window * 2 + 1 - b; a++)
750 if (a != window) {
751 c = sentence_position - window + a;
752 if (c < 0)
753 continue;
754 if (c >= sentence_length)
755 continue;
756 last_word = sen[c];
757 if (last_word == -1)
758 continue;
759 for (c = 0; c < layer1_size; c++)
760 neu1[c] += syn0[c + last_word * layer1_size];
761 cw++;
762 }
763 if (cw) {
764 for (c = 0; c < layer1_size; c++)
765 neu1[c] /= cw;
766 if (hs)
767 for (d = 0; d < vocab[word].codelen; d++) {
768 f = 0;
769 l2 = vocab[word].point[d] * layer1_size;
770 // Propagate hidden -> output
771 for (c = 0; c < layer1_size; c++)
772 f += neu1[c] * syn1[c + l2];
773 if (f <= -MAX_EXP)
774 continue;
775 else if (f >= MAX_EXP)
776 continue;
777 else
778 f = expTable[(int) ((f + MAX_EXP)
779 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
780 // 'g' is the gradient multiplied by the learning rate
781 g = (1 - vocab[word].code[d] - f) * alpha;
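 // For hierarchical softmax the target at this tree node is (1 - code[d]),
 // so g = (target - sigmoid(score)) * alpha.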
782 // Propagate errors output -> hidden
783 for (c = 0; c < layer1_size; c++)
784 neu1e[c] += g * syn1[c + l2];
785 // Learn weights hidden -> output
786 for (c = 0; c < layer1_size; c++)
787 syn1[c + l2] += g * neu1[c];
788 if (cap == 1)
789 for (c = 0; c < layer1_size; c++)
790 capParam(syn1, c + l2);
791 }
792 // NEGATIVE SAMPLING
793 if (negative > 0)
794 for (d = 0; d < negative + 1; d++) {
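 // d == 0 is the positive (observed) word; the remaining draws are negatives taken
 // from the unigram table, or from the word's class-specific table when
 // -negative-classes is given.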
795 if (d == 0) {
796 target = word;
797 label = 1;
798 } else {
799 next_random = next_random
800 * (unsigned long long) 25214903917 + 11;
801 if (word_to_group != NULL
802 && word_to_group[word] != -1) {
803 target = word;
804 while (target == word) {
805 target = group_to_table[word_to_group[word]
806 * table_size
807 + (next_random >> 16) % table_size];
808 next_random = next_random
809 * (unsigned long long) 25214903917
810 + 11;
811 }
812 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
813 } else {
814 target =
815 table[(next_random >> 16) % table_size];
816 }
817 if (target == 0)
818 target = next_random % (vocab_size - 1) + 1;
819 if (target == word)
820 continue;
821 label = 0;
822 }
823 l2 = target * layer1_size;
824 f = 0;
825 for (c = 0; c < layer1_size; c++)
826 f += neu1[c] * syn1neg[c + l2];
827 if (f > MAX_EXP)
828 g = (label - 1) * alpha;
829 else if (f < -MAX_EXP)
830 g = (label - 0) * alpha;
831 else
832 g = (label
833 - expTable[(int) ((f + MAX_EXP)
834 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
835 * alpha;
836 for (c = 0; c < layer1_size; c++)
837 neu1e[c] += g * syn1neg[c + l2];
838 for (c = 0; c < layer1_size; c++)
839 syn1neg[c + l2] += g * neu1[c];
840 if (cap == 1)
841 for (c = 0; c < layer1_size; c++)
842 capParam(syn1neg, c + l2);
843 }
844 // Noise Contrastive Estimation
845 if (nce > 0)
846 for (d = 0; d < nce + 1; d++) {
847 if (d == 0) {
848 target = word;
849 label = 1;
850 } else {
851 next_random = next_random
852 * (unsigned long long) 25214903917 + 11;
853 if (word_to_group != NULL
854 && word_to_group[word] != -1) {
855 target = word;
856 while (target == word) {
857 target = group_to_table[word_to_group[word]
858 * table_size
859 + (next_random >> 16) % table_size];
860 next_random = next_random
861 * (unsigned long long) 25214903917
862 + 11;
863 }
864 } else {
865 target =
866 table[(next_random >> 16) % table_size];
867 }
868 if (target == 0)
869 target = next_random % (vocab_size - 1) + 1;
870 if (target == word)
871 continue;
872 label = 0;
873 }
874 l2 = target * layer1_size;
875 f = 0;
876
877 for (c = 0; c < layer1_size; c++)
878 f += neu1[c] * syn1nce[c + l2];
879 if (f > MAX_EXP)
880 g = (label - 1) * alpha;
881 else if (f < -MAX_EXP)
882 g = (label - 0) * alpha;
883 else {
884 f = exp(f);
885 g =
886 (label
887 - f
888 / (noise_distribution[target]
889 * nce + f)) * alpha;
890 }
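 // NCE gradient: with f = exp(score), the model posterior is
 // f / (f + nce * Pn(target)), where Pn is the smoothed unigram noise
 // distribution, and g = (label - posterior) * alpha.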
891 for (c = 0; c < layer1_size; c++)
892 neu1e[c] += g * syn1nce[c + l2];
893 for (c = 0; c < layer1_size; c++)
894 syn1nce[c + l2] += g * neu1[c];
895 if (cap == 1)
896 for (c = 0; c < layer1_size; c++)
897 capParam(syn1nce, c + l2);
898 }
899 // hidden -> in
900 for (a = b; a < window * 2 + 1 - b; a++)
901 if (a != window) {
902 c = sentence_position - window + a;
903 if (c < 0)
904 continue;
905 if (c >= sentence_length)
906 continue;
907 last_word = sen[c];
908 if (last_word == -1)
909 continue;
910 for (c = 0; c < layer1_size; c++)
911 syn0[c + last_word * layer1_size] += neu1e[c];
912 }
913 }
914 } else if (type == 1) { //train skip-gram
915 for (a = b; a < window * 2 + 1 - b; a++)
916 if (a != window) {
917 c = sentence_position - window + a;
918 if (c < 0)
919 continue;
920 if (c >= sentence_length)
921 continue;
922 last_word = sen[c];
923 if (last_word == -1)
924 continue;
925 l1 = last_word * layer1_size;
926 for (c = 0; c < layer1_size; c++)
927 neu1e[c] = 0;
928 // HIERARCHICAL SOFTMAX
929 if (hs)
930 for (d = 0; d < vocab[word].codelen; d++) {
931 f = 0;
932 l2 = vocab[word].point[d] * layer1_size;
933 // Propagate hidden -> output
934 for (c = 0; c < layer1_size; c++)
935 f += syn0[c + l1] * syn1[c + l2];
936 if (f <= -MAX_EXP)
937 continue;
938 else if (f >= MAX_EXP)
939 continue;
940 else
941 f = expTable[(int) ((f + MAX_EXP)
942 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
943 // 'g' is the gradient multiplied by the learning rate
944 g = (1 - vocab[word].code[d] - f) * alpha;
945 // Propagate errors output -> hidden
946 for (c = 0; c < layer1_size; c++)
947 neu1e[c] += g * syn1[c + l2];
948 // Learn weights hidden -> output
949 for (c = 0; c < layer1_size; c++)
950 syn1[c + l2] += g * syn0[c + l1];
951 if (cap == 1)
952 for (c = 0; c < layer1_size; c++)
953 capParam(syn1, c + l2);
954 }
955 // NEGATIVE SAMPLING
956 if (negative > 0)
957 for (d = 0; d < negative + 1; d++) {
958 if (d == 0) {
959 target = word;
960 label = 1;
961 } else {
962 next_random = next_random
963 * (unsigned long long) 25214903917 + 11;
964 if (word_to_group != NULL
965 && word_to_group[word] != -1) {
966 target = word;
967 while (target == word) {
968 target =
969 group_to_table[word_to_group[word]
970 * table_size
971 + (next_random >> 16)
972 % table_size];
973 next_random =
974 next_random
975 * (unsigned long long) 25214903917
976 + 11;
977 }
978 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
979 } else {
980 target = table[(next_random >> 16)
981 % table_size];
982 }
983 if (target == 0)
984 target = next_random % (vocab_size - 1) + 1;
985 if (target == word)
986 continue;
987 label = 0;
988 }
989 l2 = target * layer1_size;
990 f = 0;
991 for (c = 0; c < layer1_size; c++)
992 f += syn0[c + l1] * syn1neg[c + l2];
993 if (f > MAX_EXP)
994 g = (label - 1) * alpha;
995 else if (f < -MAX_EXP)
996 g = (label - 0) * alpha;
997 else
998 g =
999 (label
1000 - expTable[(int) ((f + MAX_EXP)
1001 * (EXP_TABLE_SIZE
1002 / MAX_EXP / 2))])
1003 * alpha;
1004 for (c = 0; c < layer1_size; c++)
1005 neu1e[c] += g * syn1neg[c + l2];
1006 for (c = 0; c < layer1_size; c++)
1007 syn1neg[c + l2] += g * syn0[c + l1];
1008 if (cap == 1)
1009 for (c = 0; c < layer1_size; c++)
1010 capParam(syn1neg, c + l2);
1011 }
1012 //Noise Contrastive Estimation
1013 if (nce > 0)
1014 for (d = 0; d < nce + 1; d++) {
1015 if (d == 0) {
1016 target = word;
1017 label = 1;
1018 } else {
1019 next_random = next_random
1020 * (unsigned long long) 25214903917 + 11;
1021 if (word_to_group != NULL
1022 && word_to_group[word] != -1) {
1023 target = word;
1024 while (target == word) {
1025 target =
1026 group_to_table[word_to_group[word]
1027 * table_size
1028 + (next_random >> 16)
1029 % table_size];
1030 next_random =
1031 next_random
1032 * (unsigned long long) 25214903917
1033 + 11;
1034 }
1035 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1036 } else {
1037 target = table[(next_random >> 16)
1038 % table_size];
1039 }
1040 if (target == 0)
1041 target = next_random % (vocab_size - 1) + 1;
1042 if (target == word)
1043 continue;
1044 label = 0;
1045 }
1046 l2 = target * layer1_size;
1047 f = 0;
1048 for (c = 0; c < layer1_size; c++)
1049 f += syn0[c + l1] * syn1nce[c + l2];
1050 if (f > MAX_EXP)
1051 g = (label - 1) * alpha;
1052 else if (f < -MAX_EXP)
1053 g = (label - 0) * alpha;
1054 else {
1055 f = exp(f);
1056 g = (label
1057 - f
1058 / (noise_distribution[target]
1059 * nce + f)) * alpha;
1060 }
1061 for (c = 0; c < layer1_size; c++)
1062 neu1e[c] += g * syn1nce[c + l2];
1063 for (c = 0; c < layer1_size; c++)
1064 syn1nce[c + l2] += g * syn0[c + l1];
1065 if (cap == 1)
1066 for (c = 0; c < layer1_size; c++)
1067 capParam(syn1nce, c + l2);
1068 }
1069 // Learn weights input -> hidden
1070 for (c = 0; c < layer1_size; c++)
1071 syn0[c + l1] += neu1e[c];
1072 }
1073 } else if (type == 2) { //train the cwindow architecture
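 // CWINDOW: the context embeddings are concatenated by relative position into neu1
 // (window_layer_size = 2 * window * layer1_size) instead of being averaged, and are
 // scored against per-word output vectors of that same size.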
1074 // in -> hidden
1075 cw = 0;
1076 for (a = 0; a < window * 2 + 1; a++)
1077 if (a != window) {
1078 c = sentence_position - window + a;
1079 if (c < 0)
1080 continue;
1081 if (c >= sentence_length)
1082 continue;
1083 last_word = sen[c];
1084 if (last_word == -1)
1085 continue;
1086 window_offset = a * layer1_size;
1087 if (a > window)
1088 window_offset -= layer1_size;
1089 for (c = 0; c < layer1_size; c++)
1090 neu1[c + window_offset] += syn0[c
1091 + last_word * layer1_size];
1092 cw++;
1093 }
1094 if (cw) {
1095 if (hs)
1096 for (d = 0; d < vocab[word].codelen; d++) {
1097 f = 0;
1098 l2 = vocab[word].point[d] * window_layer_size;
1099 // Propagate hidden -> output
1100 for (c = 0; c < window_layer_size; c++)
1101 f += neu1[c] * syn1_window[c + l2];
1102 if (f <= -MAX_EXP)
1103 continue;
1104 else if (f >= MAX_EXP)
1105 continue;
1106 else
1107 f = expTable[(int) ((f + MAX_EXP)
1108 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1109 // 'g' is the gradient multiplied by the learning rate
1110 g = (1 - vocab[word].code[d] - f) * alpha;
1111 // Propagate errors output -> hidden
1112 for (c = 0; c < window_layer_size; c++)
1113 neu1e[c] += g * syn1_window[c + l2];
1114 // Learn weights hidden -> output
1115 for (c = 0; c < window_layer_size; c++)
1116 syn1_window[c + l2] += g * neu1[c];
1117 if (cap == 1)
1118 for (c = 0; c < window_layer_size; c++)
1119 capParam(syn1_window, c + l2);
1120 }
1121 // NEGATIVE SAMPLING
1122 if (negative > 0)
1123 for (d = 0; d < negative + 1; d++) {
1124 if (d == 0) {
1125 target = word;
1126 label = 1;
1127 } else {
1128 next_random = next_random
1129 * (unsigned long long) 25214903917 + 11;
1130 if (word_to_group != NULL
1131 && word_to_group[word] != -1) {
1132 target = word;
1133 while (target == word) {
1134 target = group_to_table[word_to_group[word]
1135 * table_size
1136 + (next_random >> 16) % table_size];
1137 next_random = next_random
1138 * (unsigned long long) 25214903917
1139 + 11;
1140 }
1141 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1142 } else {
1143 target =
1144 table[(next_random >> 16) % table_size];
1145 }
1146 if (target == 0)
1147 target = next_random % (vocab_size - 1) + 1;
1148 if (target == word)
1149 continue;
1150 label = 0;
1151 }
1152 l2 = target * window_layer_size;
1153 f = 0;
1154 for (c = 0; c < window_layer_size; c++)
1155 f += neu1[c] * syn1neg_window[c + l2];
1156 if (f > MAX_EXP)
1157 g = (label - 1) * alpha;
1158 else if (f < -MAX_EXP)
1159 g = (label - 0) * alpha;
1160 else
1161 g = (label
1162 - expTable[(int) ((f + MAX_EXP)
1163 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1164 * alpha;
1165 for (c = 0; c < window_layer_size; c++)
1166 neu1e[c] += g * syn1neg_window[c + l2];
1167 for (c = 0; c < window_layer_size; c++)
1168 syn1neg_window[c + l2] += g * neu1[c];
1169 if (cap == 1)
1170 for (c = 0; c < window_layer_size; c++)
1171 capParam(syn1neg_window, c + l2);
1172 }
1173 // Noise Contrastive Estimation
1174 if (nce > 0)
1175 for (d = 0; d < nce + 1; d++) {
1176 if (d == 0) {
1177 target = word;
1178 label = 1;
1179 } else {
1180 next_random = next_random
1181 * (unsigned long long) 25214903917 + 11;
1182 if (word_to_group != NULL
1183 && word_to_group[word] != -1) {
1184 target = word;
1185 while (target == word) {
1186 target = group_to_table[word_to_group[word]
1187 * table_size
1188 + (next_random >> 16) % table_size];
1189 next_random = next_random
1190 * (unsigned long long) 25214903917
1191 + 11;
1192 }
1193 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1194 } else {
1195 target =
1196 table[(next_random >> 16) % table_size];
1197 }
1198 if (target == 0)
1199 target = next_random % (vocab_size - 1) + 1;
1200 if (target == word)
1201 continue;
1202 label = 0;
1203 }
1204 l2 = target * window_layer_size;
1205 f = 0;
1206 for (c = 0; c < window_layer_size; c++)
1207 f += neu1[c] * syn1nce_window[c + l2];
1208 if (f > MAX_EXP)
1209 g = (label - 1) * alpha;
1210 else if (f < -MAX_EXP)
1211 g = (label - 0) * alpha;
1212 else {
1213 f = exp(f);
1214 g =
1215 (label
1216 - f
1217 / (noise_distribution[target]
1218 * nce + f)) * alpha;
1219 }
1220 for (c = 0; c < window_layer_size; c++)
1221 neu1e[c] += g * syn1nce_window[c + l2];
1222 for (c = 0; c < window_layer_size; c++)
1223 syn1nce_window[c + l2] += g * neu1[c];
1224 if (cap == 1)
1225 for (c = 0; c < window_layer_size; c++)
1226 capParam(syn1nce_window, c + l2);
1227 }
1228 // hidden -> in
1229 for (a = 0; a < window * 2 + 1; a++)
1230 if (a != window) {
1231 c = sentence_position - window + a;
1232 if (c < 0)
1233 continue;
1234 if (c >= sentence_length)
1235 continue;
1236 last_word = sen[c];
1237 if (last_word == -1)
1238 continue;
1239 window_offset = a * layer1_size;
1240 if (a > window)
1241 window_offset -= layer1_size;
1242 for (c = 0; c < layer1_size; c++)
1243 syn0[c + last_word * layer1_size] += neu1e[c
1244 + window_offset];
1245 }
1246 }
1247 } else if (type == 3) { //train structured skip-gram
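 // Structured skip-gram: like skip-gram, but each relative context position owns its
 // own block of output weights; window_offset selects the block for the current position.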
1248 for (a = 0; a < window * 2 + 1; a++)
1249 if (a != window) {
1250 c = sentence_position - window + a;
1251 if (c < 0)
1252 continue;
1253 if (c >= sentence_length)
1254 continue;
1255 last_word = sen[c];
1256 if (last_word == -1)
1257 continue;
1258 l1 = last_word * layer1_size;
1259 window_offset = a * layer1_size;
1260 if (a > window)
1261 window_offset -= layer1_size;
1262 for (c = 0; c < layer1_size; c++)
1263 neu1e[c] = 0;
1264 // HIERARCHICAL SOFTMAX
1265 if (hs)
1266 for (d = 0; d < vocab[word].codelen; d++) {
1267 f = 0;
1268 l2 = vocab[word].point[d] * window_layer_size;
1269 // Propagate hidden -> output
1270 for (c = 0; c < layer1_size; c++)
1271 f += syn0[c + l1]
1272 * syn1_window[c + l2 + window_offset];
1273 if (f <= -MAX_EXP)
1274 continue;
1275 else if (f >= MAX_EXP)
1276 continue;
1277 else
1278 f = expTable[(int) ((f + MAX_EXP)
1279 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1280 // 'g' is the gradient multiplied by the learning rate
1281 g = (1 - vocab[word].code[d] - f) * alpha;
1282 // Propagate errors output -> hidden
1283 for (c = 0; c < layer1_size; c++)
1284 neu1e[c] += g
1285 * syn1_window[c + l2 + window_offset];
1286 // Learn weights hidden -> output
1287 for (c = 0; c < layer1_size; c++)
1288 syn1_window[c + l2 + window_offset] += g
1289 * syn0[c + l1];
1290 if (cap == 1)
1291 for (c = 0; c < layer1_size; c++)
1292 capParam(syn1_window, c + l2 + window_offset);
1293 }
1294 // NEGATIVE SAMPLING
1295 if (negative > 0)
1296 for (d = 0; d < negative + 1; d++) {
1297 if (d == 0) {
1298 target = word;
1299 label = 1;
1300 } else {
1301 next_random = next_random
1302 * (unsigned long long) 25214903917 + 11;
1303 if (word_to_group != NULL
1304 && word_to_group[word] != -1) {
1305 target = word;
1306 while (target == word) {
1307 target =
1308 group_to_table[word_to_group[word]
1309 * table_size
1310 + (next_random >> 16)
1311 % table_size];
1312 next_random =
1313 next_random
1314 * (unsigned long long) 25214903917
1315 + 11;
1316 }
1317 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1318 } else {
1319 target = table[(next_random >> 16)
1320 % table_size];
1321 }
1322 if (target == 0)
1323 target = next_random % (vocab_size - 1) + 1;
1324 if (target == word)
1325 continue;
1326 label = 0;
1327 }
1328 l2 = target * window_layer_size;
1329 f = 0;
1330 for (c = 0; c < layer1_size; c++)
1331 f +=
1332 syn0[c + l1]
1333 * syn1neg_window[c + l2
1334 + window_offset];
1335 if (f > MAX_EXP)
1336 g = (label - 1) * alpha;
1337 else if (f < -MAX_EXP)
1338 g = (label - 0) * alpha;
1339 else
1340 g =
1341 (label
1342 - expTable[(int) ((f + MAX_EXP)
1343 * (EXP_TABLE_SIZE
1344 / MAX_EXP / 2))])
1345 * alpha;
1346 for (c = 0; c < layer1_size; c++)
1347 neu1e[c] +=
1348 g
1349 * syn1neg_window[c + l2
1350 + window_offset];
1351 for (c = 0; c < layer1_size; c++)
1352 syn1neg_window[c + l2 + window_offset] += g
1353 * syn0[c + l1];
1354 if (cap == 1)
1355 for (c = 0; c < layer1_size; c++)
1356 capParam(syn1neg_window,
1357 c + l2 + window_offset);
1358 }
1359 // Noise Contrastive Estimation
1360 if (nce > 0)
1361 for (d = 0; d < nce + 1; d++) {
1362 if (d == 0) {
1363 target = word;
1364 label = 1;
1365 } else {
1366 next_random = next_random
1367 * (unsigned long long) 25214903917 + 11;
1368 if (word_to_group != NULL
1369 && word_to_group[word] != -1) {
1370 target = word;
1371 while (target == word) {
1372 target =
1373 group_to_table[word_to_group[word]
1374 * table_size
1375 + (next_random >> 16)
1376 % table_size];
1377 next_random =
1378 next_random
1379 * (unsigned long long) 25214903917
1380 + 11;
1381 }
1382 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1383 } else {
1384 target = table[(next_random >> 16)
1385 % table_size];
1386 }
1387 if (target == 0)
1388 target = next_random % (vocab_size - 1) + 1;
1389 if (target == word)
1390 continue;
1391 label = 0;
1392 }
1393 l2 = target * window_layer_size;
1394 f = 0;
1395 for (c = 0; c < layer1_size; c++)
1396 f +=
1397 syn0[c + l1]
1398 * syn1nce_window[c + l2
1399 + window_offset];
1400 if (f > MAX_EXP)
1401 g = (label - 1) * alpha;
1402 else if (f < -MAX_EXP)
1403 g = (label - 0) * alpha;
1404 else {
1405 f = exp(f);
1406 g = (label
1407 - f
1408 / (noise_distribution[target]
1409 * nce + f)) * alpha;
1410 }
1411 for (c = 0; c < layer1_size; c++)
1412 neu1e[c] +=
1413 g
1414 * syn1nce_window[c + l2
1415 + window_offset];
1416 for (c = 0; c < layer1_size; c++)
1417 syn1nce_window[c + l2 + window_offset] += g
1418 * syn0[c + l1];
1419 if (cap == 1)
1420 for (c = 0; c < layer1_size; c++)
1421 capParam(syn1nce_window,
1422 c + l2 + window_offset);
1423 }
1424 // Learn weights input -> hidden
1425 for (c = 0; c < layer1_size; c++) {
1426 syn0[c + l1] += neu1e[c];
1427 if (syn0[c + l1] > 50)
1428 syn0[c + l1] = 50;
1429 if (syn0[c + l1] < -50)
1430 syn0[c + l1] = -50;
1431 }
1432 }
1433 } else if (type == 4) { // train the senna-style architecture
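 // SENNA-style: the concatenated context (neu1) is projected through a hidden layer
 // (syn_window_hidden) with a hard-tanh nonlinearity into neu2 before being scored
 // against per-word output vectors.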
1434 // in -> hidden
1435 cw = 0;
1436 for (a = 0; a < window * 2 + 1; a++)
1437 if (a != window) {
1438 c = sentence_position - window + a;
1439 if (c < 0)
1440 continue;
1441 if (c >= sentence_length)
1442 continue;
1443 last_word = sen[c];
1444 if (last_word == -1)
1445 continue;
1446 window_offset = a * layer1_size;
1447 if (a > window)
1448 window_offset -= layer1_size;
1449 for (c = 0; c < layer1_size; c++)
1450 neu1[c + window_offset] += syn0[c
1451 + last_word * layer1_size];
1452 cw++;
1453 }
1454 if (cw) {
1455 for (a = 0; a < window_hidden_size; a++) {
1456 c = a * window_layer_size;
1457 for (b = 0; b < window_layer_size; b++) {
1458 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1459 }
1460 }
1461 if (hs)
1462 for (d = 0; d < vocab[word].codelen; d++) {
1463 f = 0;
1464 l2 = vocab[word].point[d] * window_hidden_size;
1465 // Propagate hidden -> output
1466 for (c = 0; c < window_hidden_size; c++)
1467 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1468 if (f <= -MAX_EXP)
1469 continue;
1470 else if (f >= MAX_EXP)
1471 continue;
1472 else
1473 f = expTable[(int) ((f + MAX_EXP)
1474 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1475 // 'g' is the gradient multiplied by the learning rate
1476 g = (1 - vocab[word].code[d] - f) * alpha;
1477 // Propagate errors output -> hidden
1478 for (c = 0; c < window_hidden_size; c++)
1479 neu2e[c] += dHardTanh(neu2[c], g) * g
1480 * syn_hidden_word[c + l2];
1481 // Learn weights hidden -> output
1482 for (c = 0; c < window_hidden_size; c++)
1483 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1484 * neu2[c];
1485 }
1486 // NEGATIVE SAMPLING
1487 if (negative > 0)
1488 for (d = 0; d < negative + 1; d++) {
1489 if (d == 0) {
1490 target = word;
1491 label = 1;
1492 } else {
1493 next_random = next_random
1494 * (unsigned long long) 25214903917 + 11;
1495 if (word_to_group != NULL
1496 && word_to_group[word] != -1) {
1497 target = word;
1498 while (target == word) {
1499 target = group_to_table[word_to_group[word]
1500 * table_size
1501 + (next_random >> 16) % table_size];
1502 next_random = next_random
1503 * (unsigned long long) 25214903917
1504 + 11;
1505 }
1506 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1507 } else {
1508 target =
1509 table[(next_random >> 16) % table_size];
1510 }
1511 if (target == 0)
1512 target = next_random % (vocab_size - 1) + 1;
1513 if (target == word)
1514 continue;
1515 label = 0;
1516 }
1517 l2 = target * window_hidden_size;
1518 f = 0;
1519 for (c = 0; c < window_hidden_size; c++)
1520 f += hardTanh(neu2[c])
1521 * syn_hidden_word_neg[c + l2];
1522 if (f > MAX_EXP)
1523 g = (label - 1) * alpha / negative;
1524 else if (f < -MAX_EXP)
1525 g = (label - 0) * alpha / negative;
1526 else
1527 g = (label
1528 - expTable[(int) ((f + MAX_EXP)
1529 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1530 * alpha / negative;
1531 for (c = 0; c < window_hidden_size; c++)
1532 neu2e[c] += dHardTanh(neu2[c], g) * g
1533 * syn_hidden_word_neg[c + l2];
1534 for (c = 0; c < window_hidden_size; c++)
1535 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1536 * g * neu2[c];
1537 }
1538 for (a = 0; a < window_hidden_size; a++)
1539 for (b = 0; b < window_layer_size; b++)
1540 neu1e[b] += neu2e[a]
1541 * syn_window_hidden[a * window_layer_size + b];
1542 for (a = 0; a < window_hidden_size; a++)
1543 for (b = 0; b < window_layer_size; b++)
1544 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1545 * neu1[b];
1546 // hidden -> in
1547 for (a = 0; a < window * 2 + 1; a++)
1548 if (a != window) {
1549 c = sentence_position - window + a;
1550 if (c < 0)
1551 continue;
1552 if (c >= sentence_length)
1553 continue;
1554 last_word = sen[c];
1555 if (last_word == -1)
1556 continue;
1557 window_offset = a * layer1_size;
1558 if (a > window)
1559 window_offset -= layer1_size;
1560 for (c = 0; c < layer1_size; c++)
1561 syn0[c + last_word * layer1_size] += neu1e[c
1562 + window_offset];
1563 }
1564 }
1565 } else {
1566 printf("unknown type %i", type);
1567 exit(0);
1568 }
1569 sentence_position++;
1570 if (sentence_position >= sentence_length) {
1571 sentence_length = 0;
1572 continue;
1573 }
1574 }
1575 fclose(fi);
1576 free(neu1);
1577 free(neu1e);
 free(neu2);
 free(neu2e);
1578 pthread_exit(NULL);
1579}
1580
1581void TrainModel() {
1582 long a, b, c, d;
1583 FILE *fo;
1584 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1585 printf("Starting training using file %s\n", train_file);
1586 starting_alpha = alpha;
1587 if (read_vocab_file[0] != 0)
1588 ReadVocab();
1589 else
1590 LearnVocabFromTrainFile();
1591 if (save_vocab_file[0] != 0)
1592 SaveVocab();
1593 if (output_file[0] == 0)
1594 return;
1595 InitNet();
1596 if (negative > 0 || nce > 0)
1597 InitUnigramTable();
1598 if (negative_classes_file[0] != 0)
1599 InitClassUnigramTable();
1600 start = clock();
1601 for (a = 0; a < num_threads; a++)
1602 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1603 for (a = 0; a < num_threads; a++)
1604 pthread_join(pt[a], NULL);
1605 fo = fopen(output_file, "wb");
1606 if (classes == 0) {
1607 // Save the word vectors
1608 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1609 for (a = 0; a < vocab_size; a++) {
1610 fprintf(fo, "%s ", vocab[a].word);
1611 if (binary)
1612 for (b = 0; b < layer1_size; b++)
1613 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1614 else
1615 for (b = 0; b < layer1_size; b++)
1616 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1617 fprintf(fo, "\n");
1618 }
1619 } else {
1620 // Run K-means on the word vectors
1621 int clcn = classes, iter = 10, closeid;
1622 int *centcn = (int *) malloc(classes * sizeof(int));
1623 int *cl = (int *) calloc(vocab_size, sizeof(int));
1624 real closev, x;
1625 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1626 for (a = 0; a < vocab_size; a++)
1627 cl[a] = a % clcn;
1628 for (a = 0; a < iter; a++) {
1629 for (b = 0; b < clcn * layer1_size; b++)
1630 cent[b] = 0;
1631 for (b = 0; b < clcn; b++)
1632 centcn[b] = 1;
1633 for (c = 0; c < vocab_size; c++) {
1634 for (d = 0; d < layer1_size; d++)
1635 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1636 centcn[cl[c]]++;
1637 }
1638 for (b = 0; b < clcn; b++) {
1639 closev = 0;
1640 for (c = 0; c < layer1_size; c++) {
1641 cent[layer1_size * b + c] /= centcn[b];
1642 closev += cent[layer1_size * b + c]
1643 * cent[layer1_size * b + c];
1644 }
1645 closev = sqrt(closev);
1646 for (c = 0; c < layer1_size; c++)
1647 cent[layer1_size * b + c] /= closev;
1648 }
1649 for (c = 0; c < vocab_size; c++) {
1650 closev = -10;
1651 closeid = 0;
1652 for (d = 0; d < clcn; d++) {
1653 x = 0;
1654 for (b = 0; b < layer1_size; b++)
1655 x += cent[layer1_size * d + b]
1656 * syn0[c * layer1_size + b];
1657 if (x > closev) {
1658 closev = x;
1659 closeid = d;
1660 }
1661 }
1662 cl[c] = closeid;
1663 }
1664 }
1665 // Save the K-means classes
1666 for (a = 0; a < vocab_size; a++)
1667 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1668 free(centcn);
1669 free(cent);
1670 free(cl);
1671 }
1672 fclose(fo);
1673 if (save_net_file[0] != 0)
1674 SaveNet();
1675}
1676
1677int ArgPos(char *str, int argc, char **argv) {
1678 int a;
1679 for (a = 1; a < argc; a++)
1680 if (!strcmp(str, argv[a])) {
1681 if (a == argc - 1) {
1682 printf("Argument missing for %s\n", str);
1683 exit(1);
1684 }
1685 return a;
1686 }
1687 return -1;
1688}
1689
1690int main(int argc, char **argv) {
1691 int i;
1692 if (argc == 1) {
1693 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1694 printf("Options:\n");
1695 printf("Parameters for training:\n");
1696 printf("\t-train <file>\n");
1697 printf("\t\tUse text data from <file> to train the model\n");
1698 printf("\t-output <file>\n");
1699 printf(
1700 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1701 printf("\t-size <int>\n");
1702 printf("\t\tSet size of word vectors; default is 100\n");
1703 printf("\t-window <int>\n");
1704 printf("\t\tSet max skip length between words; default is 5\n");
1705 printf("\t-sample <float>\n");
1706 printf(
1707 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1708 printf(
1709 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1710 printf("\t-hs <int>\n");
1711 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1712 printf("\t-negative <int>\n");
1713 printf(
1714 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1715 printf("\t-negative-classes <file>\n");
1716 printf("\t\tNegative classes to sample from\n");
1717 printf("\t-nce <int>\n");
1718 printf(
1719 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1720 printf("\t-threads <int>\n");
1721 printf("\t\tUse <int> threads (default 12)\n");
1722 printf("\t-iter <int>\n");
1723 printf("\t\tRun more training iterations (default 5)\n");
1724 printf("\t-min-count <int>\n");
1725 printf(
1726 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1727 printf("\t-alpha <float>\n");
1728 printf(
1729 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1730 printf("\t-classes <int>\n");
1731 printf(
1732 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1733 printf("\t-debug <int>\n");
1734 printf(
1735 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1736 printf("\t-binary <int>\n");
1737 printf(
1738 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1739 printf("\t-save-vocab <file>\n");
1740 printf("\t\tThe vocabulary will be saved to <file>\n");
1741 printf("\t-read-vocab <file>\n");
1742 printf(
1743 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1744 printf("\t-read-net <file>\n");
1745 printf(
1746 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1747 printf("\t-save-net <file>\n");
1748 printf("\t\tThe net parameters will be saved to <file>\n");
1749 printf("\t-type <int>\n");
1750 printf(
1751 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1752 printf("\t-cap <int>\n");
1753 printf(
1754 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1755 printf("\nExamples:\n");
1756 printf(
1757 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1758 return 0;
1759 }
1760 output_file[0] = 0;
1761 save_vocab_file[0] = 0;
1762 read_vocab_file[0] = 0;
1763 save_net_file[0] = 0;
1764 read_net_file[0] = 0;
1765 negative_classes_file[0] = 0;
1766 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1767 layer1_size = atoi(argv[i + 1]);
1768 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1769 strcpy(train_file, argv[i + 1]);
1770 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1771 strcpy(save_vocab_file, argv[i + 1]);
1772 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1773 strcpy(read_vocab_file, argv[i + 1]);
1774 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1775 strcpy(save_net_file, argv[i + 1]);
1776 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1777 strcpy(read_net_file, argv[i + 1]);
1778 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1779 debug_mode = atoi(argv[i + 1]);
1780 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1781 binary = atoi(argv[i + 1]);
1782 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1783 type = atoi(argv[i + 1]);
1784 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1785 strcpy(output_file, argv[i + 1]);
1786 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1787 window = atoi(argv[i + 1]);
1788 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1789 sample = atof(argv[i + 1]);
1790 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1791 hs = atoi(argv[i + 1]);
1792 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1793 negative = atoi(argv[i + 1]);
1794 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1795 strcpy(negative_classes_file, argv[i + 1]);
1796 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1797 nce = atoi(argv[i + 1]);
1798 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1799 num_threads = atoi(argv[i + 1]);
1800 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1801 iter = atoi(argv[i + 1]);
1802 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1803 min_count = atoi(argv[i + 1]);
1804 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1805 classes = atoi(argv[i + 1]);
1806 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1807 cap = atoi(argv[i + 1]);
1808 if (type == 0 || type == 2 || type == 4)
1809 alpha = 0.05;
1810 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1811 alpha = atof(argv[i + 1]);
1812 vocab = (struct vocab_word *) calloc(vocab_max_size,
1813 sizeof(struct vocab_word));
1814 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1815 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1816 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1817 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1818 expTable[i] = expTable[i] / (expTable[i] + 1); // i.e. the sigmoid exp(x) / (exp(x) + 1)
1819 }
1820 TrainModel();
1821 return 0;
1822}
1823