1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
27const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
40struct vocab_word *vocab;
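// type selects the architecture: 0 = CBOW, 1 = skip-gram, 2 = CWindow, 3 = structured skip-gram, 4 = SENNA-style (see -type in the usage text)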
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
42 num_threads = 12, min_reduce = 1;
43int *vocab_hash;
44long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
45long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
46 classes = 0;
47real alpha = 0.025, starting_alpha, sample = 1e-3;
48real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
49clock_t start;
50
51real *syn1_window, *syn1neg_window, *syn1nce_window;
52int w_offset, window_layer_size;
53
54int window_hidden_size = 500;
55real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
56 *syn_hidden_word_nce;
57
58int hs = 0, negative = 5;
59const int table_size = 1e8;
60int *table;
61
62long cc = 0; // word rank at which -show-cc starts listing collocations
63
64// contrastive negative sampling
65char negative_classes_file[MAX_STRING];
66int *word_to_group;
67int *group_to_table; //group_size*table_size
68int class_number;
69
70//nce
71real* noise_distribution;
72int nce = 0;
73
74//param caps
75real CAP_VALUE = 50;
76int cap = 0;
77
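// Clamp a single parameter to [-CAP_VALUE, CAP_VALUE]; only applied when the -cap option is enabled.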
78void capParam(real* array, int index) {
79 if (array[index] > CAP_VALUE)
80 array[index] = CAP_VALUE;
81 else if (array[index] < -CAP_VALUE)
82 array[index] = -CAP_VALUE;
83}
84
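// Hard tanh activation and its gradient gate; used by the hidden layer of the type-4 (SENNA-style) model.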
85real hardTanh(real x) {
86 if (x >= 1) {
87 return 1;
88 } else if (x <= -1) {
89 return -1;
90 } else {
91 return x;
92 }
93}
94
95real dHardTanh(real x, real g) {
96 if (x > 1 && g > 0) {
97 return 0;
98 }
99 if (x < -1 && g < 0) {
100 return 0;
101 }
102 return 1;
103}
104
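// Build the unigram table used to draw negative samples: each word receives a share of the table
// proportional to its count raised to the power 0.75. The same distribution is stored in
// noise_distribution for NCE.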
105void InitUnigramTable() {
106 int a, i;
107 long long train_words_pow = 0;
108 real d1, power = 0.75;
109 table = (int *) malloc(table_size * sizeof(int));
110 for (a = 0; a < vocab_size; a++)
111 train_words_pow += pow(vocab[a].cn, power);
112 i = 0;
113 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
114 for (a = 0; a < table_size; a++) {
115 table[a] = i;
116 if (a / (real) table_size > d1) {
117 i++;
118 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
119 }
120 if (i >= vocab_size)
121 i = vocab_size - 1;
122 }
123
124 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
125 for (a = 0; a < vocab_size; a++)
126 noise_distribution[a] = pow(vocab[a].cn, power)
127 / (real) train_words_pow;
128}
129
130// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
131void ReadWord(char *word, FILE *fin) {
132 int a = 0, ch;
133 while (!feof(fin)) {
134 ch = fgetc(fin);
135 if (ch == 13)
136 continue;
137 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
138 if (a > 0) {
139 if (ch == '\n')
140 ungetc(ch, fin);
141 break;
142 }
143 if (ch == '\n') {
144 strcpy(word, (char *) "</s>");
145 return;
146 } else
147 continue;
148 }
149 word[a] = ch;
150 a++;
151 if (a >= MAX_STRING - 1)
152 a--; // Truncate words that are too long
153 }
154 word[a] = 0;
155}
156
157// Returns hash value of a word
158int GetWordHash(char *word) {
159 unsigned long long a, hash = 0;
160 for (a = 0; a < strlen(word); a++)
161 hash = hash * 257 + word[a];
162 hash = hash % vocab_hash_size;
163 return hash;
164}
165
166// Returns position of a word in the vocabulary; if the word is not found, returns -1
167int SearchVocab(char *word) {
168 unsigned int hash = GetWordHash(word);
169 while (1) {
170 if (vocab_hash[hash] == -1)
171 return -1;
172 if (!strcmp(word, vocab[vocab_hash[hash]].word))
173 return vocab_hash[hash];
174 hash = (hash + 1) % vocab_hash_size;
175 }
176 return -1;
177}
178
179// Reads a word and returns its index in the vocabulary
180int ReadWordIndex(FILE *fin) {
181 char word[MAX_STRING];
182 ReadWord(word, fin);
183 if (feof(fin))
184 return -1;
185 return SearchVocab(word);
186}
187
188// Adds a word to the vocabulary
189int AddWordToVocab(char *word) {
190 unsigned int hash, length = strlen(word) + 1;
191 if (length > MAX_STRING)
192 length = MAX_STRING;
193 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
194 strcpy(vocab[vocab_size].word, word);
195 vocab[vocab_size].cn = 0;
196 vocab_size++;
197 // Reallocate memory if needed
198 if (vocab_size + 2 >= vocab_max_size) {
199 vocab_max_size += 1000;
200 vocab = (struct vocab_word *) realloc(vocab,
201 vocab_max_size * sizeof(struct vocab_word));
202 }
203 hash = GetWordHash(word);
204 while (vocab_hash[hash] != -1)
205 hash = (hash + 1) % vocab_hash_size;
206 vocab_hash[hash] = vocab_size - 1;
207 return vocab_size - 1;
208}
209
210// Used later for sorting by word counts
211int VocabCompare(const void *a, const void *b) {
212 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
213}
214
215// Sorts the vocabulary by frequency using word counts
216void SortVocab() {
217 int a, size;
218 unsigned int hash;
219 // Sort the vocabulary and keep </s> at the first position
220 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
221 for (a = 0; a < vocab_hash_size; a++)
222 vocab_hash[a] = -1;
223 size = vocab_size;
224 train_words = 0;
225 for (a = 0; a < size; a++) {
226 // Words occurring less than min_count times will be discarded from the vocab
227 if ((vocab[a].cn < min_count) && (a != 0)) {
228 vocab_size--;
229 free(vocab[a].word);
230 } else {
231 // Hash must be recomputed, since it is no longer valid after sorting
232 hash = GetWordHash(vocab[a].word);
233 while (vocab_hash[hash] != -1)
234 hash = (hash + 1) % vocab_hash_size;
235 vocab_hash[hash] = a;
236 train_words += vocab[a].cn;
237 }
238 }
239 vocab = (struct vocab_word *) realloc(vocab,
240 (vocab_size + 1) * sizeof(struct vocab_word));
241 // Allocate memory for the binary tree construction
242 for (a = 0; a < vocab_size; a++) {
243 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
244 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
245 }
246}
247
248// Reduces the vocabulary by removing infrequent tokens
249void ReduceVocab() {
250 int a, b = 0;
251 unsigned int hash;
252 for (a = 0; a < vocab_size; a++)
253 if (vocab[a].cn > min_reduce) {
254 vocab[b].cn = vocab[a].cn;
255 vocab[b].word = vocab[a].word;
256 b++;
257 } else
258 free(vocab[a].word);
259 vocab_size = b;
260 for (a = 0; a < vocab_hash_size; a++)
261 vocab_hash[a] = -1;
262 for (a = 0; a < vocab_size; a++) {
263 // Hash must be recomputed, since it is no longer valid
264 hash = GetWordHash(vocab[a].word);
265 while (vocab_hash[hash] != -1)
266 hash = (hash + 1) % vocab_hash_size;
267 vocab_hash[hash] = a;
268 }
269 fflush(stdout);
270 min_reduce++;
271}
272
273// Create binary Huffman tree using the word counts
274// Frequent words will have short unique binary codes
275void CreateBinaryTree() {
276 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
277 char code[MAX_CODE_LENGTH];
278 long long *count = (long long *) calloc(vocab_size * 2 + 1,
279 sizeof(long long));
280 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
281 sizeof(long long));
282 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
283 sizeof(long long));
284 for (a = 0; a < vocab_size; a++)
285 count[a] = vocab[a].cn;
286 for (a = vocab_size; a < vocab_size * 2; a++)
287 count[a] = 1e15;
288 pos1 = vocab_size - 1;
289 pos2 = vocab_size;
290 // The following algorithm constructs the Huffman tree by adding one node at a time
291 for (a = 0; a < vocab_size - 1; a++) {
292 // First, find two smallest nodes 'min1, min2'
293 if (pos1 >= 0) {
294 if (count[pos1] < count[pos2]) {
295 min1i = pos1;
296 pos1--;
297 } else {
298 min1i = pos2;
299 pos2++;
300 }
301 } else {
302 min1i = pos2;
303 pos2++;
304 }
305 if (pos1 >= 0) {
306 if (count[pos1] < count[pos2]) {
307 min2i = pos1;
308 pos1--;
309 } else {
310 min2i = pos2;
311 pos2++;
312 }
313 } else {
314 min2i = pos2;
315 pos2++;
316 }
317 count[vocab_size + a] = count[min1i] + count[min2i];
318 parent_node[min1i] = vocab_size + a;
319 parent_node[min2i] = vocab_size + a;
320 binary[min2i] = 1;
321 }
322 // Now assign binary code to each vocabulary word
323 for (a = 0; a < vocab_size; a++) {
324 b = a;
325 i = 0;
326 while (1) {
327 code[i] = binary[b];
328 point[i] = b;
329 i++;
330 b = parent_node[b];
331 if (b == vocab_size * 2 - 2)
332 break;
333 }
334 vocab[a].codelen = i;
335 vocab[a].point[0] = vocab_size - 2;
336 for (b = 0; b < i; b++) {
337 vocab[a].code[i - b - 1] = code[b];
338 vocab[a].point[i - b] = point[b] - vocab_size;
339 }
340 }
341 free(count);
342 free(binary);
343 free(parent_node);
344}
345
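// Build the vocabulary by scanning the training file once; the vocabulary is pruned with
// ReduceVocab() whenever it fills more than 70% of the hash table.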
346void LearnVocabFromTrainFile() {
347 char word[MAX_STRING];
348 FILE *fin;
349 long long a, i;
350 for (a = 0; a < vocab_hash_size; a++)
351 vocab_hash[a] = -1;
352 fin = fopen(train_file, "rb");
353 if (fin == NULL) {
354 printf("ERROR: training data file not found!\n");
355 exit(1);
356 }
357 vocab_size = 0;
358 AddWordToVocab((char *) "</s>");
359 while (1) {
360 ReadWord(word, fin);
361 if (feof(fin))
362 break;
363 train_words++;
364 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
365 printf("%lldK%c", train_words / 1000, 13);
366 fflush(stdout);
367 }
368 i = SearchVocab(word);
369 if (i == -1) {
370 a = AddWordToVocab(word);
371 vocab[a].cn = 1;
372 } else
373 vocab[i].cn++;
374 if (vocab_size > vocab_hash_size * 0.7)
375 ReduceVocab();
376 }
377 SortVocab();
378 if (debug_mode > 0) {
379 printf("Vocab size: %lld\n", vocab_size);
380 printf("Words in train file: %lld\n", train_words);
381 }
382 file_size = ftell(fin);
383 fclose(fin);
384}
385
386void SaveVocab() {
387 long long i;
388 FILE *fo = fopen(save_vocab_file, "wb");
389 for (i = 0; i < vocab_size; i++)
390 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
391 fclose(fo);
392}
393
394void ReadVocab() {
395 long long a, i = 0;
396 char c;
397 char word[MAX_STRING];
398 FILE *fin = fopen(read_vocab_file, "rb");
399 if (fin == NULL) {
400 printf("Vocabulary file not found\n");
401 exit(1);
402 }
403 for (a = 0; a < vocab_hash_size; a++)
404 vocab_hash[a] = -1;
405 vocab_size = 0;
406 while (1) {
407 ReadWord(word, fin);
408 if (feof(fin))
409 break;
410 a = AddWordToVocab(word);
411 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
412 i++;
413 }
414 SortVocab();
415 if (debug_mode > 0) {
416 printf("Vocab size: %lld\n", vocab_size);
417 printf("Words in train file: %lld\n", train_words);
418 }
419 fin = fopen(train_file, "rb");
420 if (fin == NULL) {
421 printf("ERROR: training data file not found!\n");
422 exit(1);
423 }
424 fseek(fin, 0, SEEK_END);
425 file_size = ftell(fin);
426 fclose(fin);
427}
428
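// Read the -negative-classes file and build one unigram table per word class, so that
// negative samples for a word can be drawn from the same class.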
429void InitClassUnigramTable() {
430 long long a, c;
431 printf("loading class unigrams \n");
432 FILE *fin = fopen(negative_classes_file, "rb");
433 if (fin == NULL) {
434 printf("ERROR: class file not found!\n");
435 exit(1);
436 }
437 word_to_group = (int *) malloc(vocab_size * sizeof(int));
438 for (a = 0; a < vocab_size; a++)
439 word_to_group[a] = -1;
440 char class[MAX_STRING];
441 char prev_class[MAX_STRING];
442 prev_class[0] = 0;
443 char word[MAX_STRING];
444 class_number = -1;
445 while (1) {
446 if (feof(fin))
447 break;
448 ReadWord(class, fin);
449 ReadWord(word, fin);
450 int word_index = SearchVocab(word);
451 if (word_index != -1) {
452 if (strcmp(class, prev_class) != 0) {
453 class_number++;
454 strcpy(prev_class, class);
455 }
456 word_to_group[word_index] = class_number;
457 }
458 ReadWord(word, fin);
459 }
460 class_number++;
461 fclose(fin);
462
463 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
464 long long train_words_pow = 0;
465 real d1, power = 0.75;
466
467 for (c = 0; c < class_number; c++) {
468 long long offset = c * table_size;
469 train_words_pow = 0;
470 for (a = 0; a < vocab_size; a++)
471 if (word_to_group[a] == c)
472 train_words_pow += pow(vocab[a].cn, power);
473 int i = 0;
474 while (i < vocab_size && word_to_group[i] != c)
475 i++;
476 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
477 for (a = 0; a < table_size; a++) {
478 //printf("index %lld , word %d\n", a, i);
479 group_to_table[offset + a] = i;
480 if (a / (real) table_size > d1) {
481 i++;
482 while (i < vocab_size && word_to_group[i] != c)
483 i++;
484 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
485 }
486 if (i >= vocab_size)
487 for (i = vocab_size - 1; i >= 0 && word_to_group[i] != c; i--)
488 ; // walk back to the last word that belongs to class c
489 }
490 }
491}
492
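// Write syn0 and syn1neg_window to -save-net; only supported for type 3 with negative sampling.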
493void SaveNet() {
494 if(type != 3 || negative <= 0) {
495 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
496 return;
497 }
498
499 FILE *fnet = fopen(save_net_file, "wb");
500 if (fnet == NULL) {
501 printf("Net parameter file not found\n");
502 exit(1);
503 }
504 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
505 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
506 fclose(fnet);
507}
508
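// Allocate and initialize all network parameters; which output matrices are created depends on
// -type, -hs, -negative and -nce. With -read-net, syn0 and syn1neg_window are loaded from file
// instead of being initialized randomly.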
509void InitNet() {
510 long long a, b;
511 unsigned long long next_random = 1;
512 window_layer_size = layer1_size * window * 2;
513 a = posix_memalign((void **) &syn0, 128,
514 (long long) vocab_size * layer1_size * sizeof(real));
515 if (syn0 == NULL) {
516 printf("Memory allocation failed\n");
517 exit(1);
518 }
519
520 if (hs) {
521 a = posix_memalign((void **) &syn1, 128,
522 (long long) vocab_size * layer1_size * sizeof(real));
523 if (syn1 == NULL) {
524 printf("Memory allocation failed\n");
525 exit(1);
526 }
527 a = posix_memalign((void **) &syn1_window, 128,
528 (long long) vocab_size * window_layer_size * sizeof(real));
529 if (syn1_window == NULL) {
530 printf("Memory allocation failed\n");
531 exit(1);
532 }
533 a = posix_memalign((void **) &syn_hidden_word, 128,
534 (long long) vocab_size * window_hidden_size * sizeof(real));
535 if (syn_hidden_word == NULL) {
536 printf("Memory allocation failed\n");
537 exit(1);
538 }
539
540 for (a = 0; a < vocab_size; a++)
541 for (b = 0; b < layer1_size; b++)
542 syn1[a * layer1_size + b] = 0;
543 for (a = 0; a < vocab_size; a++)
544 for (b = 0; b < window_layer_size; b++)
545 syn1_window[a * window_layer_size + b] = 0;
546 for (a = 0; a < vocab_size; a++)
547 for (b = 0; b < window_hidden_size; b++)
548 syn_hidden_word[a * window_hidden_size + b] = 0;
549 }
550 if (negative > 0) {
551 if(type == 0) {
552 a = posix_memalign((void **) &syn1neg, 128,
553 (long long) vocab_size * layer1_size * sizeof(real));
554 if (syn1neg == NULL) {
555 printf("Memory allocation failed\n");
556 exit(1);
557 }
558 for (a = 0; a < vocab_size; a++)
559 for (b = 0; b < layer1_size; b++)
560 syn1neg[a * layer1_size + b] = 0;
561 } else if (type == 3) {
562 a = posix_memalign((void **) &syn1neg_window, 128,
563 (long long) vocab_size * window_layer_size * sizeof(real));
564 if (syn1neg_window == NULL) {
565 printf("Memory allocation failed\n");
566 exit(1);
567 }
568 for (a = 0; a < vocab_size; a++)
569 for (b = 0; b < window_layer_size; b++)
570 syn1neg_window[a * window_layer_size + b] = 0;
571 } else if (type == 4) {
572 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
573 (long long) vocab_size * window_hidden_size * sizeof(real));
574 if (syn_hidden_word_neg == NULL) {
575 printf("Memory allocation failed\n");
576 exit(1);
577 }
578 for (a = 0; a < vocab_size; a++)
579 for (b = 0; b < window_hidden_size; b++)
580 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
581 }
582 }
583 if (nce > 0) {
584 a = posix_memalign((void **) &syn1nce, 128,
585 (long long) vocab_size * layer1_size * sizeof(real));
586 if (syn1nce == NULL) {
587 printf("Memory allocation failed\n");
588 exit(1);
589 }
590 a = posix_memalign((void **) &syn1nce_window, 128,
591 (long long) vocab_size * window_layer_size * sizeof(real));
592 if (syn1nce_window == NULL) {
593 printf("Memory allocation failed\n");
594 exit(1);
595 }
596 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
597 (long long) vocab_size * window_hidden_size * sizeof(real));
598 if (syn_hidden_word_nce == NULL) {
599 printf("Memory allocation failed\n");
600 exit(1);
601 }
602
603 for (a = 0; a < vocab_size; a++)
604 for (b = 0; b < layer1_size; b++)
605 syn1nce[a * layer1_size + b] = 0;
606 for (a = 0; a < vocab_size; a++)
607 for (b = 0; b < window_layer_size; b++)
608 syn1nce_window[a * window_layer_size + b] = 0;
609 for (a = 0; a < vocab_size; a++)
610 for (b = 0; b < window_hidden_size; b++)
611 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
612 }
613
614 if(type == 4) {
615 a = posix_memalign((void **) &syn_window_hidden, 128,
616 window_hidden_size * window_layer_size * sizeof(real));
617 if (syn_window_hidden == NULL) {
618 printf("Memory allocation failed\n");
619 exit(1);
620 }
621 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
622 next_random = next_random * (unsigned long long) 25214903917 + 11;
623 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
624 - 0.5) / (window_hidden_size * window_layer_size);
625 }
626 }
627
628 if (read_net_file[0] == 0) {
629 for (a = 0; a < vocab_size; a++)
630 for (b = 0; b < layer1_size; b++) {
631 next_random = next_random * (unsigned long long) 25214903917
632 + 11;
633 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
634 / (real) 65536) - 0.5) / layer1_size;
635 }
636 } else if(type == 3 && negative > 0) {
637 FILE *fnet = fopen(read_net_file, "rb");
638 if (fnet == NULL) {
639 printf("Net parameter file not found\n");
640 exit(1);
641 }
642 fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
643 fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
644 fclose(fnet);
645 } else {
646 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
647 exit(-1);
648 }
649
650 CreateBinaryTree();
651}
652
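// One training thread: processes its slice of the training file and applies the update rules
// (hierarchical softmax, negative sampling and/or NCE) for the selected architecture.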
653void *TrainModelThread(void *id) {
654 long long a, b, d, cw, word, last_word, sentence_length = 0,
655 sentence_position = 0;
656 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
657 long long l1, l2, c, target, label, local_iter = iter;
658 unsigned long long next_random = (long long) id;
659 real f, g;
660 clock_t now;
661 int input_len_1 = layer1_size;
662 int window_offset = -1;
663 if (type == 2 || type == 4) {
664 input_len_1 = window_layer_size;
665 }
666 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
667 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
668
669 int input_len_2 = 0;
670 if (type == 4) {
671 input_len_2 = window_hidden_size;
672 }
673 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
674 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
675
676 FILE *fi = fopen(train_file, "rb");
677 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
678 while (1) {
679 if (word_count - last_word_count > 10000) {
680 word_count_actual += word_count - last_word_count;
681 last_word_count = word_count;
682 if ((debug_mode > 1)) {
683 now = clock();
684 printf(
685 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
686 13, alpha,
687 word_count_actual / (real) (iter * train_words + 1)
688 * 100,
689 word_count_actual
690 / ((real) (now - start + 1)
691 / (real) CLOCKS_PER_SEC * 1000));
692 fflush(stdout);
693 }
694 alpha = starting_alpha
695 * (1 - word_count_actual / (real) (iter * train_words + 1));
696 if (alpha < starting_alpha * 0.0001)
697 alpha = starting_alpha * 0.0001;
698 }
699 if (sentence_length == 0) {
700 while (1) {
701 word = ReadWordIndex(fi);
702 if (feof(fi))
703 break;
704 if (word == -1)
705 continue;
706 word_count++;
707 if (word == 0)
708 break;
709 // The subsampling randomly discards frequent words while keeping the ranking the same
710 if (sample > 0) {
711 real ran = (sqrt(vocab[word].cn / (sample * train_words))
712 + 1) * (sample * train_words) / vocab[word].cn;
713 next_random = next_random * (unsigned long long) 25214903917
714 + 11;
715 if (ran < (next_random & 0xFFFF) / (real) 65536)
716 continue;
717 }
718 sen[sentence_length] = word;
719 sentence_length++;
720 if (sentence_length >= MAX_SENTENCE_LENGTH)
721 break;
722 }
723 sentence_position = 0;
724 }
725 if (feof(fi) || (word_count > train_words / num_threads)) {
726 word_count_actual += word_count - last_word_count;
727 local_iter--;
728 if (local_iter == 0)
729 break;
730 word_count = 0;
731 last_word_count = 0;
732 sentence_length = 0;
733 fseek(fi, file_size / (long long) num_threads * (long long) id,
734 SEEK_SET);
735 continue;
736 }
737 word = sen[sentence_position];
738 if (word == -1)
739 continue;
740 for (c = 0; c < input_len_1; c++)
741 neu1[c] = 0;
742 for (c = 0; c < input_len_1; c++)
743 neu1e[c] = 0;
744 for (c = 0; c < input_len_2; c++)
745 neu2[c] = 0;
746 for (c = 0; c < input_len_2; c++)
747 neu2e[c] = 0;
748 next_random = next_random * (unsigned long long) 25214903917 + 11;
749 b = next_random % window;
750 if (type == 0) { //train the cbow architecture
751 // in -> hidden
752 cw = 0;
753 for (a = b; a < window * 2 + 1 - b; a++)
754 if (a != window) {
755 c = sentence_position - window + a;
756 if (c < 0)
757 continue;
758 if (c >= sentence_length)
759 continue;
760 last_word = sen[c];
761 if (last_word == -1)
762 continue;
763 for (c = 0; c < layer1_size; c++)
764 neu1[c] += syn0[c + last_word * layer1_size];
765 cw++;
766 }
767 if (cw) {
768 for (c = 0; c < layer1_size; c++)
769 neu1[c] /= cw;
770 if (hs)
771 for (d = 0; d < vocab[word].codelen; d++) {
772 f = 0;
773 l2 = vocab[word].point[d] * layer1_size;
774 // Propagate hidden -> output
775 for (c = 0; c < layer1_size; c++)
776 f += neu1[c] * syn1[c + l2];
777 if (f <= -MAX_EXP)
778 continue;
779 else if (f >= MAX_EXP)
780 continue;
781 else
782 f = expTable[(int) ((f + MAX_EXP)
783 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
784 // 'g' is the gradient multiplied by the learning rate
785 g = (1 - vocab[word].code[d] - f) * alpha;
786 // Propagate errors output -> hidden
787 for (c = 0; c < layer1_size; c++)
788 neu1e[c] += g * syn1[c + l2];
789 // Learn weights hidden -> output
790 for (c = 0; c < layer1_size; c++)
791 syn1[c + l2] += g * neu1[c];
792 if (cap == 1)
793 for (c = 0; c < layer1_size; c++)
794 capParam(syn1, c + l2);
795 }
796 // NEGATIVE SAMPLING
797 if (negative > 0)
798 for (d = 0; d < negative + 1; d++) {
799 if (d == 0) {
800 target = word;
801 label = 1;
802 } else {
803 next_random = next_random
804 * (unsigned long long) 25214903917 + 11;
805 if (word_to_group != NULL
806 && word_to_group[word] != -1) {
807 target = word;
808 while (target == word) {
809 target = group_to_table[word_to_group[word]
810 * table_size
811 + (next_random >> 16) % table_size];
812 next_random = next_random
813 * (unsigned long long) 25214903917
814 + 11;
815 }
816 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
817 } else {
818 target =
819 table[(next_random >> 16) % table_size];
820 }
821 if (target == 0)
822 target = next_random % (vocab_size - 1) + 1;
823 if (target == word)
824 continue;
825 label = 0;
826 }
827 l2 = target * layer1_size;
828 f = 0;
829 for (c = 0; c < layer1_size; c++)
830 f += neu1[c] * syn1neg[c + l2];
831 if (f > MAX_EXP)
832 g = (label - 1) * alpha;
833 else if (f < -MAX_EXP)
834 g = (label - 0) * alpha;
835 else
836 g = (label
837 - expTable[(int) ((f + MAX_EXP)
838 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
839 * alpha;
840 for (c = 0; c < layer1_size; c++)
841 neu1e[c] += g * syn1neg[c + l2];
842 for (c = 0; c < layer1_size; c++)
843 syn1neg[c + l2] += g * neu1[c];
844 if (cap == 1)
845 for (c = 0; c < layer1_size; c++)
846 capParam(syn1neg, c + l2);
847 }
848 // Noise Contrastive Estimation
849 if (nce > 0)
850 for (d = 0; d < nce + 1; d++) {
851 if (d == 0) {
852 target = word;
853 label = 1;
854 } else {
855 next_random = next_random
856 * (unsigned long long) 25214903917 + 11;
857 if (word_to_group != NULL
858 && word_to_group[word] != -1) {
859 target = word;
860 while (target == word) {
861 target = group_to_table[word_to_group[word]
862 * table_size
863 + (next_random >> 16) % table_size];
864 next_random = next_random
865 * (unsigned long long) 25214903917
866 + 11;
867 }
868 } else {
869 target =
870 table[(next_random >> 16) % table_size];
871 }
872 if (target == 0)
873 target = next_random % (vocab_size - 1) + 1;
874 if (target == word)
875 continue;
876 label = 0;
877 }
878 l2 = target * layer1_size;
879 f = 0;
880
881 for (c = 0; c < layer1_size; c++)
882 f += neu1[c] * syn1nce[c + l2];
883 if (f > MAX_EXP)
884 g = (label - 1) * alpha;
885 else if (f < -MAX_EXP)
886 g = (label - 0) * alpha;
887 else {
888 f = exp(f);
889 g =
890 (label
891 - f
892 / (noise_distribution[target]
893 * nce + f)) * alpha;
894 }
895 for (c = 0; c < layer1_size; c++)
896 neu1e[c] += g * syn1nce[c + l2];
897 for (c = 0; c < layer1_size; c++)
898 syn1nce[c + l2] += g * neu1[c];
899 if (cap == 1)
900 for (c = 0; c < layer1_size; c++)
901 capParam(syn1nce, c + l2);
902 }
903 // hidden -> in
904 for (a = b; a < window * 2 + 1 - b; a++)
905 if (a != window) {
906 c = sentence_position - window + a;
907 if (c < 0)
908 continue;
909 if (c >= sentence_length)
910 continue;
911 last_word = sen[c];
912 if (last_word == -1)
913 continue;
914 for (c = 0; c < layer1_size; c++)
915 syn0[c + last_word * layer1_size] += neu1e[c];
916 }
917 }
918 } else if (type == 1) { //train skip-gram
919 for (a = b; a < window * 2 + 1 - b; a++)
920 if (a != window) {
921 c = sentence_position - window + a;
922 if (c < 0)
923 continue;
924 if (c >= sentence_length)
925 continue;
926 last_word = sen[c];
927 if (last_word == -1)
928 continue;
929 l1 = last_word * layer1_size;
930 for (c = 0; c < layer1_size; c++)
931 neu1e[c] = 0;
932 // HIERARCHICAL SOFTMAX
933 if (hs)
934 for (d = 0; d < vocab[word].codelen; d++) {
935 f = 0;
936 l2 = vocab[word].point[d] * layer1_size;
937 // Propagate hidden -> output
938 for (c = 0; c < layer1_size; c++)
939 f += syn0[c + l1] * syn1[c + l2];
940 if (f <= -MAX_EXP)
941 continue;
942 else if (f >= MAX_EXP)
943 continue;
944 else
945 f = expTable[(int) ((f + MAX_EXP)
946 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
947 // 'g' is the gradient multiplied by the learning rate
948 g = (1 - vocab[word].code[d] - f) * alpha;
949 // Propagate errors output -> hidden
950 for (c = 0; c < layer1_size; c++)
951 neu1e[c] += g * syn1[c + l2];
952 // Learn weights hidden -> output
953 for (c = 0; c < layer1_size; c++)
954 syn1[c + l2] += g * syn0[c + l1];
955 if (cap == 1)
956 for (c = 0; c < layer1_size; c++)
957 capParam(syn1, c + l2);
958 }
959 // NEGATIVE SAMPLING
960 if (negative > 0)
961 for (d = 0; d < negative + 1; d++) {
962 if (d == 0) {
963 target = word;
964 label = 1;
965 } else {
966 next_random = next_random
967 * (unsigned long long) 25214903917 + 11;
968 if (word_to_group != NULL
969 && word_to_group[word] != -1) {
970 target = word;
971 while (target == word) {
972 target =
973 group_to_table[word_to_group[word]
974 * table_size
975 + (next_random >> 16)
976 % table_size];
977 next_random =
978 next_random
979 * (unsigned long long) 25214903917
980 + 11;
981 }
982 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
983 } else {
984 target = table[(next_random >> 16)
985 % table_size];
986 }
987 if (target == 0)
988 target = next_random % (vocab_size - 1) + 1;
989 if (target == word)
990 continue;
991 label = 0;
992 }
993 l2 = target * layer1_size;
994 f = 0;
995 for (c = 0; c < layer1_size; c++)
996 f += syn0[c + l1] * syn1neg[c + l2];
997 if (f > MAX_EXP)
998 g = (label - 1) * alpha;
999 else if (f < -MAX_EXP)
1000 g = (label - 0) * alpha;
1001 else
1002 g =
1003 (label
1004 - expTable[(int) ((f + MAX_EXP)
1005 * (EXP_TABLE_SIZE
1006 / MAX_EXP / 2))])
1007 * alpha;
1008 for (c = 0; c < layer1_size; c++)
1009 neu1e[c] += g * syn1neg[c + l2];
1010 for (c = 0; c < layer1_size; c++)
1011 syn1neg[c + l2] += g * syn0[c + l1];
1012 if (cap == 1)
1013 for (c = 0; c < layer1_size; c++)
1014 capParam(syn1neg, c + l2);
1015 }
1016 //Noise Contrastive Estimation
1017 if (nce > 0)
1018 for (d = 0; d < nce + 1; d++) {
1019 if (d == 0) {
1020 target = word;
1021 label = 1;
1022 } else {
1023 next_random = next_random
1024 * (unsigned long long) 25214903917 + 11;
1025 if (word_to_group != NULL
1026 && word_to_group[word] != -1) {
1027 target = word;
1028 while (target == word) {
1029 target =
1030 group_to_table[word_to_group[word]
1031 * table_size
1032 + (next_random >> 16)
1033 % table_size];
1034 next_random =
1035 next_random
1036 * (unsigned long long) 25214903917
1037 + 11;
1038 }
1039 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1040 } else {
1041 target = table[(next_random >> 16)
1042 % table_size];
1043 }
1044 if (target == 0)
1045 target = next_random % (vocab_size - 1) + 1;
1046 if (target == word)
1047 continue;
1048 label = 0;
1049 }
1050 l2 = target * layer1_size;
1051 f = 0;
1052 for (c = 0; c < layer1_size; c++)
1053 f += syn0[c + l1] * syn1nce[c + l2];
1054 if (f > MAX_EXP)
1055 g = (label - 1) * alpha;
1056 else if (f < -MAX_EXP)
1057 g = (label - 0) * alpha;
1058 else {
1059 f = exp(f);
1060 g = (label
1061 - f
1062 / (noise_distribution[target]
1063 * nce + f)) * alpha;
1064 }
1065 for (c = 0; c < layer1_size; c++)
1066 neu1e[c] += g * syn1nce[c + l2];
1067 for (c = 0; c < layer1_size; c++)
1068 syn1nce[c + l2] += g * syn0[c + l1];
1069 if (cap == 1)
1070 for (c = 0; c < layer1_size; c++)
1071 capParam(syn1nce, c + l2);
1072 }
1073 // Learn weights input -> hidden
1074 for (c = 0; c < layer1_size; c++)
1075 syn0[c + l1] += neu1e[c];
1076 }
1077 } else if (type == 2) { //train the cwindow architecture
1078 // in -> hidden
1079 cw = 0;
1080 for (a = 0; a < window * 2 + 1; a++)
1081 if (a != window) {
1082 c = sentence_position - window + a;
1083 if (c < 0)
1084 continue;
1085 if (c >= sentence_length)
1086 continue;
1087 last_word = sen[c];
1088 if (last_word == -1)
1089 continue;
1090 window_offset = a * layer1_size;
1091 if (a > window)
1092 window_offset -= layer1_size;
1093 for (c = 0; c < layer1_size; c++)
1094 neu1[c + window_offset] += syn0[c
1095 + last_word * layer1_size];
1096 cw++;
1097 }
1098 if (cw) {
1099 if (hs)
1100 for (d = 0; d < vocab[word].codelen; d++) {
1101 f = 0;
1102 l2 = vocab[word].point[d] * window_layer_size;
1103 // Propagate hidden -> output
1104 for (c = 0; c < window_layer_size; c++)
1105 f += neu1[c] * syn1_window[c + l2];
1106 if (f <= -MAX_EXP)
1107 continue;
1108 else if (f >= MAX_EXP)
1109 continue;
1110 else
1111 f = expTable[(int) ((f + MAX_EXP)
1112 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1113 // 'g' is the gradient multiplied by the learning rate
1114 g = (1 - vocab[word].code[d] - f) * alpha;
1115 // Propagate errors output -> hidden
1116 for (c = 0; c < window_layer_size; c++)
1117 neu1e[c] += g * syn1_window[c + l2];
1118 // Learn weights hidden -> output
1119 for (c = 0; c < window_layer_size; c++)
1120 syn1_window[c + l2] += g * neu1[c];
1121 if (cap == 1)
1122 for (c = 0; c < window_layer_size; c++)
1123 capParam(syn1_window, c + l2);
1124 }
1125 // NEGATIVE SAMPLING
1126 if (negative > 0)
1127 for (d = 0; d < negative + 1; d++) {
1128 if (d == 0) {
1129 target = word;
1130 label = 1;
1131 } else {
1132 next_random = next_random
1133 * (unsigned long long) 25214903917 + 11;
1134 if (word_to_group != NULL
1135 && word_to_group[word] != -1) {
1136 target = word;
1137 while (target == word) {
1138 target = group_to_table[word_to_group[word]
1139 * table_size
1140 + (next_random >> 16) % table_size];
1141 next_random = next_random
1142 * (unsigned long long) 25214903917
1143 + 11;
1144 }
1145 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1146 } else {
1147 target =
1148 table[(next_random >> 16) % table_size];
1149 }
1150 if (target == 0)
1151 target = next_random % (vocab_size - 1) + 1;
1152 if (target == word)
1153 continue;
1154 label = 0;
1155 }
1156 l2 = target * window_layer_size;
1157 f = 0;
1158 for (c = 0; c < window_layer_size; c++)
1159 f += neu1[c] * syn1neg_window[c + l2];
1160 if (f > MAX_EXP)
1161 g = (label - 1) * alpha;
1162 else if (f < -MAX_EXP)
1163 g = (label - 0) * alpha;
1164 else
1165 g = (label
1166 - expTable[(int) ((f + MAX_EXP)
1167 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1168 * alpha;
1169 for (c = 0; c < window_layer_size; c++)
1170 neu1e[c] += g * syn1neg_window[c + l2];
1171 for (c = 0; c < window_layer_size; c++)
1172 syn1neg_window[c + l2] += g * neu1[c];
1173 if (cap == 1)
1174 for (c = 0; c < window_layer_size; c++)
1175 capParam(syn1neg_window, c + l2);
1176 }
1177 // Noise Contrastive Estimation
1178 if (nce > 0)
1179 for (d = 0; d < nce + 1; d++) {
1180 if (d == 0) {
1181 target = word;
1182 label = 1;
1183 } else {
1184 next_random = next_random
1185 * (unsigned long long) 25214903917 + 11;
1186 if (word_to_group != NULL
1187 && word_to_group[word] != -1) {
1188 target = word;
1189 while (target == word) {
1190 target = group_to_table[word_to_group[word]
1191 * table_size
1192 + (next_random >> 16) % table_size];
1193 next_random = next_random
1194 * (unsigned long long) 25214903917
1195 + 11;
1196 }
1197 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1198 } else {
1199 target =
1200 table[(next_random >> 16) % table_size];
1201 }
1202 if (target == 0)
1203 target = next_random % (vocab_size - 1) + 1;
1204 if (target == word)
1205 continue;
1206 label = 0;
1207 }
1208 l2 = target * window_layer_size;
1209 f = 0;
1210 for (c = 0; c < window_layer_size; c++)
1211 f += neu1[c] * syn1nce_window[c + l2];
1212 if (f > MAX_EXP)
1213 g = (label - 1) * alpha;
1214 else if (f < -MAX_EXP)
1215 g = (label - 0) * alpha;
1216 else {
1217 f = exp(f);
1218 g =
1219 (label
1220 - f
1221 / (noise_distribution[target]
1222 * nce + f)) * alpha;
1223 }
1224 for (c = 0; c < window_layer_size; c++)
1225 neu1e[c] += g * syn1nce_window[c + l2];
1226 for (c = 0; c < window_layer_size; c++)
1227 syn1nce_window[c + l2] += g * neu1[c];
1228 if (cap == 1)
1229 for (c = 0; c < window_layer_size; c++)
1230 capParam(syn1nce_window, c + l2);
1231 }
1232 // hidden -> in
1233 for (a = 0; a < window * 2 + 1; a++)
1234 if (a != window) {
1235 c = sentence_position - window + a;
1236 if (c < 0)
1237 continue;
1238 if (c >= sentence_length)
1239 continue;
1240 last_word = sen[c];
1241 if (last_word == -1)
1242 continue;
1243 window_offset = a * layer1_size;
1244 if (a > window)
1245 window_offset -= layer1_size;
1246 for (c = 0; c < layer1_size; c++)
1247 syn0[c + last_word * layer1_size] += neu1e[c
1248 + window_offset];
1249 }
1250 }
1251 } else if (type == 3) { //train structured skip-gram
1252 for (a = 0; a < window * 2 + 1; a++)
1253 if (a != window) {
1254 c = sentence_position - window + a;
1255 if (c < 0)
1256 continue;
1257 if (c >= sentence_length)
1258 continue;
1259 last_word = sen[c];
1260 if (last_word == -1)
1261 continue;
1262 l1 = last_word * layer1_size;
1263 window_offset = a * layer1_size;
1264 if (a > window)
1265 window_offset -= layer1_size;
1266 for (c = 0; c < layer1_size; c++)
1267 neu1e[c] = 0;
1268 // HIERARCHICAL SOFTMAX
1269 if (hs)
1270 for (d = 0; d < vocab[word].codelen; d++) {
1271 f = 0;
1272 l2 = vocab[word].point[d] * window_layer_size;
1273 // Propagate hidden -> output
1274 for (c = 0; c < layer1_size; c++)
1275 f += syn0[c + l1]
1276 * syn1_window[c + l2 + window_offset];
1277 if (f <= -MAX_EXP)
1278 continue;
1279 else if (f >= MAX_EXP)
1280 continue;
1281 else
1282 f = expTable[(int) ((f + MAX_EXP)
1283 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1284 // 'g' is the gradient multiplied by the learning rate
1285 g = (1 - vocab[word].code[d] - f) * alpha;
1286 // Propagate errors output -> hidden
1287 for (c = 0; c < layer1_size; c++)
1288 neu1e[c] += g
1289 * syn1_window[c + l2 + window_offset];
1290 // Learn weights hidden -> output
1291 for (c = 0; c < layer1_size; c++)
1292 syn1_window[c + l2 + window_offset] += g
1293 * syn0[c + l1];
1294 if (cap == 1)
1295 for (c = 0; c < layer1_size; c++)
1296 capParam(syn1_window, c + l2 + window_offset);
1297 }
1298 // NEGATIVE SAMPLING
1299 if (negative > 0)
1300 for (d = 0; d < negative + 1; d++) {
1301 if (d == 0) {
1302 target = word;
1303 label = 1;
1304 } else {
1305 next_random = next_random
1306 * (unsigned long long) 25214903917 + 11;
1307 if (word_to_group != NULL
1308 && word_to_group[word] != -1) {
1309 target = word;
1310 while (target == word) {
1311 target =
1312 group_to_table[word_to_group[word]
1313 * table_size
1314 + (next_random >> 16)
1315 % table_size];
1316 next_random =
1317 next_random
1318 * (unsigned long long) 25214903917
1319 + 11;
1320 }
1321 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1322 } else {
1323 target = table[(next_random >> 16)
1324 % table_size];
1325 }
1326 if (target == 0)
1327 target = next_random % (vocab_size - 1) + 1;
1328 if (target == word)
1329 continue;
1330 label = 0;
1331 }
1332 l2 = target * window_layer_size;
1333 f = 0;
1334 for (c = 0; c < layer1_size; c++)
1335 f +=
1336 syn0[c + l1]
1337 * syn1neg_window[c + l2
1338 + window_offset];
1339 if (f > MAX_EXP)
1340 g = (label - 1) * alpha;
1341 else if (f < -MAX_EXP)
1342 g = (label - 0) * alpha;
1343 else
1344 g =
1345 (label
1346 - expTable[(int) ((f + MAX_EXP)
1347 * (EXP_TABLE_SIZE
1348 / MAX_EXP / 2))])
1349 * alpha;
1350 if(debug_mode > 2 && ((long long) id) == 0) {
1351 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1352 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1353 }
1354 for (c = 0; c < layer1_size; c++)
1355 neu1e[c] +=
1356 g
1357 * syn1neg_window[c + l2
1358 + window_offset];
1359 for (c = 0; c < layer1_size; c++)
1360 syn1neg_window[c + l2 + window_offset] += g
1361 * syn0[c + l1];
1362 if (cap == 1)
1363 for (c = 0; c < layer1_size; c++)
1364 capParam(syn1neg_window,
1365 c + l2 + window_offset);
1366 }
1367 // Noise Contrastive Estimation
1368 if (nce > 0)
1369 for (d = 0; d < nce + 1; d++) {
1370 if (d == 0) {
1371 target = word;
1372 label = 1;
1373 } else {
1374 next_random = next_random
1375 * (unsigned long long) 25214903917 + 11;
1376 if (word_to_group != NULL
1377 && word_to_group[word] != -1) {
1378 target = word;
1379 while (target == word) {
1380 target =
1381 group_to_table[word_to_group[word]
1382 * table_size
1383 + (next_random >> 16)
1384 % table_size];
1385 next_random =
1386 next_random
1387 * (unsigned long long) 25214903917
1388 + 11;
1389 }
1390 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1391 } else {
1392 target = table[(next_random >> 16)
1393 % table_size];
1394 }
1395 if (target == 0)
1396 target = next_random % (vocab_size - 1) + 1;
1397 if (target == word)
1398 continue;
1399 label = 0;
1400 }
1401 l2 = target * window_layer_size;
1402 f = 0;
1403 for (c = 0; c < layer1_size; c++)
1404 f +=
1405 syn0[c + l1]
1406 * syn1nce_window[c + l2
1407 + window_offset];
1408 if (f > MAX_EXP)
1409 g = (label - 1) * alpha;
1410 else if (f < -MAX_EXP)
1411 g = (label - 0) * alpha;
1412 else {
1413 f = exp(f);
1414 g = (label
1415 - f
1416 / (noise_distribution[target]
1417 * nce + f)) * alpha;
1418 }
1419 for (c = 0; c < layer1_size; c++)
1420 neu1e[c] +=
1421 g
1422 * syn1nce_window[c + l2
1423 + window_offset];
1424 for (c = 0; c < layer1_size; c++)
1425 syn1nce_window[c + l2 + window_offset] += g
1426 * syn0[c + l1];
1427 if (cap == 1)
1428 for (c = 0; c < layer1_size; c++)
1429 capParam(syn1nce_window,
1430 c + l2 + window_offset);
1431 }
1432 // Learn weights input -> hidden
1433 for (c = 0; c < layer1_size; c++) {
1434 syn0[c + l1] += neu1e[c];
1435 if (syn0[c + l1] > 50)
1436 syn0[c + l1] = 50;
1437 if (syn0[c + l1] < -50)
1438 syn0[c + l1] = -50;
1439 }
1440 }
1441 } else if (type == 4) { // train the SENNA-style architecture
1442 // in -> hidden
1443 cw = 0;
1444 for (a = 0; a < window * 2 + 1; a++)
1445 if (a != window) {
1446 c = sentence_position - window + a;
1447 if (c < 0)
1448 continue;
1449 if (c >= sentence_length)
1450 continue;
1451 last_word = sen[c];
1452 if (last_word == -1)
1453 continue;
1454 window_offset = a * layer1_size;
1455 if (a > window)
1456 window_offset -= layer1_size;
1457 for (c = 0; c < layer1_size; c++)
1458 neu1[c + window_offset] += syn0[c
1459 + last_word * layer1_size];
1460 cw++;
1461 }
1462 if (cw) {
1463 for (a = 0; a < window_hidden_size; a++) {
1464 c = a * window_layer_size;
1465 for (b = 0; b < window_layer_size; b++) {
1466 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1467 }
1468 }
1469 if (hs)
1470 for (d = 0; d < vocab[word].codelen; d++) {
1471 f = 0;
1472 l2 = vocab[word].point[d] * window_hidden_size;
1473 // Propagate hidden -> output
1474 for (c = 0; c < window_hidden_size; c++)
1475 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1476 if (f <= -MAX_EXP)
1477 continue;
1478 else if (f >= MAX_EXP)
1479 continue;
1480 else
1481 f = expTable[(int) ((f + MAX_EXP)
1482 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1483 // 'g' is the gradient multiplied by the learning rate
1484 g = (1 - vocab[word].code[d] - f) * alpha;
1485 // Propagate errors output -> hidden
1486 for (c = 0; c < window_hidden_size; c++)
1487 neu2e[c] += dHardTanh(neu2[c], g) * g
1488 * syn_hidden_word[c + l2];
1489 // Learn weights hidden -> output
1490 for (c = 0; c < window_hidden_size; c++)
1491 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1492 * neu2[c];
1493 }
1494 // NEGATIVE SAMPLING
1495 if (negative > 0)
1496 for (d = 0; d < negative + 1; d++) {
1497 if (d == 0) {
1498 target = word;
1499 label = 1;
1500 } else {
1501 next_random = next_random
1502 * (unsigned long long) 25214903917 + 11;
1503 if (word_to_group != NULL
1504 && word_to_group[word] != -1) {
1505 target = word;
1506 while (target == word) {
1507 target = group_to_table[word_to_group[word]
1508 * table_size
1509 + (next_random >> 16) % table_size];
1510 next_random = next_random
1511 * (unsigned long long) 25214903917
1512 + 11;
1513 }
1514 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1515 } else {
1516 target =
1517 table[(next_random >> 16) % table_size];
1518 }
1519 if (target == 0)
1520 target = next_random % (vocab_size - 1) + 1;
1521 if (target == word)
1522 continue;
1523 label = 0;
1524 }
1525 l2 = target * window_hidden_size;
1526 f = 0;
1527 for (c = 0; c < window_hidden_size; c++)
1528 f += hardTanh(neu2[c])
1529 * syn_hidden_word_neg[c + l2];
1530 if (f > MAX_EXP)
1531 g = (label - 1) * alpha / negative;
1532 else if (f < -MAX_EXP)
1533 g = (label - 0) * alpha / negative;
1534 else
1535 g = (label
1536 - expTable[(int) ((f + MAX_EXP)
1537 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1538 * alpha / negative;
1539 for (c = 0; c < window_hidden_size; c++)
1540 neu2e[c] += dHardTanh(neu2[c], g) * g
1541 * syn_hidden_word_neg[c + l2];
1542 for (c = 0; c < window_hidden_size; c++)
1543 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1544 * g * neu2[c];
1545 }
1546 for (a = 0; a < window_hidden_size; a++)
1547 for (b = 0; b < window_layer_size; b++)
1548 neu1e[b] += neu2e[a]
1549 * syn_window_hidden[a * window_layer_size + b];
1550 for (a = 0; a < window_hidden_size; a++)
1551 for (b = 0; b < window_layer_size; b++)
1552 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1553 * neu1[b];
1554 // hidden -> in
1555 for (a = 0; a < window * 2 + 1; a++)
1556 if (a != window) {
1557 c = sentence_position - window + a;
1558 if (c < 0)
1559 continue;
1560 if (c >= sentence_length)
1561 continue;
1562 last_word = sen[c];
1563 if (last_word == -1)
1564 continue;
1565 window_offset = a * layer1_size;
1566 if (a > window)
1567 window_offset -= layer1_size;
1568 for (c = 0; c < layer1_size; c++)
1569 syn0[c + last_word * layer1_size] += neu1e[c
1570 + window_offset];
1571 }
1572 }
1573 } else {
1574 printf("unknown type %i", type);
1575 exit(0);
1576 }
1577 sentence_position++;
1578 if (sentence_position >= sentence_length) {
1579 sentence_length = 0;
1580 continue;
1581 }
1582 }
1583 fclose(fi);
1584 free(neu1);
1585 free(neu1e);
1586 pthread_exit(NULL);
1587}
1588
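// For each word starting at rank cc, print the strongest collocate at every window position under
// the type-3 (structured skip-gram) model, plus summary statistics; requires a net read with -read-net.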
1589void ShowCollocations() {
1590 long a, b, c, d, window_offset, target, max_target=0, maxmax_target;
1591 real f, max_f, maxmax_f;
1592 real *target_sums;
1593 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1594
1595 for (d = cc; d < vocab_size; d++) {
1596 for (b = 0; b < vocab_size; b++)
1597 target_sums[b]=0;
1598 maxmax_f = -1;
1599 maxmax_target = 0;
1600 for (a = 0; a < window * 2 + 1; a++) {
1601 if (a != window) {
1602 max_f = -1;
1603 window_offset = a * layer1_size;
1604 if (a > window)
1605 window_offset -= layer1_size;
1606 for(target = 0; target < vocab_size; target ++) {
1607 if(target == d)
1608 continue;
1609 f = 0;
1610 for (c = 0; c < layer1_size; c++)
1611 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1612 if (f < -MAX_EXP)
1613 continue;
1614 else if (f > MAX_EXP)
1615 continue;
1616 else
1617 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1618 if(f > max_f) {
1619 max_f = f;
1620 max_target = target;
1621 }
1622 target_sums[target] += (1-target_sums[target]) * f;
1623 }
1624 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1625 if(max_f > maxmax_f) {
1626 maxmax_f = max_f;
1627 maxmax_target = max_target;
1628 }
1629 } else {
1630 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1631 }
1632 }
1633 max_f = -1;
1634 for (b = 0; b < vocab_size; b++) {
1635 if(target_sums[b] > max_f) {
1636 max_f = target_sums[b];
1637 max_target = b;
1638 }
1639 }
1640 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1641 vocab[max_target].word, max_f,
1642 vocab[maxmax_target].word, maxmax_f);
1643 }
1644}
1645
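// Top-level driver: build or read the vocabulary, initialize the net, optionally print collocations,
// run the training threads, then write word vectors or K-means classes.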
1646void TrainModel() {
1647 long a, b, c, d;
1648 FILE *fo;
1649 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1650 printf("Starting training using file %s\n", train_file);
1651 starting_alpha = alpha;
1652 if (read_vocab_file[0] != 0)
1653 ReadVocab();
1654 else
1655 LearnVocabFromTrainFile();
1656 if (save_vocab_file[0] != 0)
1657 SaveVocab();
1658 if (output_file[0] == 0)
1659 return;
1660 InitNet();
1661 if(cc > 0)
1662 ShowCollocations();
1663 if (negative > 0 || nce > 0)
1664 InitUnigramTable();
1665 if (negative_classes_file[0] != 0)
1666 InitClassUnigramTable();
1667 start = clock();
1668 for (a = 0; a < num_threads; a++)
1669 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1670 for (a = 0; a < num_threads; a++)
1671 pthread_join(pt[a], NULL);
1672 fo = fopen(output_file, "wb");
1673 if (classes == 0) {
1674 // Save the word vectors
1675 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1676 for (a = 0; a < vocab_size; a++) {
1677 fprintf(fo, "%s ", vocab[a].word);
1678 if (binary)
1679 for (b = 0; b < layer1_size; b++)
1680 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1681 else
1682 for (b = 0; b < layer1_size; b++)
1683 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1684 fprintf(fo, "\n");
1685 }
1686 } else {
1687 // Run K-means on the word vectors
1688 int clcn = classes, iter = 10, closeid;
1689 int *centcn = (int *) malloc(classes * sizeof(int));
1690 int *cl = (int *) calloc(vocab_size, sizeof(int));
1691 real closev, x;
1692 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1693 for (a = 0; a < vocab_size; a++)
1694 cl[a] = a % clcn;
1695 for (a = 0; a < iter; a++) {
1696 for (b = 0; b < clcn * layer1_size; b++)
1697 cent[b] = 0;
1698 for (b = 0; b < clcn; b++)
1699 centcn[b] = 1;
1700 for (c = 0; c < vocab_size; c++) {
1701 for (d = 0; d < layer1_size; d++)
1702 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1703 centcn[cl[c]]++;
1704 }
1705 for (b = 0; b < clcn; b++) {
1706 closev = 0;
1707 for (c = 0; c < layer1_size; c++) {
1708 cent[layer1_size * b + c] /= centcn[b];
1709 closev += cent[layer1_size * b + c]
1710 * cent[layer1_size * b + c];
1711 }
1712 closev = sqrt(closev);
1713 for (c = 0; c < layer1_size; c++)
1714 cent[layer1_size * b + c] /= closev;
1715 }
1716 for (c = 0; c < vocab_size; c++) {
1717 closev = -10;
1718 closeid = 0;
1719 for (d = 0; d < clcn; d++) {
1720 x = 0;
1721 for (b = 0; b < layer1_size; b++)
1722 x += cent[layer1_size * d + b]
1723 * syn0[c * layer1_size + b];
1724 if (x > closev) {
1725 closev = x;
1726 closeid = d;
1727 }
1728 }
1729 cl[c] = closeid;
1730 }
1731 }
1732 // Save the K-means classes
1733 for (a = 0; a < vocab_size; a++)
1734 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1735 free(centcn);
1736 free(cent);
1737 free(cl);
1738 }
1739 fclose(fo);
1740 if (save_net_file[0] != 0)
1741 SaveNet();
1742}
1743
1744int ArgPos(char *str, int argc, char **argv) {
1745 int a;
1746 for (a = 1; a < argc; a++)
1747 if (!strcmp(str, argv[a])) {
1748 if (a == argc - 1) {
1749 printf("Argument missing for %s\n", str);
1750 exit(1);
1751 }
1752 return a;
1753 }
1754 return -1;
1755}
1756
1757int main(int argc, char **argv) {
1758 int i;
1759 if (argc == 1) {
1760 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1761 printf("Options:\n");
1762 printf("Parameters for training:\n");
1763 printf("\t-train <file>\n");
1764 printf("\t\tUse text data from <file> to train the model\n");
1765 printf("\t-output <file>\n");
1766 printf(
1767 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1768 printf("\t-size <int>\n");
1769 printf("\t\tSet size of word vectors; default is 100\n");
1770 printf("\t-window <int>\n");
1771 printf("\t\tSet max skip length between words; default is 5\n");
1772 printf("\t-sample <float>\n");
1773 printf(
1774 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1775 printf(
1776 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1777 printf("\t-hs <int>\n");
1778 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1779 printf("\t-negative <int>\n");
1780 printf(
1781 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1782 printf("\t-negative-classes <file>\n");
1783 printf("\t\tNegative classes to sample from\n");
1784 printf("\t-nce <int>\n");
1785 printf(
1786 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1787 printf("\t-threads <int>\n");
1788 printf("\t\tUse <int> threads (default 12)\n");
1789 printf("\t-iter <int>\n");
1790 printf("\t\tRun more training iterations (default 5)\n");
1791 printf("\t-min-count <int>\n");
1792 printf(
1793 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1794 printf("\t-alpha <float>\n");
1795 printf(
1796 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1797 printf("\t-classes <int>\n");
1798 printf(
1799 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1800 printf("\t-debug <int>\n");
1801 printf(
1802 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1803 printf("\t-binary <int>\n");
1804 printf(
1805 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1806 printf("\t-save-vocab <file>\n");
1807 printf("\t\tThe vocabulary will be saved to <file>\n");
1808 printf("\t-read-vocab <file>\n");
1809 printf(
1810 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1811 printf("\t-read-net <file>\n");
1812 printf(
1813 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1814 printf("\t-save-net <file>\n");
1815 printf("\t\tThe net parameters will be saved to <file>\n");
1816 printf("\t-show-cc <int>\n");
1817 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
1818 printf("\t-type <int>\n");
1819 printf(
1820 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1821 printf("\t-cap <int>\n");
1822 printf(
1823 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1824 printf("\nExamples:\n");
1825 printf(
1826 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1827 return 0;
1828 }
1829 output_file[0] = 0;
1830 save_vocab_file[0] = 0;
1831 read_vocab_file[0] = 0;
1832 save_net_file[0] = 0;
1833 read_net_file[0] = 0;
1834 negative_classes_file[0] = 0;
1835 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1836 layer1_size = atoi(argv[i + 1]);
1837 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1838 strcpy(train_file, argv[i + 1]);
1839 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1840 strcpy(save_vocab_file, argv[i + 1]);
1841 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1842 strcpy(read_vocab_file, argv[i + 1]);
1843 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1844 strcpy(save_net_file, argv[i + 1]);
1845 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1846 strcpy(read_net_file, argv[i + 1]);
1847 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1848 debug_mode = atoi(argv[i + 1]);
1849 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1850 binary = atoi(argv[i + 1]);
1851 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
1852 cc = atoi(argv[i + 1]);
1853 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1854 type = atoi(argv[i + 1]);
1855 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1856 strcpy(output_file, argv[i + 1]);
1857 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1858 window = atoi(argv[i + 1]);
1859 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1860 sample = atof(argv[i + 1]);
1861 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1862 hs = atoi(argv[i + 1]);
1863 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1864 negative = atoi(argv[i + 1]);
1865 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1866 strcpy(negative_classes_file, argv[i + 1]);
1867 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1868 nce = atoi(argv[i + 1]);
1869 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1870 num_threads = atoi(argv[i + 1]);
1871 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1872 iter = atoi(argv[i + 1]);
1873 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1874 min_count = atoi(argv[i + 1]);
1875 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1876 classes = atoi(argv[i + 1]);
1877 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1878 cap = atoi(argv[i + 1]);
1879 if (type == 0 || type == 2 || type == 4)
1880 alpha = 0.05;
1881 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1882 alpha = atof(argv[i + 1]);
1883 vocab = (struct vocab_word *) calloc(vocab_max_size,
1884 sizeof(struct vocab_word));
1885 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1886 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1887 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1888 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1889 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1890 }
1891 TrainModel();
1892 return 0;
1893}
1894