1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
27const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
40struct vocab_word *vocab;
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
42 num_threads = 12, min_reduce = 1;
43int *vocab_hash;
44long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
45long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
46 classes = 0;
47real alpha = 0.025, starting_alpha, sample = 1e-3;
48real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
49clock_t start;
50
51real *syn1_window, *syn1neg_window, *syn1nce_window;
52int w_offset, window_layer_size;
53
54int window_hidden_size = 500;
55real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
56 *syn_hidden_word_nce;
57
58int hs = 0, negative = 5;
59const int table_size = 1e8;
60int *table;
61
62long cc = 0;
63
64// contrastive negative sampling
65char negative_classes_file[MAX_STRING];
66int *word_to_group;
67int *group_to_table; //group_size*table_size
68int class_number;
69
70//nce
71real* noise_distribution;
72int nce = 0;
73
74//param caps
75real CAP_VALUE = 50;
76int cap = 0;
77
78void capParam(real* array, int index) {
79 if (array[index] > CAP_VALUE)
80 array[index] = CAP_VALUE;
81 else if (array[index] < -CAP_VALUE)
82 array[index] = -CAP_VALUE;
83}
84
85real hardTanh(real x) {
86 if (x >= 1) {
87 return 1;
88 } else if (x <= -1) {
89 return -1;
90 } else {
91 return x;
92 }
93}
94
95real dHardTanh(real x, real g) {
96 if (x > 1 && g > 0) {
97 return 0;
98 }
99 if (x < -1 && g < 0) {
100 return 0;
101 }
102 return 1;
103}
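// Illustrative note: these two helpers are only used by the senna-style
// architecture (type 4). hardTanh clips a hidden activation to [-1, 1], and
// dHardTanh gates the gradient, returning 0 when the unit is saturated in the
// direction it would be pushed further, e.g. dHardTanh(1.2, 0.3) == 0 while
// dHardTanh(1.2, -0.3) == 1.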
104
105void InitUnigramTable() {
106 int a, i;
107 long long train_words_pow = 0;
108 real d1, power = 0.75;
109 table = (int *) malloc(table_size * sizeof(int));
110 for (a = 0; a < vocab_size; a++)
111 train_words_pow += pow(vocab[a].cn, power);
112 i = 0;
113 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
114 for (a = 0; a < table_size; a++) {
115 table[a] = i;
116 if (a / (real) table_size > d1) {
117 i++;
118 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
119 }
120 if (i >= vocab_size)
121 i = vocab_size - 1;
122 }
123
124 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
125 for (a = 0; a < vocab_size; a++)
126 noise_distribution[a] = pow(vocab[a].cn, power)
127 / (real) train_words_pow;
128}
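// Usage sketch (illustrative, mirrors the sampling code in TrainModelThread):
// a negative sample is drawn in proportion to cn^0.75 by indexing the table
// with a pseudo-random number, e.g.
//   next_random = next_random * (unsigned long long) 25214903917 + 11;
//   target = table[(next_random >> 16) % table_size];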
129
130// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
131void ReadWord(char *word, FILE *fin) {
132 int a = 0, ch;
133 while (!feof(fin)) {
134 ch = fgetc(fin);
135 if (ch == 13)
136 continue;
137 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
138 if (a > 0) {
139 if (ch == '\n')
140 ungetc(ch, fin);
141 break;
142 }
143 if (ch == '\n') {
144 strcpy(word, (char *) "</s>");
145 return;
146 } else
147 continue;
148 }
149 word[a] = ch;
150 a++;
151 if (a >= MAX_STRING - 1)
152 a--; // Truncate words that are too long
153 }
154 word[a] = 0;
155}
156
157// Returns hash value of a word
158int GetWordHash(char *word) {
159 unsigned long long a, hash = 0;
160 for (a = 0; a < strlen(word); a++)
161 hash = hash * 257 + word[a];
162 hash = hash % vocab_hash_size;
163 return hash;
164}
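// Collisions are resolved by open addressing with linear probing: SearchVocab
// and AddWordToVocab below both step through vocab_hash with
//   hash = (hash + 1) % vocab_hash_size;
// until they hit the requested word or an empty (-1) slot.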
165
166// Returns position of a word in the vocabulary; if the word is not found, returns -1
167int SearchVocab(char *word) {
168 unsigned int hash = GetWordHash(word);
169 while (1) {
170 if (vocab_hash[hash] == -1)
171 return -1;
172 if (!strcmp(word, vocab[vocab_hash[hash]].word))
173 return vocab_hash[hash];
174 hash = (hash + 1) % vocab_hash_size;
175 }
176 return -1;
177}
178
179// Reads a word and returns its index in the vocabulary
180int ReadWordIndex(FILE *fin) {
181 char word[MAX_STRING];
182 ReadWord(word, fin);
183 if (feof(fin))
184 return -1;
185 return SearchVocab(word);
186}
187
188// Adds a word to the vocabulary
189int AddWordToVocab(char *word) {
190 unsigned int hash, length = strlen(word) + 1;
191 if (length > MAX_STRING)
192 length = MAX_STRING;
193 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
194 strcpy(vocab[vocab_size].word, word);
195 vocab[vocab_size].cn = 0;
196 vocab_size++;
197 // Reallocate memory if needed
198 if (vocab_size + 2 >= vocab_max_size) {
199 vocab_max_size += 1000;
200 vocab = (struct vocab_word *) realloc(vocab,
201 vocab_max_size * sizeof(struct vocab_word));
202 }
203 hash = GetWordHash(word);
204 while (vocab_hash[hash] != -1)
205 hash = (hash + 1) % vocab_hash_size;
206 vocab_hash[hash] = vocab_size - 1;
207 return vocab_size - 1;
208}
209
210// Used later for sorting by word counts
211int VocabCompare(const void *a, const void *b) {
212 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
213}
214
215// Sorts the vocabulary by frequency using word counts
216void SortVocab() {
217 int a, size;
218 unsigned int hash;
219 // Sort the vocabulary and keep </s> at the first position
220 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
221 for (a = 0; a < vocab_hash_size; a++)
222 vocab_hash[a] = -1;
223 size = vocab_size;
224 train_words = 0;
225 for (a = 0; a < size; a++) {
226 // Words occurring less than min_count times will be discarded from the vocab
227 if ((vocab[a].cn < min_count) && (a != 0)) {
228 vocab_size--;
229 free(vocab[a].word);
230 } else {
231 // Hash will be recomputed, as it is no longer valid after sorting
232 hash = GetWordHash(vocab[a].word);
233 while (vocab_hash[hash] != -1)
234 hash = (hash + 1) % vocab_hash_size;
235 vocab_hash[hash] = a;
236 train_words += vocab[a].cn;
237 }
238 }
239 vocab = (struct vocab_word *) realloc(vocab,
240 (vocab_size + 1) * sizeof(struct vocab_word));
241 // Allocate memory for the binary tree construction
242 for (a = 0; a < vocab_size; a++) {
243 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
244 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
245 }
246}
247
248// Reduces the vocabulary by removing infrequent tokens
249void ReduceVocab() {
250 int a, b = 0;
251 unsigned int hash;
252 for (a = 0; a < vocab_size; a++)
253 if (vocab[a].cn > min_reduce) {
254 vocab[b].cn = vocab[a].cn;
255 vocab[b].word = vocab[a].word;
256 b++;
257 } else
258 free(vocab[a].word);
259 vocab_size = b;
260 for (a = 0; a < vocab_hash_size; a++)
261 vocab_hash[a] = -1;
262 for (a = 0; a < vocab_size; a++) {
263 // Hash will be recomputed, as it is no longer valid
264 hash = GetWordHash(vocab[a].word);
265 while (vocab_hash[hash] != -1)
266 hash = (hash + 1) % vocab_hash_size;
267 vocab_hash[hash] = a;
268 }
269 fflush(stdout);
270 min_reduce++;
271}
272
273// Create binary Huffman tree using the word counts
274// Frequent words will have short unique binary codes
275void CreateBinaryTree() {
276 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
277 char code[MAX_CODE_LENGTH];
278 long long *count = (long long *) calloc(vocab_size * 2 + 1,
279 sizeof(long long));
280 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
281 sizeof(long long));
282 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
283 sizeof(long long));
284 for (a = 0; a < vocab_size; a++)
285 count[a] = vocab[a].cn;
286 for (a = vocab_size; a < vocab_size * 2; a++)
287 count[a] = 1e15;
288 pos1 = vocab_size - 1;
289 pos2 = vocab_size;
290 // The following algorithm constructs the Huffman tree by adding one node at a time
291 for (a = 0; a < vocab_size - 1; a++) {
292 // First, find two smallest nodes 'min1, min2'
293 if (pos1 >= 0) {
294 if (count[pos1] < count[pos2]) {
295 min1i = pos1;
296 pos1--;
297 } else {
298 min1i = pos2;
299 pos2++;
300 }
301 } else {
302 min1i = pos2;
303 pos2++;
304 }
305 if (pos1 >= 0) {
306 if (count[pos1] < count[pos2]) {
307 min2i = pos1;
308 pos1--;
309 } else {
310 min2i = pos2;
311 pos2++;
312 }
313 } else {
314 min2i = pos2;
315 pos2++;
316 }
317 count[vocab_size + a] = count[min1i] + count[min2i];
318 parent_node[min1i] = vocab_size + a;
319 parent_node[min2i] = vocab_size + a;
320 binary[min2i] = 1;
321 }
322 // Now assign binary code to each vocabulary word
323 for (a = 0; a < vocab_size; a++) {
324 b = a;
325 i = 0;
326 while (1) {
327 code[i] = binary[b];
328 point[i] = b;
329 i++;
330 b = parent_node[b];
331 if (b == vocab_size * 2 - 2)
332 break;
333 }
334 vocab[a].codelen = i;
335 vocab[a].point[0] = vocab_size - 2;
336 for (b = 0; b < i; b++) {
337 vocab[a].code[i - b - 1] = code[b];
338 vocab[a].point[i - b] = point[b] - vocab_size;
339 }
340 }
341 free(count);
342 free(binary);
343 free(parent_node);
344}
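// Worked example (illustrative): for counts {"the": 5, "of": 2, "cat": 1} the
// two smallest nodes "of" and "cat" are merged first (combined count 3), which
// is then merged with "the" (count 8); "the" ends up with a 1-bit code and
// "of"/"cat" with 2-bit codes, so more frequent words get shorter codes.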
345
346void LearnVocabFromTrainFile() {
347 char word[MAX_STRING];
348 FILE *fin;
349 long long a, i;
350 for (a = 0; a < vocab_hash_size; a++)
351 vocab_hash[a] = -1;
352 fin = fopen(train_file, "rb");
353 if (fin == NULL) {
354 printf("ERROR: training data file not found!\n");
355 exit(1);
356 }
357 vocab_size = 0;
358 AddWordToVocab((char *) "</s>");
359 while (1) {
360 ReadWord(word, fin);
361 if (feof(fin))
362 break;
363 train_words++;
364 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
365 printf("%lldK%c", train_words / 1000, 13);
366 fflush(stdout);
367 }
368 i = SearchVocab(word);
369 if (i == -1) {
370 a = AddWordToVocab(word);
371 vocab[a].cn = 1;
372 } else
373 vocab[i].cn++;
374 if (vocab_size > vocab_hash_size * 0.7)
375 ReduceVocab();
376 }
377 SortVocab();
378 if (debug_mode > 0) {
379 printf("Vocab size: %lld\n", vocab_size);
380 printf("Words in train file: %lld\n", train_words);
381 }
382 file_size = ftell(fin);
383 fclose(fin);
384}
385
386void SaveVocab() {
387 long long i;
388 FILE *fo = fopen(save_vocab_file, "wb");
389 for (i = 0; i < vocab_size; i++)
390 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
391 fclose(fo);
392}
393
394void ReadVocab() {
395 long long a, i = 0;
396 char c;
397 char word[MAX_STRING];
398 FILE *fin = fopen(read_vocab_file, "rb");
399 if (fin == NULL) {
400 printf("Vocabulary file not found\n");
401 exit(1);
402 }
403 for (a = 0; a < vocab_hash_size; a++)
404 vocab_hash[a] = -1;
405 vocab_size = 0;
406 while (1) {
407 ReadWord(word, fin);
408 if (feof(fin))
409 break;
410 a = AddWordToVocab(word);
411 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
412 i++;
413 }
414 SortVocab();
415 if (debug_mode > 0) {
416 printf("Vocab size: %lld\n", vocab_size);
417 printf("Words in train file: %lld\n", train_words);
418 }
419 fin = fopen(train_file, "rb");
420 if (fin == NULL) {
421 printf("ERROR: training data file not found!\n");
422 exit(1);
423 }
424 fseek(fin, 0, SEEK_END);
425 file_size = ftell(fin);
426 fclose(fin);
427}
428
429void InitClassUnigramTable() {
430 long long a, c;
431 printf("loading class unigrams \n");
432 FILE *fin = fopen(negative_classes_file, "rb");
433 if (fin == NULL) {
434 printf("ERROR: class file not found!\n");
435 exit(1);
436 }
437 word_to_group = (int *) malloc(vocab_size * sizeof(int));
438 for (a = 0; a < vocab_size; a++)
439 word_to_group[a] = -1;
440 char class[MAX_STRING];
441 char prev_class[MAX_STRING];
442 prev_class[0] = 0;
443 char word[MAX_STRING];
444 class_number = -1;
445 while (1) {
446 if (feof(fin))
447 break;
448 ReadWord(class, fin);
449 ReadWord(word, fin);
450 int word_index = SearchVocab(word);
451 if (word_index != -1) {
452 if (strcmp(class, prev_class) != 0) {
453 class_number++;
454 strcpy(prev_class, class);
455 }
456 word_to_group[word_index] = class_number;
457 }
458 ReadWord(word, fin);
459 }
460 class_number++;
461 fclose(fin);
462
463 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
464 long long train_words_pow = 0;
465 real d1, power = 0.75;
466
467 for (c = 0; c < class_number; c++) {
468 long long offset = c * table_size;
469 train_words_pow = 0;
470 for (a = 0; a < vocab_size; a++)
471 if (word_to_group[a] == c)
472 train_words_pow += pow(vocab[a].cn, power);
473 int i = 0;
474 while (i < vocab_size && word_to_group[i] != c)
475 i++;
476 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
477 for (a = 0; a < table_size; a++) {
478 //printf("index %lld , word %d\n", a, i);
479 group_to_table[offset + a] = i;
480 if (a / (real) table_size > d1) {
481 i++;
482 while (i < vocab_size && word_to_group[i] != c)
483 i++;
484 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
485 }
486 if (i >= vocab_size)
487 while (word_to_group[i] != c && i >= 0)
488 i--;
489 }
490 }
491}
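// Usage sketch (illustrative): when -negative-classes is given, the training
// threads draw negatives from the unigram table of the current word's own
// class, exactly as in
//   target = group_to_table[word_to_group[word] * table_size
//           + (next_random >> 16) % table_size];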
492
493void SaveNet() {
494 if(type != 3 || negative <= 0) {
495 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
496 return;
497 }
498
499 FILE *fnet = fopen(save_net_file, "wb");
500 if (fnet == NULL) {
501 printf("Net parameter file not found\n");
502 exit(1);
503 }
504 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
505 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
506 fclose(fnet);
507}
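// File layout written above and expected back by -read-net in InitNet:
// vocab_size * layer1_size reals for syn0, immediately followed by
// vocab_size * window_layer_size reals for syn1neg_window, with no header.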
508
509void InitNet() {
510 long long a, b;
511 unsigned long long next_random = 1;
512 long long read;
513
514 window_layer_size = layer1_size * window * 2;
515 a = posix_memalign((void **) &syn0, 128,
516 (long long) vocab_size * layer1_size * sizeof(real));
517 if (syn0 == NULL) {
518 printf("Memory allocation failed\n");
519 exit(1);
520 }
521
522 if (hs) {
523 a = posix_memalign((void **) &syn1, 128,
524 (long long) vocab_size * layer1_size * sizeof(real));
525 if (syn1 == NULL) {
526 printf("Memory allocation failed\n");
527 exit(1);
528 }
529 a = posix_memalign((void **) &syn1_window, 128,
530 (long long) vocab_size * window_layer_size * sizeof(real));
531 if (syn1_window == NULL) {
532 printf("Memory allocation failed\n");
533 exit(1);
534 }
535 a = posix_memalign((void **) &syn_hidden_word, 128,
536 (long long) vocab_size * window_hidden_size * sizeof(real));
537 if (syn_hidden_word == NULL) {
538 printf("Memory allocation failed\n");
539 exit(1);
540 }
541
542 for (a = 0; a < vocab_size; a++)
543 for (b = 0; b < layer1_size; b++)
544 syn1[a * layer1_size + b] = 0;
545 for (a = 0; a < vocab_size; a++)
546 for (b = 0; b < window_layer_size; b++)
547 syn1_window[a * window_layer_size + b] = 0;
548 for (a = 0; a < vocab_size; a++)
549 for (b = 0; b < window_hidden_size; b++)
550 syn_hidden_word[a * window_hidden_size + b] = 0;
551 }
552 if (negative > 0) {
553 if(type == 0) {
554 a = posix_memalign((void **) &syn1neg, 128,
555 (long long) vocab_size * layer1_size * sizeof(real));
556 if (syn1neg == NULL) {
557 printf("Memory allocation failed\n");
558 exit(1);
559 }
560 for (a = 0; a < vocab_size; a++)
561 for (b = 0; b < layer1_size; b++)
562 syn1neg[a * layer1_size + b] = 0;
563 } else if (type == 3) {
564 a = posix_memalign((void **) &syn1neg_window, 128,
565 (long long) vocab_size * window_layer_size * sizeof(real));
566 if (syn1neg_window == NULL) {
567 printf("Memory allocation failed\n");
568 exit(1);
569 }
570 for (a = 0; a < vocab_size; a++)
571 for (b = 0; b < window_layer_size; b++)
572 syn1neg_window[a * window_layer_size + b] = 0;
573 } else if (type == 4) {
574 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
575 (long long) vocab_size * window_hidden_size * sizeof(real));
576 if (syn_hidden_word_neg == NULL) {
577 printf("Memory allocation failed\n");
578 exit(1);
579 }
580 for (a = 0; a < vocab_size; a++)
581 for (b = 0; b < window_hidden_size; b++)
582 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
583 }
584 }
585 if (nce > 0) {
586 a = posix_memalign((void **) &syn1nce, 128,
587 (long long) vocab_size * layer1_size * sizeof(real));
588 if (syn1nce == NULL) {
589 printf("Memory allocation failed\n");
590 exit(1);
591 }
592 a = posix_memalign((void **) &syn1nce_window, 128,
593 (long long) vocab_size * window_layer_size * sizeof(real));
594 if (syn1nce_window == NULL) {
595 printf("Memory allocation failed\n");
596 exit(1);
597 }
598 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
599 (long long) vocab_size * window_hidden_size * sizeof(real));
600 if (syn_hidden_word_nce == NULL) {
601 printf("Memory allocation failed\n");
602 exit(1);
603 }
604
605 for (a = 0; a < vocab_size; a++)
606 for (b = 0; b < layer1_size; b++)
607 syn1nce[a * layer1_size + b] = 0;
608 for (a = 0; a < vocab_size; a++)
609 for (b = 0; b < window_layer_size; b++)
610 syn1nce_window[a * window_layer_size + b] = 0;
611 for (a = 0; a < vocab_size; a++)
612 for (b = 0; b < window_hidden_size; b++)
613 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
614 }
615
616 if(type == 4) {
617 a = posix_memalign((void **) &syn_window_hidden, 128,
618 window_hidden_size * window_layer_size * sizeof(real));
619 if (syn_window_hidden == NULL) {
620 printf("Memory allocation failed\n");
621 exit(1);
622 }
623 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
624 next_random = next_random * (unsigned long long) 25214903917 + 11;
625 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
626 - 0.5) / (window_hidden_size * window_layer_size);
627 }
628 }
629
630 if (read_net_file[0] == 0) {
631 for (a = 0; a < vocab_size; a++)
632 for (b = 0; b < layer1_size; b++) {
633 next_random = next_random * (unsigned long long) 25214903917
634 + 11;
635 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
636 / (real) 65536) - 0.5) / layer1_size;
637 }
638 } else if(type == 3 && negative > 0) {
639 FILE *fnet = fopen(read_net_file, "rb");
640 if (fnet == NULL) {
641 printf("Net parameter file not found\n");
642 exit(1);
643 }
644 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
645 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
646 if(read != vocab_size * layer1_size) {
647 fprintf(stderr, "read-net failed %lld\n", read);
648 exit(-1);
649 }
650 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
651 if(read != (long long) vocab_size * window_layer_size) {
652 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
653 (long long) vocab_size * window_layer_size);
654 exit(-1);
655 }
656 fgetc(fnet);
657 if(!feof(fnet)) {
658 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
659 exit(-1);
660 }
661 fclose(fnet);
662 } else {
663 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
664 exit(-1);
665 }
666
667 CreateBinaryTree();
668}
669
670void *TrainModelThread(void *id) {
671 long long a, b, d, cw, word, last_word, sentence_length = 0,
672 sentence_position = 0;
673 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
674 long long l1, l2, c, target, label, local_iter = iter;
675 unsigned long long next_random = (long long) id;
676 real f, g;
677 clock_t now;
678 int input_len_1 = layer1_size;
679 int window_offset = -1;
680 if (type == 2 || type == 4) {
681 input_len_1 = window_layer_size;
682 }
683 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
684 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
685
686 int input_len_2 = 0;
687 if (type == 4) {
688 input_len_2 = window_hidden_size;
689 }
690 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
691 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
692
693 FILE *fi = fopen(train_file, "rb");
694 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
695 while (1) {
696 if (word_count - last_word_count > 10000) {
697 word_count_actual += word_count - last_word_count;
698 last_word_count = word_count;
699 if ((debug_mode > 1)) {
700 now = clock();
701 printf(
702 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
703 13, alpha,
704 word_count_actual / (real) (iter * train_words + 1)
705 * 100,
706 word_count_actual
707 / ((real) (now - start + 1)
708 / (real) CLOCKS_PER_SEC * 1000));
709 fflush(stdout);
710 }
711 alpha = starting_alpha
712 * (1 - word_count_actual / (real) (iter * train_words + 1));
713 if (alpha < starting_alpha * 0.0001)
714 alpha = starting_alpha * 0.0001;
715 }
716 if (sentence_length == 0) {
717 while (1) {
718 word = ReadWordIndex(fi);
719 if (feof(fi))
720 break;
721 if (word == -1)
722 continue;
723 word_count++;
724 if (word == 0)
725 break;
726 // The subsampling randomly discards frequent words while keeping the ranking the same
727 if (sample > 0) {
728 real ran = (sqrt(vocab[word].cn / (sample * train_words))
729 + 1) * (sample * train_words) / vocab[word].cn;
730 next_random = next_random * (unsigned long long) 25214903917
731 + 11;
732 if (ran < (next_random & 0xFFFF) / (real) 65536)
733 continue;
734 }
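// Illustrative numbers (not from the original comments): with sample = 1e-3
// and train_words = 1e8, a word with cn = 1e6 (1% of all tokens) gets
// ran = (sqrt(10) + 1) * 0.1, roughly 0.42, i.e. it is kept with about 42%
// probability, while words rarer than about 2.6 * sample * train_words
// occurrences always have ran >= 1 and are kept.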
735 sen[sentence_length] = word;
736 sentence_length++;
737 if (sentence_length >= MAX_SENTENCE_LENGTH)
738 break;
739 }
740 sentence_position = 0;
741 }
742 if (feof(fi) || (word_count > train_words / num_threads)) {
743 word_count_actual += word_count - last_word_count;
744 local_iter--;
745 if (local_iter == 0)
746 break;
747 word_count = 0;
748 last_word_count = 0;
749 sentence_length = 0;
750 fseek(fi, file_size / (long long) num_threads * (long long) id,
751 SEEK_SET);
752 continue;
753 }
754 word = sen[sentence_position];
755 if (word == -1)
756 continue;
757 for (c = 0; c < input_len_1; c++)
758 neu1[c] = 0;
759 for (c = 0; c < input_len_1; c++)
760 neu1e[c] = 0;
761 for (c = 0; c < input_len_2; c++)
762 neu2[c] = 0;
763 for (c = 0; c < input_len_2; c++)
764 neu2e[c] = 0;
765 next_random = next_random * (unsigned long long) 25214903917 + 11;
766 b = next_random % window;
767 if (type == 0) { //train the cbow architecture
768 // in -> hidden
769 cw = 0;
770 for (a = b; a < window * 2 + 1 - b; a++)
771 if (a != window) {
772 c = sentence_position - window + a;
773 if (c < 0)
774 continue;
775 if (c >= sentence_length)
776 continue;
777 last_word = sen[c];
778 if (last_word == -1)
779 continue;
780 for (c = 0; c < layer1_size; c++)
781 neu1[c] += syn0[c + last_word * layer1_size];
782 cw++;
783 }
784 if (cw) {
785 for (c = 0; c < layer1_size; c++)
786 neu1[c] /= cw;
787 if (hs)
788 for (d = 0; d < vocab[word].codelen; d++) {
789 f = 0;
790 l2 = vocab[word].point[d] * layer1_size;
791 // Propagate hidden -> output
792 for (c = 0; c < layer1_size; c++)
793 f += neu1[c] * syn1[c + l2];
794 if (f <= -MAX_EXP)
795 continue;
796 else if (f >= MAX_EXP)
797 continue;
798 else
799 f = expTable[(int) ((f + MAX_EXP)
800 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
801 // 'g' is the gradient multiplied by the learning rate
802 g = (1 - vocab[word].code[d] - f) * alpha;
803 // Propagate errors output -> hidden
804 for (c = 0; c < layer1_size; c++)
805 neu1e[c] += g * syn1[c + l2];
806 // Learn weights hidden -> output
807 for (c = 0; c < layer1_size; c++)
808 syn1[c + l2] += g * neu1[c];
809 if (cap == 1)
810 for (c = 0; c < layer1_size; c++)
811 capParam(syn1, c + l2);
812 }
813 // NEGATIVE SAMPLING
814 if (negative > 0)
815 for (d = 0; d < negative + 1; d++) {
816 if (d == 0) {
817 target = word;
818 label = 1;
819 } else {
820 next_random = next_random
821 * (unsigned long long) 25214903917 + 11;
822 if (word_to_group != NULL
823 && word_to_group[word] != -1) {
824 target = word;
825 while (target == word) {
826 target = group_to_table[word_to_group[word]
827 * table_size
828 + (next_random >> 16) % table_size];
829 next_random = next_random
830 * (unsigned long long) 25214903917
831 + 11;
832 }
833 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
834 } else {
835 target =
836 table[(next_random >> 16) % table_size];
837 }
838 if (target == 0)
839 target = next_random % (vocab_size - 1) + 1;
840 if (target == word)
841 continue;
842 label = 0;
843 }
844 l2 = target * layer1_size;
845 f = 0;
846 for (c = 0; c < layer1_size; c++)
847 f += neu1[c] * syn1neg[c + l2];
848 if (f > MAX_EXP)
849 g = (label - 1) * alpha;
850 else if (f < -MAX_EXP)
851 g = (label - 0) * alpha;
852 else
853 g = (label
854 - expTable[(int) ((f + MAX_EXP)
855 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
856 * alpha;
857 for (c = 0; c < layer1_size; c++)
858 neu1e[c] += g * syn1neg[c + l2];
859 for (c = 0; c < layer1_size; c++)
860 syn1neg[c + l2] += g * neu1[c];
861 if (cap == 1)
862 for (c = 0; c < layer1_size; c++)
863 capParam(syn1neg, c + l2);
864 }
865 // Noise Contrastive Estimation
866 if (nce > 0)
867 for (d = 0; d < nce + 1; d++) {
868 if (d == 0) {
869 target = word;
870 label = 1;
871 } else {
872 next_random = next_random
873 * (unsigned long long) 25214903917 + 11;
874 if (word_to_group != NULL
875 && word_to_group[word] != -1) {
876 target = word;
877 while (target == word) {
878 target = group_to_table[word_to_group[word]
879 * table_size
880 + (next_random >> 16) % table_size];
881 next_random = next_random
882 * (unsigned long long) 25214903917
883 + 11;
884 }
885 } else {
886 target =
887 table[(next_random >> 16) % table_size];
888 }
889 if (target == 0)
890 target = next_random % (vocab_size - 1) + 1;
891 if (target == word)
892 continue;
893 label = 0;
894 }
895 l2 = target * layer1_size;
896 f = 0;
897
898 for (c = 0; c < layer1_size; c++)
899 f += neu1[c] * syn1nce[c + l2];
900 if (f > MAX_EXP)
901 g = (label - 1) * alpha;
902 else if (f < -MAX_EXP)
903 g = (label - 0) * alpha;
904 else {
905 f = exp(f);
906 g =
907 (label
908 - f
909 / (noise_distribution[target]
910 * nce + f)) * alpha;
911 }
912 for (c = 0; c < layer1_size; c++)
913 neu1e[c] += g * syn1nce[c + l2];
914 for (c = 0; c < layer1_size; c++)
915 syn1nce[c + l2] += g * neu1[c];
916 if (cap == 1)
917 for (c = 0; c < layer1_size; c++)
918 capParam(syn1nce, c + l2);
919 }
920 // hidden -> in
921 for (a = b; a < window * 2 + 1 - b; a++)
922 if (a != window) {
923 c = sentence_position - window + a;
924 if (c < 0)
925 continue;
926 if (c >= sentence_length)
927 continue;
928 last_word = sen[c];
929 if (last_word == -1)
930 continue;
931 for (c = 0; c < layer1_size; c++)
932 syn0[c + last_word * layer1_size] += neu1e[c];
933 }
934 }
935 } else if (type == 1) { //train skip-gram
936 for (a = b; a < window * 2 + 1 - b; a++)
937 if (a != window) {
938 c = sentence_position - window + a;
939 if (c < 0)
940 continue;
941 if (c >= sentence_length)
942 continue;
943 last_word = sen[c];
944 if (last_word == -1)
945 continue;
946 l1 = last_word * layer1_size;
947 for (c = 0; c < layer1_size; c++)
948 neu1e[c] = 0;
949 // HIERARCHICAL SOFTMAX
950 if (hs)
951 for (d = 0; d < vocab[word].codelen; d++) {
952 f = 0;
953 l2 = vocab[word].point[d] * layer1_size;
954 // Propagate hidden -> output
955 for (c = 0; c < layer1_size; c++)
956 f += syn0[c + l1] * syn1[c + l2];
957 if (f <= -MAX_EXP)
958 continue;
959 else if (f >= MAX_EXP)
960 continue;
961 else
962 f = expTable[(int) ((f + MAX_EXP)
963 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
964 // 'g' is the gradient multiplied by the learning rate
965 g = (1 - vocab[word].code[d] - f) * alpha;
966 // Propagate errors output -> hidden
967 for (c = 0; c < layer1_size; c++)
968 neu1e[c] += g * syn1[c + l2];
969 // Learn weights hidden -> output
970 for (c = 0; c < layer1_size; c++)
971 syn1[c + l2] += g * syn0[c + l1];
972 if (cap == 1)
973 for (c = 0; c < layer1_size; c++)
974 capParam(syn1, c + l2);
975 }
976 // NEGATIVE SAMPLING
977 if (negative > 0)
978 for (d = 0; d < negative + 1; d++) {
979 if (d == 0) {
980 target = word;
981 label = 1;
982 } else {
983 next_random = next_random
984 * (unsigned long long) 25214903917 + 11;
985 if (word_to_group != NULL
986 && word_to_group[word] != -1) {
987 target = word;
988 while (target == word) {
989 target =
990 group_to_table[word_to_group[word]
991 * table_size
992 + (next_random >> 16)
993 % table_size];
994 next_random =
995 next_random
996 * (unsigned long long) 25214903917
997 + 11;
998 }
999 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1000 } else {
1001 target = table[(next_random >> 16)
1002 % table_size];
1003 }
1004 if (target == 0)
1005 target = next_random % (vocab_size - 1) + 1;
1006 if (target == word)
1007 continue;
1008 label = 0;
1009 }
1010 l2 = target * layer1_size;
1011 f = 0;
1012 for (c = 0; c < layer1_size; c++)
1013 f += syn0[c + l1] * syn1neg[c + l2];
1014 if (f > MAX_EXP)
1015 g = (label - 1) * alpha;
1016 else if (f < -MAX_EXP)
1017 g = (label - 0) * alpha;
1018 else
1019 g =
1020 (label
1021 - expTable[(int) ((f + MAX_EXP)
1022 * (EXP_TABLE_SIZE
1023 / MAX_EXP / 2))])
1024 * alpha;
1025 for (c = 0; c < layer1_size; c++)
1026 neu1e[c] += g * syn1neg[c + l2];
1027 for (c = 0; c < layer1_size; c++)
1028 syn1neg[c + l2] += g * syn0[c + l1];
1029 if (cap == 1)
1030 for (c = 0; c < layer1_size; c++)
1031 capParam(syn1neg, c + l2);
1032 }
1033 //Noise Contrastive Estimation
1034 if (nce > 0)
1035 for (d = 0; d < nce + 1; d++) {
1036 if (d == 0) {
1037 target = word;
1038 label = 1;
1039 } else {
1040 next_random = next_random
1041 * (unsigned long long) 25214903917 + 11;
1042 if (word_to_group != NULL
1043 && word_to_group[word] != -1) {
1044 target = word;
1045 while (target == word) {
1046 target =
1047 group_to_table[word_to_group[word]
1048 * table_size
1049 + (next_random >> 16)
1050 % table_size];
1051 next_random =
1052 next_random
1053 * (unsigned long long) 25214903917
1054 + 11;
1055 }
1056 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1057 } else {
1058 target = table[(next_random >> 16)
1059 % table_size];
1060 }
1061 if (target == 0)
1062 target = next_random % (vocab_size - 1) + 1;
1063 if (target == word)
1064 continue;
1065 label = 0;
1066 }
1067 l2 = target * layer1_size;
1068 f = 0;
1069 for (c = 0; c < layer1_size; c++)
1070 f += syn0[c + l1] * syn1nce[c + l2];
1071 if (f > MAX_EXP)
1072 g = (label - 1) * alpha;
1073 else if (f < -MAX_EXP)
1074 g = (label - 0) * alpha;
1075 else {
1076 f = exp(f);
1077 g = (label
1078 - f
1079 / (noise_distribution[target]
1080 * nce + f)) * alpha;
1081 }
1082 for (c = 0; c < layer1_size; c++)
1083 neu1e[c] += g * syn1nce[c + l2];
1084 for (c = 0; c < layer1_size; c++)
1085 syn1nce[c + l2] += g * syn0[c + l1];
1086 if (cap == 1)
1087 for (c = 0; c < layer1_size; c++)
1088 capParam(syn1nce, c + l2);
1089 }
1090 // Learn weights input -> hidden
1091 for (c = 0; c < layer1_size; c++)
1092 syn0[c + l1] += neu1e[c];
1093 }
1094 } else if (type == 2) { //train the cwindow architecture
1095 // in -> hidden
1096 cw = 0;
1097 for (a = 0; a < window * 2 + 1; a++)
1098 if (a != window) {
1099 c = sentence_position - window + a;
1100 if (c < 0)
1101 continue;
1102 if (c >= sentence_length)
1103 continue;
1104 last_word = sen[c];
1105 if (last_word == -1)
1106 continue;
1107 window_offset = a * layer1_size;
1108 if (a > window)
1109 window_offset -= layer1_size;
1110 for (c = 0; c < layer1_size; c++)
1111 neu1[c + window_offset] += syn0[c
1112 + last_word * layer1_size];
1113 cw++;
1114 }
1115 if (cw) {
1116 if (hs)
1117 for (d = 0; d < vocab[word].codelen; d++) {
1118 f = 0;
1119 l2 = vocab[word].point[d] * window_layer_size;
1120 // Propagate hidden -> output
1121 for (c = 0; c < window_layer_size; c++)
1122 f += neu1[c] * syn1_window[c + l2];
1123 if (f <= -MAX_EXP)
1124 continue;
1125 else if (f >= MAX_EXP)
1126 continue;
1127 else
1128 f = expTable[(int) ((f + MAX_EXP)
1129 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1130 // 'g' is the gradient multiplied by the learning rate
1131 g = (1 - vocab[word].code[d] - f) * alpha;
1132 // Propagate errors output -> hidden
1133 for (c = 0; c < window_layer_size; c++)
1134 neu1e[c] += g * syn1_window[c + l2];
1135 // Learn weights hidden -> output
1136 for (c = 0; c < window_layer_size; c++)
1137 syn1_window[c + l2] += g * neu1[c];
1138 if (cap == 1)
1139 for (c = 0; c < window_layer_size; c++)
1140 capParam(syn1_window, c + l2);
1141 }
1142 // NEGATIVE SAMPLING
1143 if (negative > 0)
1144 for (d = 0; d < negative + 1; d++) {
1145 if (d == 0) {
1146 target = word;
1147 label = 1;
1148 } else {
1149 next_random = next_random
1150 * (unsigned long long) 25214903917 + 11;
1151 if (word_to_group != NULL
1152 && word_to_group[word] != -1) {
1153 target = word;
1154 while (target == word) {
1155 target = group_to_table[word_to_group[word]
1156 * table_size
1157 + (next_random >> 16) % table_size];
1158 next_random = next_random
1159 * (unsigned long long) 25214903917
1160 + 11;
1161 }
1162 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1163 } else {
1164 target =
1165 table[(next_random >> 16) % table_size];
1166 }
1167 if (target == 0)
1168 target = next_random % (vocab_size - 1) + 1;
1169 if (target == word)
1170 continue;
1171 label = 0;
1172 }
1173 l2 = target * window_layer_size;
1174 f = 0;
1175 for (c = 0; c < window_layer_size; c++)
1176 f += neu1[c] * syn1neg_window[c + l2];
1177 if (f > MAX_EXP)
1178 g = (label - 1) * alpha;
1179 else if (f < -MAX_EXP)
1180 g = (label - 0) * alpha;
1181 else
1182 g = (label
1183 - expTable[(int) ((f + MAX_EXP)
1184 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1185 * alpha;
1186 for (c = 0; c < window_layer_size; c++)
1187 neu1e[c] += g * syn1neg_window[c + l2];
1188 for (c = 0; c < window_layer_size; c++)
1189 syn1neg_window[c + l2] += g * neu1[c];
1190 if (cap == 1)
1191 for (c = 0; c < window_layer_size; c++)
1192 capParam(syn1neg_window, c + l2);
1193 }
1194 // Noise Contrastive Estimation
1195 if (nce > 0)
1196 for (d = 0; d < nce + 1; d++) {
1197 if (d == 0) {
1198 target = word;
1199 label = 1;
1200 } else {
1201 next_random = next_random
1202 * (unsigned long long) 25214903917 + 11;
1203 if (word_to_group != NULL
1204 && word_to_group[word] != -1) {
1205 target = word;
1206 while (target == word) {
1207 target = group_to_table[word_to_group[word]
1208 * table_size
1209 + (next_random >> 16) % table_size];
1210 next_random = next_random
1211 * (unsigned long long) 25214903917
1212 + 11;
1213 }
1214 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1215 } else {
1216 target =
1217 table[(next_random >> 16) % table_size];
1218 }
1219 if (target == 0)
1220 target = next_random % (vocab_size - 1) + 1;
1221 if (target == word)
1222 continue;
1223 label = 0;
1224 }
1225 l2 = target * window_layer_size;
1226 f = 0;
1227 for (c = 0; c < window_layer_size; c++)
1228 f += neu1[c] * syn1nce_window[c + l2];
1229 if (f > MAX_EXP)
1230 g = (label - 1) * alpha;
1231 else if (f < -MAX_EXP)
1232 g = (label - 0) * alpha;
1233 else {
1234 f = exp(f);
1235 g =
1236 (label
1237 - f
1238 / (noise_distribution[target]
1239 * nce + f)) * alpha;
1240 }
1241 for (c = 0; c < window_layer_size; c++)
1242 neu1e[c] += g * syn1nce_window[c + l2];
1243 for (c = 0; c < window_layer_size; c++)
1244 syn1nce_window[c + l2] += g * neu1[c];
1245 if (cap == 1)
1246 for (c = 0; c < window_layer_size; c++)
1247 capParam(syn1nce_window, c + l2);
1248 }
1249 // hidden -> in
1250 for (a = 0; a < window * 2 + 1; a++)
1251 if (a != window) {
1252 c = sentence_position - window + a;
1253 if (c < 0)
1254 continue;
1255 if (c >= sentence_length)
1256 continue;
1257 last_word = sen[c];
1258 if (last_word == -1)
1259 continue;
1260 window_offset = a * layer1_size;
1261 if (a > window)
1262 window_offset -= layer1_size;
1263 for (c = 0; c < layer1_size; c++)
1264 syn0[c + last_word * layer1_size] += neu1e[c
1265 + window_offset];
1266 }
1267 }
1268 } else if (type == 3) { //train structured skip-gram
1269 for (a = 0; a < window * 2 + 1; a++)
1270 if (a != window) {
1271 c = sentence_position - window + a;
1272 if (c < 0)
1273 continue;
1274 if (c >= sentence_length)
1275 continue;
1276 last_word = sen[c];
1277 if (last_word == -1)
1278 continue;
1279 l1 = last_word * layer1_size;
1280 window_offset = a * layer1_size;
1281 if (a > window)
1282 window_offset -= layer1_size;
1283 for (c = 0; c < layer1_size; c++)
1284 neu1e[c] = 0;
1285 // HIERARCHICAL SOFTMAX
1286 if (hs)
1287 for (d = 0; d < vocab[word].codelen; d++) {
1288 f = 0;
1289 l2 = vocab[word].point[d] * window_layer_size;
1290 // Propagate hidden -> output
1291 for (c = 0; c < layer1_size; c++)
1292 f += syn0[c + l1]
1293 * syn1_window[c + l2 + window_offset];
1294 if (f <= -MAX_EXP)
1295 continue;
1296 else if (f >= MAX_EXP)
1297 continue;
1298 else
1299 f = expTable[(int) ((f + MAX_EXP)
1300 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1301 // 'g' is the gradient multiplied by the learning rate
1302 g = (1 - vocab[word].code[d] - f) * alpha;
1303 // Propagate errors output -> hidden
1304 for (c = 0; c < layer1_size; c++)
1305 neu1e[c] += g
1306 * syn1_window[c + l2 + window_offset];
1307 // Learn weights hidden -> output
1308 for (c = 0; c < layer1_size; c++)
1309 syn1_window[c + l2 + window_offset] += g
1310 * syn0[c + l1];
1311 if (cap == 1)
1312 for (c = 0; c < layer1_size; c++)
1313 capParam(syn1_window, c + l2 + window_offset);
1314 }
1315 // NEGATIVE SAMPLING
1316 if (negative > 0)
1317 for (d = 0; d < negative + 1; d++) {
1318 if (d == 0) {
1319 target = word;
1320 label = 1;
1321 } else {
1322 next_random = next_random
1323 * (unsigned long long) 25214903917 + 11;
1324 if (word_to_group != NULL
1325 && word_to_group[word] != -1) {
1326 target = word;
1327 while (target == word) {
1328 target =
1329 group_to_table[word_to_group[word]
1330 * table_size
1331 + (next_random >> 16)
1332 % table_size];
1333 next_random =
1334 next_random
1335 * (unsigned long long) 25214903917
1336 + 11;
1337 }
1338 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1339 } else {
1340 target = table[(next_random >> 16)
1341 % table_size];
1342 }
1343 if (target == 0)
1344 target = next_random % (vocab_size - 1) + 1;
1345 if (target == word)
1346 continue;
1347 label = 0;
1348 }
1349 l2 = target * window_layer_size;
1350 f = 0;
1351 for (c = 0; c < layer1_size; c++)
1352 f +=
1353 syn0[c + l1]
1354 * syn1neg_window[c + l2
1355 + window_offset];
1356 if (f > MAX_EXP)
1357 g = (label - 1) * alpha;
1358 else if (f < -MAX_EXP)
1359 g = (label - 0) * alpha;
1360 else
1361 g =
1362 (label
1363 - expTable[(int) ((f + MAX_EXP)
1364 * (EXP_TABLE_SIZE
1365 / MAX_EXP / 2))])
1366 * alpha;
1367 if(debug_mode > 2 && ((long long) id) == 0) {
1368 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1369 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1370 }
1371 for (c = 0; c < layer1_size; c++)
1372 neu1e[c] +=
1373 g
1374 * syn1neg_window[c + l2
1375 + window_offset];
1376 for (c = 0; c < layer1_size; c++)
1377 syn1neg_window[c + l2 + window_offset] += g
1378 * syn0[c + l1];
1379 if (cap == 1)
1380 for (c = 0; c < layer1_size; c++)
1381 capParam(syn1neg_window,
1382 c + l2 + window_offset);
1383 }
1384 // Noise Contrastive Estimation
1385 if (nce > 0)
1386 for (d = 0; d < nce + 1; d++) {
1387 if (d == 0) {
1388 target = word;
1389 label = 1;
1390 } else {
1391 next_random = next_random
1392 * (unsigned long long) 25214903917 + 11;
1393 if (word_to_group != NULL
1394 && word_to_group[word] != -1) {
1395 target = word;
1396 while (target == word) {
1397 target =
1398 group_to_table[word_to_group[word]
1399 * table_size
1400 + (next_random >> 16)
1401 % table_size];
1402 next_random =
1403 next_random
1404 * (unsigned long long) 25214903917
1405 + 11;
1406 }
1407 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1408 } else {
1409 target = table[(next_random >> 16)
1410 % table_size];
1411 }
1412 if (target == 0)
1413 target = next_random % (vocab_size - 1) + 1;
1414 if (target == word)
1415 continue;
1416 label = 0;
1417 }
1418 l2 = target * window_layer_size;
1419 f = 0;
1420 for (c = 0; c < layer1_size; c++)
1421 f +=
1422 syn0[c + l1]
1423 * syn1nce_window[c + l2
1424 + window_offset];
1425 if (f > MAX_EXP)
1426 g = (label - 1) * alpha;
1427 else if (f < -MAX_EXP)
1428 g = (label - 0) * alpha;
1429 else {
1430 f = exp(f);
1431 g = (label
1432 - f
1433 / (noise_distribution[target]
1434 * nce + f)) * alpha;
1435 }
1436 for (c = 0; c < layer1_size; c++)
1437 neu1e[c] +=
1438 g
1439 * syn1nce_window[c + l2
1440 + window_offset];
1441 for (c = 0; c < layer1_size; c++)
1442 syn1nce_window[c + l2 + window_offset] += g
1443 * syn0[c + l1];
1444 if (cap == 1)
1445 for (c = 0; c < layer1_size; c++)
1446 capParam(syn1nce_window,
1447 c + l2 + window_offset);
1448 }
1449 // Learn weights input -> hidden
1450 for (c = 0; c < layer1_size; c++) {
1451 syn0[c + l1] += neu1e[c];
1452 if (syn0[c + l1] > 50)
1453 syn0[c + l1] = 50;
1454 if (syn0[c + l1] < -50)
1455 syn0[c + l1] = -50;
1456 }
1457 }
1458 } else if (type == 4) { //training senna
1459 // in -> hidden
1460 cw = 0;
1461 for (a = 0; a < window * 2 + 1; a++)
1462 if (a != window) {
1463 c = sentence_position - window + a;
1464 if (c < 0)
1465 continue;
1466 if (c >= sentence_length)
1467 continue;
1468 last_word = sen[c];
1469 if (last_word == -1)
1470 continue;
1471 window_offset = a * layer1_size;
1472 if (a > window)
1473 window_offset -= layer1_size;
1474 for (c = 0; c < layer1_size; c++)
1475 neu1[c + window_offset] += syn0[c
1476 + last_word * layer1_size];
1477 cw++;
1478 }
1479 if (cw) {
1480 for (a = 0; a < window_hidden_size; a++) {
1481 c = a * window_layer_size;
1482 for (b = 0; b < window_layer_size; b++) {
1483 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1484 }
1485 }
1486 if (hs)
1487 for (d = 0; d < vocab[word].codelen; d++) {
1488 f = 0;
1489 l2 = vocab[word].point[d] * window_hidden_size;
1490 // Propagate hidden -> output
1491 for (c = 0; c < window_hidden_size; c++)
1492 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1493 if (f <= -MAX_EXP)
1494 continue;
1495 else if (f >= MAX_EXP)
1496 continue;
1497 else
1498 f = expTable[(int) ((f + MAX_EXP)
1499 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1500 // 'g' is the gradient multiplied by the learning rate
1501 g = (1 - vocab[word].code[d] - f) * alpha;
1502 // Propagate errors output -> hidden
1503 for (c = 0; c < window_hidden_size; c++)
1504 neu2e[c] += dHardTanh(neu2[c], g) * g
1505 * syn_hidden_word[c + l2];
1506 // Learn weights hidden -> output
1507 for (c = 0; c < window_hidden_size; c++)
1508 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1509 * neu2[c];
1510 }
1511 // NEGATIVE SAMPLING
1512 if (negative > 0)
1513 for (d = 0; d < negative + 1; d++) {
1514 if (d == 0) {
1515 target = word;
1516 label = 1;
1517 } else {
1518 next_random = next_random
1519 * (unsigned long long) 25214903917 + 11;
1520 if (word_to_group != NULL
1521 && word_to_group[word] != -1) {
1522 target = word;
1523 while (target == word) {
1524 target = group_to_table[word_to_group[word]
1525 * table_size
1526 + (next_random >> 16) % table_size];
1527 next_random = next_random
1528 * (unsigned long long) 25214903917
1529 + 11;
1530 }
1531 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1532 } else {
1533 target =
1534 table[(next_random >> 16) % table_size];
1535 }
1536 if (target == 0)
1537 target = next_random % (vocab_size - 1) + 1;
1538 if (target == word)
1539 continue;
1540 label = 0;
1541 }
1542 l2 = target * window_hidden_size;
1543 f = 0;
1544 for (c = 0; c < window_hidden_size; c++)
1545 f += hardTanh(neu2[c])
1546 * syn_hidden_word_neg[c + l2];
1547 if (f > MAX_EXP)
1548 g = (label - 1) * alpha / negative;
1549 else if (f < -MAX_EXP)
1550 g = (label - 0) * alpha / negative;
1551 else
1552 g = (label
1553 - expTable[(int) ((f + MAX_EXP)
1554 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1555 * alpha / negative;
1556 for (c = 0; c < window_hidden_size; c++)
1557 neu2e[c] += dHardTanh(neu2[c], g) * g
1558 * syn_hidden_word_neg[c + l2];
1559 for (c = 0; c < window_hidden_size; c++)
1560 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1561 * g * neu2[c];
1562 }
1563 for (a = 0; a < window_hidden_size; a++)
1564 for (b = 0; b < window_layer_size; b++)
1565 neu1e[b] += neu2e[a]
1566 * syn_window_hidden[a * window_layer_size + b];
1567 for (a = 0; a < window_hidden_size; a++)
1568 for (b = 0; b < window_layer_size; b++)
1569 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1570 * neu1[b];
1571 // hidden -> in
1572 for (a = 0; a < window * 2 + 1; a++)
1573 if (a != window) {
1574 c = sentence_position - window + a;
1575 if (c < 0)
1576 continue;
1577 if (c >= sentence_length)
1578 continue;
1579 last_word = sen[c];
1580 if (last_word == -1)
1581 continue;
1582 window_offset = a * layer1_size;
1583 if (a > window)
1584 window_offset -= layer1_size;
1585 for (c = 0; c < layer1_size; c++)
1586 syn0[c + last_word * layer1_size] += neu1e[c
1587 + window_offset];
1588 }
1589 }
1590 } else {
1591 printf("unknown type %i", type);
1592 exit(0);
1593 }
1594 sentence_position++;
1595 if (sentence_position >= sentence_length) {
1596 sentence_length = 0;
1597 continue;
1598 }
1599 }
1600 fclose(fi);
1601 free(neu1);
1602 free(neu1e);
1603 pthread_exit(NULL);
1604}
1605
1606void ShowCollocations() {
1607 long a, b, c, d, window_offset, target, max_target=0, maxmax_target;
1608 real f, max_f, maxmax_f;
1609 real *target_sums;
1610 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1611
1612 for (d = cc; d < vocab_size; d++) {
1613 for (b = 0; b < vocab_size; b++)
1614 target_sums[b]=0;
1615 maxmax_f = -1;
1616 maxmax_target = 0;
1617 for (a = window * 2; a >= 0; a--) {
1618 if (a != window) {
1619 max_f = -1;
1620 window_offset = a * layer1_size;
1621 if (a > window)
1622 window_offset -= layer1_size;
1623 for(target = 0; target < vocab_size; target ++) {
1624 if(target == d)
1625 continue;
1626 f = 0;
1627 for (c = 0; c < layer1_size; c++)
1628 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1629 if (f < -MAX_EXP)
1630 continue;
1631 else if (f > MAX_EXP)
1632 continue;
1633 else
1634 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1635 if(f > max_f) {
1636 max_f = f;
1637 max_target = target;
1638 }
1639 target_sums[target] += (1-target_sums[target]) * f;
1640 }
1641 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1642 if(max_f > maxmax_f) {
1643 maxmax_f = max_f;
1644 maxmax_target = max_target;
1645 }
1646 } else {
1647 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1648 }
1649 }
1650 max_f = -1;
1651 for (b = 0; b < vocab_size; b++) {
1652 if(target_sums[b] > max_f) {
1653 max_f = target_sums[b];
1654 max_target = b;
1655 }
1656 }
1657 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1658 vocab[max_target].word, max_f,
1659 vocab[maxmax_target].word, maxmax_f);
1660 }
1661}
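// Example invocation (illustrative; file names are placeholders): -show-cc
// needs a vocabulary and a type-3 net trained with negative sampling, e.g.
//   ./word2vec -train corpus.txt -read-vocab vocab.txt -read-net net.bin
//       -output /dev/null -type 3 -negative 5 -show-cc 100
// which prints, for every word from rank 100 onwards, its strongest collocator
// at each window position.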
1662
1663void TrainModel() {
1664 long a, b, c, d;
1665 FILE *fo;
1666 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1667 printf("Starting training using file %s\n", train_file);
1668 starting_alpha = alpha;
1669 if (read_vocab_file[0] != 0)
1670 ReadVocab();
1671 else
1672 LearnVocabFromTrainFile();
1673 if (save_vocab_file[0] != 0)
1674 SaveVocab();
1675 if (output_file[0] == 0)
1676 return;
1677 InitNet();
1678 if(cc > 0)
1679 ShowCollocations();
1680 if (negative > 0 || nce > 0)
1681 InitUnigramTable();
1682 if (negative_classes_file[0] != 0)
1683 InitClassUnigramTable();
1684 start = clock();
1685 for (a = 0; a < num_threads; a++)
1686 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1687 for (a = 0; a < num_threads; a++)
1688 pthread_join(pt[a], NULL);
1689 fo = fopen(output_file, "wb");
1690 if (classes == 0) {
1691 // Save the word vectors
1692 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1693 for (a = 0; a < vocab_size; a++) {
1694 fprintf(fo, "%s ", vocab[a].word);
1695 if (binary)
1696 for (b = 0; b < layer1_size; b++)
1697 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1698 else
1699 for (b = 0; b < layer1_size; b++)
1700 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1701 fprintf(fo, "\n");
1702 }
1703 } else {
1704 // Run K-means on the word vectors
1705 int clcn = classes, iter = 10, closeid;
1706 int *centcn = (int *) malloc(classes * sizeof(int));
1707 int *cl = (int *) calloc(vocab_size, sizeof(int));
1708 real closev, x;
1709 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1710 for (a = 0; a < vocab_size; a++)
1711 cl[a] = a % clcn;
1712 for (a = 0; a < iter; a++) {
1713 for (b = 0; b < clcn * layer1_size; b++)
1714 cent[b] = 0;
1715 for (b = 0; b < clcn; b++)
1716 centcn[b] = 1;
1717 for (c = 0; c < vocab_size; c++) {
1718 for (d = 0; d < layer1_size; d++)
1719 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1720 centcn[cl[c]]++;
1721 }
1722 for (b = 0; b < clcn; b++) {
1723 closev = 0;
1724 for (c = 0; c < layer1_size; c++) {
1725 cent[layer1_size * b + c] /= centcn[b];
1726 closev += cent[layer1_size * b + c]
1727 * cent[layer1_size * b + c];
1728 }
1729 closev = sqrt(closev);
1730 for (c = 0; c < layer1_size; c++)
1731 cent[layer1_size * b + c] /= closev;
1732 }
1733 for (c = 0; c < vocab_size; c++) {
1734 closev = -10;
1735 closeid = 0;
1736 for (d = 0; d < clcn; d++) {
1737 x = 0;
1738 for (b = 0; b < layer1_size; b++)
1739 x += cent[layer1_size * d + b]
1740 * syn0[c * layer1_size + b];
1741 if (x > closev) {
1742 closev = x;
1743 closeid = d;
1744 }
1745 }
1746 cl[c] = closeid;
1747 }
1748 }
1749 // Save the K-means classes
1750 for (a = 0; a < vocab_size; a++)
1751 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1752 free(centcn);
1753 free(cent);
1754 free(cl);
1755 }
1756 fclose(fo);
1757 if (save_net_file[0] != 0)
1758 SaveNet();
1759}
1760
1761int ArgPos(char *str, int argc, char **argv) {
1762 int a;
1763 for (a = 1; a < argc; a++)
1764 if (!strcmp(str, argv[a])) {
1765 if (a == argc - 1) {
1766 printf("Argument missing for %s\n", str);
1767 exit(1);
1768 }
1769 return a;
1770 }
1771 return -1;
1772}
1773
1774int main(int argc, char **argv) {
1775 int i;
1776 if (argc == 1) {
1777 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1778 printf("Options:\n");
1779 printf("Parameters for training:\n");
1780 printf("\t-train <file>\n");
1781 printf("\t\tUse text data from <file> to train the model\n");
1782 printf("\t-output <file>\n");
1783 printf(
1784 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1785 printf("\t-size <int>\n");
1786 printf("\t\tSet size of word vectors; default is 100\n");
1787 printf("\t-window <int>\n");
1788 printf("\t\tSet max skip length between words; default is 5\n");
1789 printf("\t-sample <float>\n");
1790 printf(
1791 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1792 printf(
1793 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1794 printf("\t-hs <int>\n");
1795 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1796 printf("\t-negative <int>\n");
1797 printf(
1798 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1799 printf("\t-negative-classes <file>\n");
1800 printf("\t\tNegative classes to sample from\n");
1801 printf("\t-nce <int>\n");
1802 printf(
1803 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1804 printf("\t-threads <int>\n");
1805 printf("\t\tUse <int> threads (default 12)\n");
1806 printf("\t-iter <int>\n");
1807 printf("\t\tRun more training iterations (default 5)\n");
1808 printf("\t-min-count <int>\n");
1809 printf(
1810 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1811 printf("\t-alpha <float>\n");
1812 printf(
1813 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1814 printf("\t-classes <int>\n");
1815 printf(
1816 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1817 printf("\t-debug <int>\n");
1818 printf(
1819 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1820 printf("\t-binary <int>\n");
1821 printf(
1822 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1823 printf("\t-save-vocab <file>\n");
1824 printf("\t\tThe vocabulary will be saved to <file>\n");
1825 printf("\t-read-vocab <file>\n");
1826 printf(
1827 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1828 printf("\t-read-net <file>\n");
1829 printf(
1830 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1831 printf("\t-save-net <file>\n");
1832 printf("\t\tThe net parameters will be saved to <file>\n");
1833 printf("\t-show-cc <int>\n");
1834 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
1835 printf("\t-type <int>\n");
1836 printf(
1837 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1838 printf("\t-cap <int>\n");
1839 printf(
1840 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1841 printf("\nExamples:\n");
1842 printf(
1843 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1844 return 0;
1845 }
1846 output_file[0] = 0;
1847 save_vocab_file[0] = 0;
1848 read_vocab_file[0] = 0;
1849 save_net_file[0] = 0;
1850 read_net_file[0] = 0;
1851 negative_classes_file[0] = 0;
1852 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1853 layer1_size = atoi(argv[i + 1]);
1854 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1855 strcpy(train_file, argv[i + 1]);
1856 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1857 strcpy(save_vocab_file, argv[i + 1]);
1858 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1859 strcpy(read_vocab_file, argv[i + 1]);
1860 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1861 strcpy(save_net_file, argv[i + 1]);
1862 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1863 strcpy(read_net_file, argv[i + 1]);
1864 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1865 debug_mode = atoi(argv[i + 1]);
1866 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1867 binary = atoi(argv[i + 1]);
1868 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
1869 cc = atoi(argv[i + 1]);
1870 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1871 type = atoi(argv[i + 1]);
1872 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1873 strcpy(output_file, argv[i + 1]);
1874 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1875 window = atoi(argv[i + 1]);
1876 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1877 sample = atof(argv[i + 1]);
1878 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1879 hs = atoi(argv[i + 1]);
1880 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1881 negative = atoi(argv[i + 1]);
1882 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1883 strcpy(negative_classes_file, argv[i + 1]);
1884 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1885 nce = atoi(argv[i + 1]);
1886 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1887 num_threads = atoi(argv[i + 1]);
1888 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1889 iter = atoi(argv[i + 1]);
1890 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1891 min_count = atoi(argv[i + 1]);
1892 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1893 classes = atoi(argv[i + 1]);
1894 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1895 cap = atoi(argv[i + 1]);
1896 if (type == 0 || type == 2 || type == 4)
1897 alpha = 0.05;
1898 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1899 alpha = atof(argv[i + 1]);
1900 vocab = (struct vocab_word *) calloc(vocab_max_size,
1901 sizeof(struct vocab_word));
1902 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1903 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1904 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1905 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1906 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1907 }
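// Illustrative values (MAX_EXP = 6, EXP_TABLE_SIZE = 1000 as defined above):
// expTable[i] holds sigmoid((i / 1000.0 * 2 - 1) * 6), so expTable[0] is about
// sigmoid(-6) = 0.0025, expTable[500] = sigmoid(0) = 0.5, and expTable[999] is
// about sigmoid(6) = 0.9975; the training loops look these values up instead
// of calling exp() for every dot product.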
1908 TrainModel();
1909 return 0;
1910}
1911