// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>   // clock(), clock_t used for progress reporting

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CC 100
#define MAX_CODE_LENGTH 40

const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary

typedef float real; // Precision of float numbers

struct vocab_word {
  long long cn;
  int *point;
  char *word, *code, codelen;
};

char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
struct vocab_word *vocab;
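// Model architecture selected with -type (see usage in main()):
// 0 = CBOW, 1 = skip-gram (default), 2 = cwindow, 3 = structured skip-gram,
// 4 = senna-type model with a window -> hidden layer.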
int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
    num_threads = 12, min_reduce = 1;
int *vocab_hash;
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
    classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
clock_t start;

real *syn1_window, *syn1neg_window, *syn1nce_window;
int w_offset, window_layer_size;

int window_hidden_size = 500;
real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
    *syn_hidden_word_nce;

int hs = 0, negative = 5;
const int table_size = 1e8;
int *table;

long cc = 0;

// contrastive negative sampling
char negative_classes_file[MAX_STRING];
int *word_to_group;
int *group_to_table; // group_size * table_size
int class_number;

// nce
real* noise_distribution;
int nce = 0;

// param caps
real CAP_VALUE = 50;
int cap = 0;

void capParam(real* array, int index) {
  if (array[index] > CAP_VALUE)
    array[index] = CAP_VALUE;
  else if (array[index] < -CAP_VALUE)
    array[index] = -CAP_VALUE;
}

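// Hard tanh activation and a simplified gradient gate for it, used by the
// senna-type model (type 4): the gradient is zeroed only when the unit is
// saturated and the update would push it further outside [-1, 1].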
real hardTanh(real x) {
  if (x >= 1) {
    return 1;
  } else if (x <= -1) {
    return -1;
  } else {
    return x;
  }
}

real dHardTanh(real x, real g) {
  if (x > 1 && g > 0) {
    return 0;
  }
  if (x < -1 && g < 0) {
    return 0;
  }
  return 1;
}

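// Builds the unigram table used to draw negative samples: each word occupies
// a share of the table proportional to count^0.75. Also fills
// noise_distribution, the normalized distribution needed for NCE.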
void InitUnigramTable() {
  int a, i;
  long long train_words_pow = 0;
  real d1, power = 0.75;
  table = (int *) malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++)
    train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (real) table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
    }
    if (i >= vocab_size)
      i = vocab_size - 1;
  }

  noise_distribution = (real *) calloc(vocab_size, sizeof(real));
  for (a = 0; a < vocab_size; a++)
    noise_distribution[a] = pow(vocab[a].cn, power)
        / (real) train_words_pow;
}

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13)
      continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n')
          ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *) "</s>");
        return;
      } else
        continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1)
      a--; // Truncate words that are too long
  }
  word[a] = 0;
}

// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++)
    hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1)
      return -1;
    if (!strcmp(word, vocab[vocab_hash[hash]].word))
      return vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}

// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin))
    return -1;
  return SearchVocab(word);
}

// Adds a word to the vocabulary
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING)
    length = MAX_STRING;
  vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *) realloc(vocab,
        vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word);
  while (vocab_hash[hash] != -1)
    hash = (hash + 1) % vocab_hash_size;
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;
}

// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
}

// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++)
    vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Hash has to be re-computed, as it is no longer valid after sorting
      hash = GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1)
        hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *) realloc(vocab,
      (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}

// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < vocab_size; a++)
    if (vocab[a].cn > min_reduce) {
      vocab[b].cn = vocab[a].cn;
      vocab[b].word = vocab[a].word;
      b++;
    } else
      free(vocab[a].word);
  vocab_size = b;
  for (a = 0; a < vocab_hash_size; a++)
    vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // Hash has to be re-computed, as it is no longer valid
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1)
      hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}

// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];
  long long *count = (long long *) calloc(vocab_size * 2 + 1,
      sizeof(long long));
  long long *binary = (long long *) calloc(vocab_size * 2 + 1,
      sizeof(long long));
  long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
      sizeof(long long));
  for (a = 0; a < vocab_size; a++)
    count[a] = vocab[a].cn;
  for (a = vocab_size; a < vocab_size * 2; a++)
    count[a] = 1e15;
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    count[vocab_size + a] = count[min1i] + count[min2i];
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1;
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size; a++) {
    b = a;
    i = 0;
    while (1) {
      code[i] = binary[b];
      point[i] = b;
      i++;
      b = parent_node[b];
      if (b == vocab_size * 2 - 2)
        break;
    }
    vocab[a].codelen = i;
    vocab[a].point[0] = vocab_size - 2;
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];
      vocab[a].point[i - b] = point[b] - vocab_size;
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}

void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++)
    vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *) "</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin))
      break;
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else
      vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7)
      ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}

void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++)
    fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}

void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++)
    vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin))
      break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}

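// Reads the -negative-classes file (whitespace-separated class/word records),
// assigns each known word to a class, and builds one unigram sampling table
// per class so that negative samples can be drawn from the same class as the
// current word.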
void InitClassUnigramTable() {
  long long a, c;
  printf("loading class unigrams \n");
  FILE *fin = fopen(negative_classes_file, "rb");
  if (fin == NULL) {
    printf("ERROR: class file not found!\n");
    exit(1);
  }
  word_to_group = (int *) malloc(vocab_size * sizeof(int));
  for (a = 0; a < vocab_size; a++)
    word_to_group[a] = -1;
  char class[MAX_STRING];
  char prev_class[MAX_STRING];
  prev_class[0] = 0;
  char word[MAX_STRING];
  class_number = -1;
  while (1) {
    if (feof(fin))
      break;
    ReadWord(class, fin);
    ReadWord(word, fin);
    int word_index = SearchVocab(word);
    if (word_index != -1) {
      if (strcmp(class, prev_class) != 0) {
        class_number++;
        strcpy(prev_class, class);
      }
      word_to_group[word_index] = class_number;
    }
    ReadWord(word, fin);
  }
  class_number++;
  fclose(fin);

  group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
  long long train_words_pow = 0;
  real d1, power = 0.75;

  for (c = 0; c < class_number; c++) {
    long long offset = c * table_size;
    train_words_pow = 0;
    for (a = 0; a < vocab_size; a++)
      if (word_to_group[a] == c)
        train_words_pow += pow(vocab[a].cn, power);
    int i = 0;
    while (i < vocab_size && word_to_group[i] != c)
      i++;
    d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
    for (a = 0; a < table_size; a++) {
      //printf("index %lld , word %d\n", a, i);
      group_to_table[offset + a] = i;
      if (a / (real) table_size > d1) {
        i++;
        while (i < vocab_size && word_to_group[i] != c)
          i++;
        d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
      }
      if (i >= vocab_size)
        while (i >= 0 && word_to_group[i] != c)
          i--;
    }
  }
}

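// Writes the command line used for this run to "<output_file>.args" so that
// the training parameters can be reconstructed later.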
void SaveArgs(int argc, char **argv) {
  unsigned int i;
  size_t len = 0;
  char *_all_args, *all_args;
  char *args_file = (char *) malloc(strlen(output_file) + strlen(".args") + 1);
  strcpy(args_file, output_file);
  strcat(args_file, ".args");
  FILE *fargs = fopen(args_file, "w");
  if (fargs == NULL) {
    printf("Cannot save args to %s.\n", args_file);
    return;
  }

  for (i = 1; i < argc; i++) {
    len += strlen(argv[i]);
  }

  _all_args = all_args = (char *) malloc(len + argc - 1);

  for (i = 1; i < argc; i++) {
    memcpy(_all_args, argv[i], strlen(argv[i]));
    _all_args += strlen(argv[i]) + 1;
    *(_all_args - 1) = ' ';
  }
  *(_all_args - 1) = 0;

  fprintf(fargs, "%s\n", all_args);
  fclose(fargs);

  free(all_args);
  free(args_file);

  return;
}

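// Dumps the input embeddings (syn0) and the position-dependent output weights
// (syn1neg_window) in raw binary form; only supported for the structured
// skip-gram model (type 3) trained with negative sampling.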
void SaveNet() {
  if (type != 3 || negative <= 0) {
    fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
    return;
  }

  FILE *fnet = fopen(save_net_file, "wb");
  if (fnet == NULL) {
    printf("Net parameter file could not be opened for writing\n");
    exit(1);
  }
  fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
  fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
  fclose(fnet);
}

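// Allocates and initializes all weight matrices needed by the selected
// architecture. Input embeddings get small random values (or are loaded from
// -read-net for type 3 with negative sampling); output weights start at zero.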
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;
  long long read;

  window_layer_size = layer1_size * window * 2;
  a = posix_memalign((void **) &syn0, 128,
      (long long) vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) {
    printf("Memory allocation failed\n");
    exit(1);
  }

  if (hs) {
    a = posix_memalign((void **) &syn1, 128,
        (long long) vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    a = posix_memalign((void **) &syn1_window, 128,
        (long long) vocab_size * window_layer_size * sizeof(real));
    if (syn1_window == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    a = posix_memalign((void **) &syn_hidden_word, 128,
        (long long) vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }

    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < layer1_size; b++)
        syn1[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < window_layer_size; b++)
        syn1_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < window_hidden_size; b++)
        syn_hidden_word[a * window_hidden_size + b] = 0;
  }
  if (negative > 0) {
    if (type == 0) {
      a = posix_memalign((void **) &syn1neg, 128,
          (long long) vocab_size * layer1_size * sizeof(real));
      if (syn1neg == NULL) {
        printf("Memory allocation failed\n");
        exit(1);
      }
      for (a = 0; a < vocab_size; a++)
        for (b = 0; b < layer1_size; b++)
          syn1neg[a * layer1_size + b] = 0;
    } else if (type == 3) {
      a = posix_memalign((void **) &syn1neg_window, 128,
          (long long) vocab_size * window_layer_size * sizeof(real));
      if (syn1neg_window == NULL) {
        printf("Memory allocation failed\n");
        exit(1);
      }
      for (a = 0; a < vocab_size; a++)
        for (b = 0; b < window_layer_size; b++)
          syn1neg_window[a * window_layer_size + b] = 0;
    } else if (type == 4) {
      a = posix_memalign((void **) &syn_hidden_word_neg, 128,
          (long long) vocab_size * window_hidden_size * sizeof(real));
      if (syn_hidden_word_neg == NULL) {
        printf("Memory allocation failed\n");
        exit(1);
      }
      for (a = 0; a < vocab_size; a++)
        for (b = 0; b < window_hidden_size; b++)
          syn_hidden_word_neg[a * window_hidden_size + b] = 0;
    }
  }
  if (nce > 0) {
    a = posix_memalign((void **) &syn1nce, 128,
        (long long) vocab_size * layer1_size * sizeof(real));
    if (syn1nce == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    a = posix_memalign((void **) &syn1nce_window, 128,
        (long long) vocab_size * window_layer_size * sizeof(real));
    if (syn1nce_window == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    a = posix_memalign((void **) &syn_hidden_word_nce, 128,
        (long long) vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word_nce == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }

    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < layer1_size; b++)
        syn1nce[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < window_layer_size; b++)
        syn1nce_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < window_hidden_size; b++)
        syn_hidden_word_nce[a * window_hidden_size + b] = 0;
  }

  if (type == 4) {
    a = posix_memalign((void **) &syn_window_hidden, 128,
        window_hidden_size * window_layer_size * sizeof(real));
    if (syn_window_hidden == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    for (a = 0; a < window_hidden_size * window_layer_size; a++) {
      next_random = next_random * (unsigned long long) 25214903917 + 11;
      syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
          - 0.5) / (window_hidden_size * window_layer_size);
    }
  }

  if (read_net_file[0] == 0) {
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < layer1_size; b++) {
        next_random = next_random * (unsigned long long) 25214903917
            + 11;
        syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
            / (real) 65536) - 0.5) / layer1_size;
      }
  } else if (type == 3 && negative > 0) {
    FILE *fnet = fopen(read_net_file, "rb");
    if (fnet == NULL) {
      printf("Net parameter file not found\n");
      exit(1);
    }
    printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n",
        vocab_size, layer1_size, window_layer_size);
    read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
    if (read != vocab_size * layer1_size) {
      fprintf(stderr, "read-net failed %lld\n", read);
      exit(-1);
    }
    read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
    if (read != (long long) vocab_size * window_layer_size) {
      fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read,
          (long long) vocab_size * window_layer_size);
      exit(-1);
    }
    fgetc(fnet);
    if (!feof(fnet)) {
      fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
      exit(-1);
    }
    fclose(fnet);
  } else {
    fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
    exit(-1);
  }

  CreateBinaryTree();
}

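// Worker executed by each training thread: reads its own slice of the training
// file, builds subsampled sentences, and applies the update rules of the
// selected architecture (CBOW, skip-gram, cwindow, structured skip-gram, or
// senna-type), using hierarchical softmax, negative sampling and/or NCE.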
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0,
      sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long) id;
  real f, g;
  clock_t now;
  int input_len_1 = layer1_size;
  int window_offset = -1;
  if (type == 2 || type == 4) {
    input_len_1 = window_layer_size;
  }
  real *neu1 = (real *) calloc(input_len_1, sizeof(real));
  real *neu1e = (real *) calloc(input_len_1, sizeof(real));

  int input_len_2 = 0;
  if (type == 4) {
    input_len_2 = window_hidden_size;
  }
  real *neu2 = (real *) calloc(input_len_2, sizeof(real));
  real *neu2e = (real *) calloc(input_len_2, sizeof(real));

  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ",
            13, alpha,
            word_count_actual / (real) (iter * train_words + 1) * 100,
            word_count_actual
                / ((real) (now - start + 1) / (real) CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha
          * (1 - word_count_actual / (real) (iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001)
        alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi))
          break;
        if (word == -1)
          continue;
        word_count++;
        if (word == 0)
          break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words))
              + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long) 25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real) 65536) {
            if (type == 3) // in structured skip-grams
              word = -2;   // keep the window position correct
            else
              continue;
          }
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH)
          break;
      }
      sentence_position = 0;
    }
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0)
        break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long) num_threads * (long long) id,
          SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -2)
      word = sen[++sentence_position];
    if (word == -1)
      continue;
    for (c = 0; c < input_len_1; c++)
      neu1[c] = 0;
    for (c = 0; c < input_len_1; c++)
      neu1e[c] = 0;
    for (c = 0; c < input_len_2; c++)
      neu2[c] = 0;
    for (c = 0; c < input_len_2; c++)
      neu2e[c] = 0;
    next_random = next_random * (unsigned long long) 25214903917 + 11;
    b = next_random % window;
    if (type == 0) { // train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0)
            continue;
          if (c >= sentence_length)
            continue;
          last_word = sen[c];
          if (last_word == -1)
            continue;
          for (c = 0; c < layer1_size; c++)
            neu1[c] += syn0[c + last_word * layer1_size];
          cw++;
        }
      if (cw) {
        for (c = 0; c < layer1_size; c++)
          neu1[c] /= cw;
        if (hs)
          for (d = 0; d < vocab[word].codelen; d++) {
            f = 0;
            l2 = vocab[word].point[d] * layer1_size;
            // Propagate hidden -> output
            for (c = 0; c < layer1_size; c++)
              f += neu1[c] * syn1[c + l2];
            if (f <= -MAX_EXP)
              continue;
            else if (f >= MAX_EXP)
              continue;
            else
              f = expTable[(int) ((f + MAX_EXP)
                  * (EXP_TABLE_SIZE / MAX_EXP / 2))];
            // 'g' is the gradient multiplied by the learning rate
            g = (1 - vocab[word].code[d] - f) * alpha;
            // Propagate errors output -> hidden
            for (c = 0; c < layer1_size; c++)
              neu1e[c] += g * syn1[c + l2];
            // Learn weights hidden -> output
            for (c = 0; c < layer1_size; c++)
              syn1[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < layer1_size; c++)
                capParam(syn1, c + l2);
          }
        // NEGATIVE SAMPLING
        if (negative > 0)
          for (d = 0; d < negative + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random
                  * (unsigned long long) 25214903917 + 11;
              if (word_to_group != NULL
                  && word_to_group[word] != -1) {
                target = word;
                while (target == word) {
                  target = group_to_table[word_to_group[word]
                      * table_size
                      + (next_random >> 16) % table_size];
                  next_random = next_random
                      * (unsigned long long) 25214903917 + 11;
                }
                //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
              } else {
                target = table[(next_random >> 16) % table_size];
              }
              if (target == 0)
                target = next_random % (vocab_size - 1) + 1;
              if (target == word)
                continue;
              label = 0;
            }
            l2 = target * layer1_size;
            f = 0;
            for (c = 0; c < layer1_size; c++)
              f += neu1[c] * syn1neg[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha;
            else
              g = (label
                  - expTable[(int) ((f + MAX_EXP)
                      * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
            for (c = 0; c < layer1_size; c++)
              neu1e[c] += g * syn1neg[c + l2];
            for (c = 0; c < layer1_size; c++)
              syn1neg[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < layer1_size; c++)
                capParam(syn1neg, c + l2);
          }
        // Noise Contrastive Estimation
        if (nce > 0)
          for (d = 0; d < nce + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random
                  * (unsigned long long) 25214903917 + 11;
              if (word_to_group != NULL
                  && word_to_group[word] != -1) {
                target = word;
                while (target == word) {
                  target = group_to_table[word_to_group[word]
                      * table_size
                      + (next_random >> 16) % table_size];
                  next_random = next_random
                      * (unsigned long long) 25214903917 + 11;
                }
              } else {
                target = table[(next_random >> 16) % table_size];
              }
              if (target == 0)
                target = next_random % (vocab_size - 1) + 1;
              if (target == word)
                continue;
              label = 0;
            }
            l2 = target * layer1_size;
            f = 0;

            for (c = 0; c < layer1_size; c++)
              f += neu1[c] * syn1nce[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha;
            else {
              f = exp(f);
              g = (label
                  - f / (noise_distribution[target] * nce + f)) * alpha;
            }
            for (c = 0; c < layer1_size; c++)
              neu1e[c] += g * syn1nce[c + l2];
            for (c = 0; c < layer1_size; c++)
              syn1nce[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < layer1_size; c++)
                capParam(syn1nce, c + l2);
          }
        // hidden -> in
        for (a = b; a < window * 2 + 1 - b; a++)
          if (a != window) {
            c = sentence_position - window + a;
            if (c < 0)
              continue;
            if (c >= sentence_length)
              continue;
            last_word = sen[c];
            if (last_word == -1)
              continue;
            for (c = 0; c < layer1_size; c++)
              syn0[c + last_word * layer1_size] += neu1e[c];
          }
      }
    } else if (type == 1) { // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0)
            continue;
          if (c >= sentence_length)
            continue;
          last_word = sen[c];
          if (last_word == -1)
            continue;
          l1 = last_word * layer1_size;
          for (c = 0; c < layer1_size; c++)
            neu1e[c] = 0;
          // HIERARCHICAL SOFTMAX
          if (hs)
            for (d = 0; d < vocab[word].codelen; d++) {
              f = 0;
              l2 = vocab[word].point[d] * layer1_size;
              // Propagate hidden -> output
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1] * syn1[c + l2];
              if (f <= -MAX_EXP)
                continue;
              else if (f >= MAX_EXP)
                continue;
              else
                f = expTable[(int) ((f + MAX_EXP)
                    * (EXP_TABLE_SIZE / MAX_EXP / 2))];
              // 'g' is the gradient multiplied by the learning rate
              g = (1 - vocab[word].code[d] - f) * alpha;
              // Propagate errors output -> hidden
              for (c = 0; c < layer1_size; c++)
                neu1e[c] += g * syn1[c + l2];
              // Learn weights hidden -> output
              for (c = 0; c < layer1_size; c++)
                syn1[c + l2] += g * syn0[c + l1];
              if (cap == 1)
                for (c = 0; c < layer1_size; c++)
                  capParam(syn1, c + l2);
            }
          // NEGATIVE SAMPLING
          if (negative > 0)
            for (d = 0; d < negative + 1; d++) {
              if (d == 0) {
                target = word;
                label = 1;
              } else {
                next_random = next_random
                    * (unsigned long long) 25214903917 + 11;
                if (word_to_group != NULL
                    && word_to_group[word] != -1) {
                  target = word;
                  while (target == word) {
                    target = group_to_table[word_to_group[word]
                        * table_size
                        + (next_random >> 16) % table_size];
                    next_random = next_random
                        * (unsigned long long) 25214903917 + 11;
                  }
                  //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
                } else {
                  target = table[(next_random >> 16) % table_size];
                }
                if (target == 0)
                  target = next_random % (vocab_size - 1) + 1;
                if (target == word)
                  continue;
                label = 0;
              }
              l2 = target * layer1_size;
              f = 0;
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1] * syn1neg[c + l2];
              if (f > MAX_EXP)
                g = (label - 1) * alpha;
              else if (f < -MAX_EXP)
                g = (label - 0) * alpha;
              else
                g = (label
                    - expTable[(int) ((f + MAX_EXP)
                        * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
              for (c = 0; c < layer1_size; c++)
                neu1e[c] += g * syn1neg[c + l2];
              for (c = 0; c < layer1_size; c++)
                syn1neg[c + l2] += g * syn0[c + l1];
              if (cap == 1)
                for (c = 0; c < layer1_size; c++)
                  capParam(syn1neg, c + l2);
            }
          // Noise Contrastive Estimation
          if (nce > 0)
            for (d = 0; d < nce + 1; d++) {
              if (d == 0) {
                target = word;
                label = 1;
              } else {
                next_random = next_random
                    * (unsigned long long) 25214903917 + 11;
                if (word_to_group != NULL
                    && word_to_group[word] != -1) {
                  target = word;
                  while (target == word) {
                    target = group_to_table[word_to_group[word]
                        * table_size
                        + (next_random >> 16) % table_size];
                    next_random = next_random
                        * (unsigned long long) 25214903917 + 11;
                  }
                  //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
                } else {
                  target = table[(next_random >> 16) % table_size];
                }
                if (target == 0)
                  target = next_random % (vocab_size - 1) + 1;
                if (target == word)
                  continue;
                label = 0;
              }
              l2 = target * layer1_size;
              f = 0;
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1] * syn1nce[c + l2];
              if (f > MAX_EXP)
                g = (label - 1) * alpha;
              else if (f < -MAX_EXP)
                g = (label - 0) * alpha;
              else {
                f = exp(f);
                g = (label
                    - f / (noise_distribution[target] * nce + f)) * alpha;
              }
              for (c = 0; c < layer1_size; c++)
                neu1e[c] += g * syn1nce[c + l2];
              for (c = 0; c < layer1_size; c++)
                syn1nce[c + l2] += g * syn0[c + l1];
              if (cap == 1)
                for (c = 0; c < layer1_size; c++)
                  capParam(syn1nce, c + l2);
            }
          // Learn weights input -> hidden
          for (c = 0; c < layer1_size; c++)
            syn0[c + l1] += neu1e[c];
        }
    } else if (type == 2) { // train the cwindow architecture
      // in -> hidden
      cw = 0;
      for (a = 0; a < window * 2 + 1; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0)
            continue;
          if (c >= sentence_length)
            continue;
          last_word = sen[c];
          if (last_word == -1)
            continue;
          window_offset = a * layer1_size;
          if (a > window)
            window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++)
            neu1[c + window_offset] += syn0[c + last_word * layer1_size];
          cw++;
        }
      if (cw) {
        if (hs)
          for (d = 0; d < vocab[word].codelen; d++) {
            f = 0;
            l2 = vocab[word].point[d] * window_layer_size;
            // Propagate hidden -> output
            for (c = 0; c < window_layer_size; c++)
              f += neu1[c] * syn1_window[c + l2];
            if (f <= -MAX_EXP)
              continue;
            else if (f >= MAX_EXP)
              continue;
            else
              f = expTable[(int) ((f + MAX_EXP)
                  * (EXP_TABLE_SIZE / MAX_EXP / 2))];
            // 'g' is the gradient multiplied by the learning rate
            g = (1 - vocab[word].code[d] - f) * alpha;
            // Propagate errors output -> hidden
            for (c = 0; c < window_layer_size; c++)
              neu1e[c] += g * syn1_window[c + l2];
            // Learn weights hidden -> output
            for (c = 0; c < window_layer_size; c++)
              syn1_window[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < window_layer_size; c++)
                capParam(syn1_window, c + l2);
          }
        // NEGATIVE SAMPLING
        if (negative > 0)
          for (d = 0; d < negative + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random
                  * (unsigned long long) 25214903917 + 11;
              if (word_to_group != NULL
                  && word_to_group[word] != -1) {
                target = word;
                while (target == word) {
                  target = group_to_table[word_to_group[word]
                      * table_size
                      + (next_random >> 16) % table_size];
                  next_random = next_random
                      * (unsigned long long) 25214903917 + 11;
                }
                //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
              } else {
                target = table[(next_random >> 16) % table_size];
              }
              if (target == 0)
                target = next_random % (vocab_size - 1) + 1;
              if (target == word)
                continue;
              label = 0;
            }
            l2 = target * window_layer_size;
            f = 0;
            for (c = 0; c < window_layer_size; c++)
              f += neu1[c] * syn1neg_window[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha;
            else
              g = (label
                  - expTable[(int) ((f + MAX_EXP)
                      * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
            for (c = 0; c < window_layer_size; c++)
              neu1e[c] += g * syn1neg_window[c + l2];
            for (c = 0; c < window_layer_size; c++)
              syn1neg_window[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < window_layer_size; c++)
                capParam(syn1neg_window, c + l2);
          }
        // Noise Contrastive Estimation
        if (nce > 0)
          for (d = 0; d < nce + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random
                  * (unsigned long long) 25214903917 + 11;
              if (word_to_group != NULL
                  && word_to_group[word] != -1) {
                target = word;
                while (target == word) {
                  target = group_to_table[word_to_group[word]
                      * table_size
                      + (next_random >> 16) % table_size];
                  next_random = next_random
                      * (unsigned long long) 25214903917 + 11;
                }
                //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
              } else {
                target = table[(next_random >> 16) % table_size];
              }
              if (target == 0)
                target = next_random % (vocab_size - 1) + 1;
              if (target == word)
                continue;
              label = 0;
            }
            l2 = target * window_layer_size;
            f = 0;
            for (c = 0; c < window_layer_size; c++)
              f += neu1[c] * syn1nce_window[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha;
            else {
              f = exp(f);
              g = (label
                  - f / (noise_distribution[target] * nce + f)) * alpha;
            }
            for (c = 0; c < window_layer_size; c++)
              neu1e[c] += g * syn1nce_window[c + l2];
            for (c = 0; c < window_layer_size; c++)
              syn1nce_window[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < window_layer_size; c++)
                capParam(syn1nce_window, c + l2);
          }
        // hidden -> in
        for (a = 0; a < window * 2 + 1; a++)
          if (a != window) {
            c = sentence_position - window + a;
            if (c < 0)
              continue;
            if (c >= sentence_length)
              continue;
            last_word = sen[c];
            if (last_word == -1)
              continue;
            window_offset = a * layer1_size;
            if (a > window)
              window_offset -= layer1_size;
            for (c = 0; c < layer1_size; c++)
              syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
          }
      }
    } else if (type == 3) { // train structured skip-gram
      for (a = 0; a < window * 2 + 1; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0)
            continue;
          if (sen[c] == -2)
            continue;
          if (c >= sentence_length)
            continue;
          last_word = sen[c];
          if (last_word == -1)
            continue;
          l1 = last_word * layer1_size;
          window_offset = a * layer1_size;
          if (a > window)
            window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++)
            neu1e[c] = 0;
          // HIERARCHICAL SOFTMAX
          if (hs)
            for (d = 0; d < vocab[word].codelen; d++) {
              f = 0;
              l2 = vocab[word].point[d] * window_layer_size;
              // Propagate hidden -> output
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1] * syn1_window[c + l2 + window_offset];
              if (f <= -MAX_EXP)
                continue;
              else if (f >= MAX_EXP)
                continue;
              else
                f = expTable[(int) ((f + MAX_EXP)
                    * (EXP_TABLE_SIZE / MAX_EXP / 2))];
              // 'g' is the gradient multiplied by the learning rate
              g = (1 - vocab[word].code[d] - f) * alpha;
              // Propagate errors output -> hidden
              for (c = 0; c < layer1_size; c++)
                neu1e[c] += g * syn1_window[c + l2 + window_offset];
              // Learn weights hidden -> output
              for (c = 0; c < layer1_size; c++)
                syn1[c + l2 + window_offset] += g * syn0[c + l1];
              if (cap == 1)
                for (c = 0; c < layer1_size; c++)
                  capParam(syn1, c + l2 + window_offset);
            }
          // NEGATIVE SAMPLING
          if (negative > 0)
            for (d = 0; d < negative + 1; d++) {
              if (d == 0) {
                target = word;
                label = 1;
              } else {
                next_random = next_random
                    * (unsigned long long) 25214903917 + 11;
                if (word_to_group != NULL
                    && word_to_group[word] != -1) {
                  target = word;
                  while (target == word) {
                    target = group_to_table[word_to_group[word]
                        * table_size
                        + (next_random >> 16) % table_size];
                    next_random = next_random
                        * (unsigned long long) 25214903917 + 11;
                  }
                  //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
                } else {
                  target = table[(next_random >> 16) % table_size];
                }
                if (target == 0)
                  target = next_random % (vocab_size - 1) + 1;
                if (target == word)
                  continue;
                label = 0;
              }
              l2 = target * window_layer_size;
              f = 0;
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1]
                    * syn1neg_window[c + l2 + window_offset];
              if (f > MAX_EXP)
                g = (label - 1) * alpha;
              else if (f < -MAX_EXP)
                g = (label - 0) * alpha;
              else
                g = (label
                    - expTable[(int) ((f + MAX_EXP)
                        * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
              if (debug_mode > 2 && ((long long) id) == 0) {
                printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ",
                    d, vocab[word].word, word, vocab[last_word].word,
                    vocab[target].word, target);
                printf("label %lld, a %lld, gain %.4f\n", label, a - window, g);
              }
              for (c = 0; c < layer1_size; c++)
                neu1e[c] += g
                    * syn1neg_window[c + l2 + window_offset];
              for (c = 0; c < layer1_size; c++)
                syn1neg_window[c + l2 + window_offset] += g * syn0[c + l1];
              if (cap == 1)
                for (c = 0; c < layer1_size; c++)
                  capParam(syn1neg_window, c + l2 + window_offset);
            }
          // Noise Contrastive Estimation
          if (nce > 0)
            for (d = 0; d < nce + 1; d++) {
              if (d == 0) {
                target = word;
                label = 1;
              } else {
                next_random = next_random
                    * (unsigned long long) 25214903917 + 11;
                if (word_to_group != NULL
                    && word_to_group[word] != -1) {
                  target = word;
                  while (target == word) {
                    target = group_to_table[word_to_group[word]
                        * table_size
                        + (next_random >> 16) % table_size];
                    next_random = next_random
                        * (unsigned long long) 25214903917 + 11;
                  }
                  //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
                } else {
                  target = table[(next_random >> 16) % table_size];
                }
                if (target == 0)
                  target = next_random % (vocab_size - 1) + 1;
                if (target == word)
                  continue;
                label = 0;
              }
              l2 = target * window_layer_size;
              f = 0;
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1]
                    * syn1nce_window[c + l2 + window_offset];
              if (f > MAX_EXP)
                g = (label - 1) * alpha;
              else if (f < -MAX_EXP)
                g = (label - 0) * alpha;
              else {
                f = exp(f);
                g = (label
                    - f / (noise_distribution[target] * nce + f)) * alpha;
              }
              for (c = 0; c < layer1_size; c++)
                neu1e[c] += g
                    * syn1nce_window[c + l2 + window_offset];
              for (c = 0; c < layer1_size; c++)
                syn1nce_window[c + l2 + window_offset] += g * syn0[c + l1];
              if (cap == 1)
                for (c = 0; c < layer1_size; c++)
                  capParam(syn1nce_window, c + l2 + window_offset);
            }
          // Learn weights input -> hidden
          for (c = 0; c < layer1_size; c++) {
            syn0[c + l1] += neu1e[c];
            if (syn0[c + l1] > 50)
              syn0[c + l1] = 50;
            if (syn0[c + l1] < -50)
              syn0[c + l1] = -50;
          }
        }
    } else if (type == 4) { // train the senna-type architecture
      // in -> hidden
      cw = 0;
      for (a = 0; a < window * 2 + 1; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0)
            continue;
          if (c >= sentence_length)
            continue;
          last_word = sen[c];
          if (last_word == -1)
            continue;
          window_offset = a * layer1_size;
          if (a > window)
            window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++)
            neu1[c + window_offset] += syn0[c + last_word * layer1_size];
          cw++;
        }
      if (cw) {
        for (a = 0; a < window_hidden_size; a++) {
          c = a * window_layer_size;
          for (b = 0; b < window_layer_size; b++) {
            neu2[a] += syn_window_hidden[c + b] * neu1[b];
          }
        }
        if (hs)
          for (d = 0; d < vocab[word].codelen; d++) {
            f = 0;
            l2 = vocab[word].point[d] * window_hidden_size;
            // Propagate hidden -> output
            for (c = 0; c < window_hidden_size; c++)
              f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
            if (f <= -MAX_EXP)
              continue;
            else if (f >= MAX_EXP)
              continue;
            else
              f = expTable[(int) ((f + MAX_EXP)
                  * (EXP_TABLE_SIZE / MAX_EXP / 2))];
            // 'g' is the gradient multiplied by the learning rate
            g = (1 - vocab[word].code[d] - f) * alpha;
            // Propagate errors output -> hidden
            for (c = 0; c < window_hidden_size; c++)
              neu2e[c] += dHardTanh(neu2[c], g) * g
                  * syn_hidden_word[c + l2];
            // Learn weights hidden -> output
            for (c = 0; c < window_hidden_size; c++)
              syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g * neu2[c];
          }
        // NEGATIVE SAMPLING
        if (negative > 0)
          for (d = 0; d < negative + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random
                  * (unsigned long long) 25214903917 + 11;
              if (word_to_group != NULL
                  && word_to_group[word] != -1) {
                target = word;
                while (target == word) {
                  target = group_to_table[word_to_group[word]
                      * table_size
                      + (next_random >> 16) % table_size];
                  next_random = next_random
                      * (unsigned long long) 25214903917 + 11;
                }
                //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
              } else {
                target = table[(next_random >> 16) % table_size];
              }
              if (target == 0)
                target = next_random % (vocab_size - 1) + 1;
              if (target == word)
                continue;
              label = 0;
            }
            l2 = target * window_hidden_size;
            f = 0;
            for (c = 0; c < window_hidden_size; c++)
              f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha / negative;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha / negative;
            else
              g = (label
                  - expTable[(int) ((f + MAX_EXP)
                      * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha / negative;
            for (c = 0; c < window_hidden_size; c++)
              neu2e[c] += dHardTanh(neu2[c], g) * g
                  * syn_hidden_word_neg[c + l2];
            for (c = 0; c < window_hidden_size; c++)
              syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
                  * g * neu2[c];
          }
        for (a = 0; a < window_hidden_size; a++)
          for (b = 0; b < window_layer_size; b++)
            neu1e[b] += neu2e[a]
                * syn_window_hidden[a * window_layer_size + b];
        for (a = 0; a < window_hidden_size; a++)
          for (b = 0; b < window_layer_size; b++)
            syn_window_hidden[a * window_layer_size + b] += neu2e[a] * neu1[b];
        // hidden -> in
        for (a = 0; a < window * 2 + 1; a++)
          if (a != window) {
            c = sentence_position - window + a;
            if (c < 0)
              continue;
            if (c >= sentence_length)
              continue;
            last_word = sen[c];
            if (last_word == -1)
              continue;
            window_offset = a * layer1_size;
            if (a > window)
              window_offset -= layer1_size;
            for (c = 0; c < layer1_size; c++)
              syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
          }
      }
    } else {
      printf("unknown type %i", type);
      exit(0);
    }
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  free(neu2);
  free(neu2e);
  pthread_exit(NULL);
}

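// Diagnostic output for structured skip-gram nets (-show-cc): for each word
// starting at vocabulary rank cc, prints the strongest collocator at every
// window position, the collocator with the highest accumulated sum, and the
// N (=10) best collocator/position pairs overall.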
void ShowCollocations() {
  long a, b, c, d, e, window_offset, target, max_target = 0, maxmax_target;
  real f, max_f, maxmax_f;
  real *target_sums, bestf[MAX_CC], worstbest;
  long besti[MAX_CC];
  int N = 10, bestp[MAX_CC];
  a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));

  for (d = cc; d < vocab_size; d++) {
    for (b = 0; b < vocab_size; b++)
      target_sums[b] = 0;
    for (b = 0; b < N; b++)
      bestf[b] = -1;
    worstbest = -1;

    maxmax_f = -1;
    maxmax_target = 0;
    for (a = window * 2 + 1; a >= 0; a--) {
      if (a != window) {
        max_f = -1;
        window_offset = a * layer1_size;
        if (a > window)
          window_offset -= layer1_size;
        for (target = 0; target < vocab_size; target++) {
          if (target == d)
            continue;
          f = 0;
          for (c = 0; c < layer1_size; c++)
            f += syn0[d * layer1_size + c]
                * syn1neg_window[target * window_layer_size + window_offset + c];
          if (f < -MAX_EXP)
            continue;
          else if (f > MAX_EXP)
            continue;
          else
            f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          if (f > max_f) {
            max_f = f;
            max_target = target;
          }
          target_sums[target] += (1 - target_sums[target]) * f;
          if (f > worstbest) {
            for (b = 0; b < N; b++) {
              if (f > bestf[b]) {
                for (e = N - 1; e > b; e--) {
                  bestf[e] = bestf[e - 1];
                  besti[e] = besti[e - 1];
                  bestp[e] = bestp[e - 1];
                }
                bestf[b] = f;
                besti[b] = target;
                bestp[b] = window - a;
                break;
              }
            }
            worstbest = bestf[N - 1];
          }
        }
        printf("%s (%.2f) ", vocab[max_target].word, max_f);
        if (max_f > maxmax_f) {
          maxmax_f = max_f;
          maxmax_target = max_target;
        }
      } else {
        printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
      }
    }
    max_f = -1;
    for (b = 0; b < vocab_size; b++) {
      if (target_sums[b] > max_f) {
        max_f = target_sums[b];
        max_target = b;
      }
    }
    printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
        vocab[max_target].word, max_f,
        vocab[maxmax_target].word, maxmax_f);
    for (b = 0; b < N && bestf[b] > -1; b++)
      printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
    printf("\n");
  }
}

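// Top-level driver: builds or loads the vocabulary, initializes the network,
// optionally prints collocations, runs the training threads, and writes either
// word vectors or K-means word classes to the output file.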
void TrainModel() {
  long a, b, c, d;
  FILE *fo;
  pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;
  if (read_vocab_file[0] != 0)
    ReadVocab();
  else
    LearnVocabFromTrainFile();
  if (save_vocab_file[0] != 0)
    SaveVocab();
  if (output_file[0] == 0)
    return;
  InitNet();
  if (cc > 0)
    ShowCollocations();
  if (negative > 0 || nce > 0)
    InitUnigramTable();
  if (negative_classes_file[0] != 0)
    InitClassUnigramTable();
  start = clock();
  for (a = 0; a < num_threads; a++)
    pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
  for (a = 0; a < num_threads; a++)
    pthread_join(pt[a], NULL);
  fo = fopen(output_file, "wb");
  if (classes == 0) {
    // Save the word vectors
    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    for (a = 0; a < vocab_size; a++) {
      fprintf(fo, "%s ", vocab[a].word);
      if (binary)
        for (b = 0; b < layer1_size; b++)
          fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else
        for (b = 0; b < layer1_size; b++)
          fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  } else {
    // Run K-means on the word vectors
    int clcn = classes, iter = 10, closeid;
    int *centcn = (int *) malloc(classes * sizeof(int));
    int *cl = (int *) calloc(vocab_size, sizeof(int));
    real closev, x;
    real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
    for (a = 0; a < vocab_size; a++)
      cl[a] = a % clcn;
    for (a = 0; a < iter; a++) {
      for (b = 0; b < clcn * layer1_size; b++)
        cent[b] = 0;
      for (b = 0; b < clcn; b++)
        centcn[b] = 1;
      for (c = 0; c < vocab_size; c++) {
        for (d = 0; d < layer1_size; d++)
          cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
        centcn[cl[c]]++;
      }
      for (b = 0; b < clcn; b++) {
        closev = 0;
        for (c = 0; c < layer1_size; c++) {
          cent[layer1_size * b + c] /= centcn[b];
          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
        }
        closev = sqrt(closev);
        for (c = 0; c < layer1_size; c++)
          cent[layer1_size * b + c] /= closev;
      }
      for (c = 0; c < vocab_size; c++) {
        closev = -10;
        closeid = 0;
        for (d = 0; d < clcn; d++) {
          x = 0;
          for (b = 0; b < layer1_size; b++)
            x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
          if (x > closev) {
            closev = x;
            closeid = d;
          }
        }
        cl[c] = closeid;
      }
    }
    // Save the K-means classes
    for (a = 0; a < vocab_size; a++)
      fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
    free(centcn);
    free(cent);
    free(cl);
  }
  fclose(fo);
  if (save_net_file[0] != 0)
    SaveNet();
}

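// Returns the position of a command-line flag in argv, or -1 if it is absent;
// exits if the flag is present but its value is missing.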
int ArgPos(char *str, int argc, char **argv) {
  int a;
  for (a = 1; a < argc; a++)
    if (!strcmp(str, argv[a])) {
      if (a == argc - 1) {
        printf("Argument missing for %s\n", str);
        exit(1);
      }
      return a;
    }
  return -1;
}

int main(int argc, char **argv) {
  int i;
  if (argc == 1) {
    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-hs <int>\n");
    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-negative-classes <file>\n");
    printf("\t\tNegative classes to sample from\n");
    printf("\t-nce <int>\n");
    printf("\t\tNumber of negative examples for NCE; default is 0, common values are 3 - 10 (0 = not used)\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
    printf("\t-classes <int>\n");
    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\t-read-net <file>\n");
    printf("\t\tThe net parameters will be read from <file>, not initialized randomly\n");
    printf("\t-save-net <file>\n");
    printf("\t\tThe net parameters will be saved to <file>\n");
    printf("\t-show-cc <int>\n");
    printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
    printf("\t-type <int>\n");
    printf("\t\tType of embeddings (0 for CBOW, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
    printf("\t-cap <int>\n");
    printf("\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
    printf("\nExamples:\n");
    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
    return 0;
  }
  output_file[0] = 0;
  save_vocab_file[0] = 0;
  read_vocab_file[0] = 0;
  save_net_file[0] = 0;
  read_net_file[0] = 0;
  negative_classes_file[0] = 0;
  if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
    layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
    strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
    strcpy(save_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
    strcpy(read_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
    strcpy(save_net_file, argv[i + 1]);
  if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
    strcpy(read_net_file, argv[i + 1]);
  if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
    debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
    binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
    cc = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
    type = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
    strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
    window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
    sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
    hs = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
    negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
    strcpy(negative_classes_file, argv[i + 1]);
  if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
    nce = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
    num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
    iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
    min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
    classes = atoi(argv[i + 1]);
  if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
    cap = atoi(argv[i + 1]);
  if (type == 0 || type == 2 || type == 4)
    alpha = 0.05;
  if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
    alpha = atof(argv[i + 1]);
  vocab = (struct vocab_word *) calloc(vocab_max_size,
      sizeof(struct vocab_word));
  vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
  expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
    expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
  }
  SaveArgs(argc, argv);
  TrainModel();
  return 0;
}