// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>   // for clock() / CLOCKS_PER_SEC used in progress reporting

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CC 100
#define MAX_CODE_LENGTH 40
27
const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
29
30typedef float real; // Precision of float numbers
31
32struct vocab_word {
33 long long cn;
34 int *point;
35 char *word, *code, codelen;
36};
37
38char train_file[MAX_STRING], output_file[MAX_STRING];
39char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
40char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
41struct vocab_word *vocab;
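// type: 0 = cbow, 1 = skip-gram, 2 = cwindow, 3 = structured skip-gram, 4 = senna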
42int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
43 num_threads = 12, min_reduce = 1;
44int *vocab_hash;
45long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
46long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
47 classes = 0;
48real alpha = 0.025, starting_alpha, sample = 1e-3;
49real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
50clock_t start;
51
52real *syn1_window, *syn1neg_window, *syn1nce_window;
53int w_offset, window_layer_size;
54
55int window_hidden_size = 500;
56real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
57 *syn_hidden_word_nce;
58
59int hs = 0, negative = 5;
60const int table_size = 1e8;
61int *table;
62
long cc = 0;

// contrastive negative sampling
66char negative_classes_file[MAX_STRING];
67int *word_to_group;
68int *group_to_table; //group_size*table_size
69int class_number;
70
71//nce
72real* noise_distribution;
73int nce = 0;
74
75//param caps
76real CAP_VALUE = 50;
77int cap = 0;
78
79void capParam(real* array, int index) {
80 if (array[index] > CAP_VALUE)
81 array[index] = CAP_VALUE;
82 else if (array[index] < -CAP_VALUE)
83 array[index] = -CAP_VALUE;
84}
85
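// hardTanh and its (approximate) derivative are used by the senna-type model (type 4)
// to clip hidden-layer activations to the range [-1, 1].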
86real hardTanh(real x) {
87 if (x >= 1) {
88 return 1;
89 } else if (x <= -1) {
90 return -1;
91 } else {
92 return x;
93 }
94}
95
96real dHardTanh(real x, real g) {
97 if (x > 1 && g > 0) {
98 return 0;
99 }
100 if (x < -1 && g < 0) {
101 return 0;
102 }
103 return 1;
104}
105
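// Builds the table used for negative sampling: each word gets a number of slots
// proportional to its count raised to the power 0.75, and the same distribution
// is stored in noise_distribution for noise-contrastive estimation.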
106void InitUnigramTable() {
107 int a, i;
108 long long train_words_pow = 0;
109 real d1, power = 0.75;
110 table = (int *) malloc(table_size * sizeof(int));
111 for (a = 0; a < vocab_size; a++)
112 train_words_pow += pow(vocab[a].cn, power);
113 i = 0;
114 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
115 for (a = 0; a < table_size; a++) {
116 table[a] = i;
117 if (a / (real) table_size > d1) {
118 i++;
119 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
120 }
121 if (i >= vocab_size)
122 i = vocab_size - 1;
123 }
124
125 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
126 for (a = 0; a < vocab_size; a++)
127 noise_distribution[a] = pow(vocab[a].cn, power)
128 / (real) train_words_pow;
129}
130
131// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
132void ReadWord(char *word, FILE *fin) {
133 int a = 0, ch;
134 while (!feof(fin)) {
135 ch = fgetc(fin);
136 if (ch == 13)
137 continue;
138 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
139 if (a > 0) {
140 if (ch == '\n')
141 ungetc(ch, fin);
142 break;
143 }
144 if (ch == '\n') {
145 strcpy(word, (char *) "</s>");
146 return;
147 } else
148 continue;
149 }
150 word[a] = ch;
151 a++;
152 if (a >= MAX_STRING - 1)
			a--;   // Truncate words that are too long
154 }
155 word[a] = 0;
156}
157
158// Returns hash value of a word
159int GetWordHash(char *word) {
160 unsigned long long a, hash = 0;
161 for (a = 0; a < strlen(word); a++)
162 hash = hash * 257 + word[a];
163 hash = hash % vocab_hash_size;
164 return hash;
165}
166
167// Returns position of a word in the vocabulary; if the word is not found, returns -1
168int SearchVocab(char *word) {
169 unsigned int hash = GetWordHash(word);
170 while (1) {
171 if (vocab_hash[hash] == -1)
172 return -1;
173 if (!strcmp(word, vocab[vocab_hash[hash]].word))
174 return vocab_hash[hash];
175 hash = (hash + 1) % vocab_hash_size;
176 }
177 return -1;
178}
179
180// Reads a word and returns its index in the vocabulary
181int ReadWordIndex(FILE *fin) {
182 char word[MAX_STRING];
183 ReadWord(word, fin);
184 if (feof(fin))
185 return -1;
186 return SearchVocab(word);
187}
188
189// Adds a word to the vocabulary
190int AddWordToVocab(char *word) {
191 unsigned int hash, length = strlen(word) + 1;
192 if (length > MAX_STRING)
193 length = MAX_STRING;
194 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
195 strcpy(vocab[vocab_size].word, word);
196 vocab[vocab_size].cn = 0;
197 vocab_size++;
198 // Reallocate memory if needed
199 if (vocab_size + 2 >= vocab_max_size) {
200 vocab_max_size += 1000;
201 vocab = (struct vocab_word *) realloc(vocab,
202 vocab_max_size * sizeof(struct vocab_word));
203 }
204 hash = GetWordHash(word);
205 while (vocab_hash[hash] != -1)
206 hash = (hash + 1) % vocab_hash_size;
207 vocab_hash[hash] = vocab_size - 1;
208 return vocab_size - 1;
209}
210
211// Used later for sorting by word counts
212int VocabCompare(const void *a, const void *b) {
213 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
214}
215
216// Sorts the vocabulary by frequency using word counts
217void SortVocab() {
218 int a, size;
219 unsigned int hash;
220 // Sort the vocabulary and keep </s> at the first position
221 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
222 for (a = 0; a < vocab_hash_size; a++)
223 vocab_hash[a] = -1;
224 size = vocab_size;
225 train_words = 0;
226 for (a = 0; a < size; a++) {
		// Words occurring less than min_count times will be discarded from the vocab
228 if ((vocab[a].cn < min_count) && (a != 0)) {
229 vocab_size--;
230 free(vocab[a].word);
231 } else {
			// Hash will be recomputed, as it is no longer valid after sorting
233 hash = GetWordHash(vocab[a].word);
234 while (vocab_hash[hash] != -1)
235 hash = (hash + 1) % vocab_hash_size;
236 vocab_hash[hash] = a;
237 train_words += vocab[a].cn;
238 }
239 }
240 vocab = (struct vocab_word *) realloc(vocab,
241 (vocab_size + 1) * sizeof(struct vocab_word));
242 // Allocate memory for the binary tree construction
243 for (a = 0; a < vocab_size; a++) {
244 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
245 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
246 }
247}
248
249// Reduces the vocabulary by removing infrequent tokens
250void ReduceVocab() {
251 int a, b = 0;
252 unsigned int hash;
253 for (a = 0; a < vocab_size; a++)
254 if (vocab[a].cn > min_reduce) {
255 vocab[b].cn = vocab[a].cn;
256 vocab[b].word = vocab[a].word;
257 b++;
258 } else
259 free(vocab[a].word);
260 vocab_size = b;
261 for (a = 0; a < vocab_hash_size; a++)
262 vocab_hash[a] = -1;
263 for (a = 0; a < vocab_size; a++) {
		// Hash will be recomputed, as it is no longer valid
265 hash = GetWordHash(vocab[a].word);
266 while (vocab_hash[hash] != -1)
267 hash = (hash + 1) % vocab_hash_size;
268 vocab_hash[hash] = a;
269 }
270 fflush(stdout);
271 min_reduce++;
272}
273
// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
276void CreateBinaryTree() {
277 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
278 char code[MAX_CODE_LENGTH];
279 long long *count = (long long *) calloc(vocab_size * 2 + 1,
280 sizeof(long long));
281 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
282 sizeof(long long));
283 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
284 sizeof(long long));
285 for (a = 0; a < vocab_size; a++)
286 count[a] = vocab[a].cn;
287 for (a = vocab_size; a < vocab_size * 2; a++)
288 count[a] = 1e15;
289 pos1 = vocab_size - 1;
290 pos2 = vocab_size;
291 // Following algorithm constructs the Huffman tree by adding one node at a time
292 for (a = 0; a < vocab_size - 1; a++) {
293 // First, find two smallest nodes 'min1, min2'
294 if (pos1 >= 0) {
295 if (count[pos1] < count[pos2]) {
296 min1i = pos1;
297 pos1--;
298 } else {
299 min1i = pos2;
300 pos2++;
301 }
302 } else {
303 min1i = pos2;
304 pos2++;
305 }
306 if (pos1 >= 0) {
307 if (count[pos1] < count[pos2]) {
308 min2i = pos1;
309 pos1--;
310 } else {
311 min2i = pos2;
312 pos2++;
313 }
314 } else {
315 min2i = pos2;
316 pos2++;
317 }
318 count[vocab_size + a] = count[min1i] + count[min2i];
319 parent_node[min1i] = vocab_size + a;
320 parent_node[min2i] = vocab_size + a;
321 binary[min2i] = 1;
322 }
323 // Now assign binary code to each vocabulary word
324 for (a = 0; a < vocab_size; a++) {
325 b = a;
326 i = 0;
327 while (1) {
328 code[i] = binary[b];
329 point[i] = b;
330 i++;
331 b = parent_node[b];
332 if (b == vocab_size * 2 - 2)
333 break;
334 }
335 vocab[a].codelen = i;
336 vocab[a].point[0] = vocab_size - 2;
337 for (b = 0; b < i; b++) {
338 vocab[a].code[i - b - 1] = code[b];
339 vocab[a].point[i - b] = point[b] - vocab_size;
340 }
341 }
342 free(count);
343 free(binary);
344 free(parent_node);
345}
346
347void LearnVocabFromTrainFile() {
348 char word[MAX_STRING];
349 FILE *fin;
350 long long a, i;
351 for (a = 0; a < vocab_hash_size; a++)
352 vocab_hash[a] = -1;
353 fin = fopen(train_file, "rb");
354 if (fin == NULL) {
355 printf("ERROR: training data file not found!\n");
356 exit(1);
357 }
358 vocab_size = 0;
359 AddWordToVocab((char *) "</s>");
360 while (1) {
361 ReadWord(word, fin);
362 if (feof(fin))
363 break;
364 train_words++;
365 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
366 printf("%lldK%c", train_words / 1000, 13);
367 fflush(stdout);
368 }
369 i = SearchVocab(word);
370 if (i == -1) {
371 a = AddWordToVocab(word);
372 vocab[a].cn = 1;
373 } else
374 vocab[i].cn++;
375 if (vocab_size > vocab_hash_size * 0.7)
376 ReduceVocab();
377 }
378 SortVocab();
379 if (debug_mode > 0) {
380 printf("Vocab size: %lld\n", vocab_size);
381 printf("Words in train file: %lld\n", train_words);
382 }
383 file_size = ftell(fin);
384 fclose(fin);
385}
386
387void SaveVocab() {
388 long long i;
389 FILE *fo = fopen(save_vocab_file, "wb");
390 for (i = 0; i < vocab_size; i++)
391 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
392 fclose(fo);
393}
394
395void ReadVocab() {
396 long long a, i = 0;
397 char c;
398 char word[MAX_STRING];
399 FILE *fin = fopen(read_vocab_file, "rb");
400 if (fin == NULL) {
401 printf("Vocabulary file not found\n");
402 exit(1);
403 }
404 for (a = 0; a < vocab_hash_size; a++)
405 vocab_hash[a] = -1;
406 vocab_size = 0;
407 while (1) {
408 ReadWord(word, fin);
409 if (feof(fin))
410 break;
411 a = AddWordToVocab(word);
412 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
413 i++;
414 }
415 SortVocab();
416 if (debug_mode > 0) {
417 printf("Vocab size: %lld\n", vocab_size);
418 printf("Words in train file: %lld\n", train_words);
419 }
420 fin = fopen(train_file, "rb");
421 if (fin == NULL) {
422 printf("ERROR: training data file not found!\n");
423 exit(1);
424 }
425 fseek(fin, 0, SEEK_END);
426 file_size = ftell(fin);
427 fclose(fin);
428}
429
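// Builds one sampling table per word class listed in -negative-classes; within
// each class, slots are again distributed proportionally to count^0.75.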
430void InitClassUnigramTable() {
431 long long a, c;
432 printf("loading class unigrams \n");
433 FILE *fin = fopen(negative_classes_file, "rb");
434 if (fin == NULL) {
435 printf("ERROR: class file not found!\n");
436 exit(1);
437 }
438 word_to_group = (int *) malloc(vocab_size * sizeof(int));
439 for (a = 0; a < vocab_size; a++)
440 word_to_group[a] = -1;
441 char class[MAX_STRING];
442 char prev_class[MAX_STRING];
443 prev_class[0] = 0;
444 char word[MAX_STRING];
445 class_number = -1;
446 while (1) {
447 if (feof(fin))
448 break;
449 ReadWord(class, fin);
450 ReadWord(word, fin);
451 int word_index = SearchVocab(word);
452 if (word_index != -1) {
453 if (strcmp(class, prev_class) != 0) {
454 class_number++;
455 strcpy(prev_class, class);
456 }
457 word_to_group[word_index] = class_number;
458 }
459 ReadWord(word, fin);
460 }
461 class_number++;
462 fclose(fin);
463
464 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
465 long long train_words_pow = 0;
466 real d1, power = 0.75;
467
468 for (c = 0; c < class_number; c++) {
469 long long offset = c * table_size;
470 train_words_pow = 0;
471 for (a = 0; a < vocab_size; a++)
472 if (word_to_group[a] == c)
473 train_words_pow += pow(vocab[a].cn, power);
474 int i = 0;
		while (i < vocab_size && word_to_group[i] != c)
			i++;
477 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
478 for (a = 0; a < table_size; a++) {
479 //printf("index %lld , word %d\n", a, i);
480 group_to_table[offset + a] = i;
481 if (a / (real) table_size > d1) {
482 i++;
				while (i < vocab_size && word_to_group[i] != c)
					i++;
485 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
486 }
			if (i >= vocab_size) {
				i = vocab_size - 1;
				while (i >= 0 && word_to_group[i] != c)
					i--;
			}
490 }
491 }
492}
493
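// Writes the command line used for this run to "<output_file>.args" so that
// training runs can be reproduced later.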
void SaveArgs(int argc, char **argv) {
495 unsigned int i;
496 size_t len = 0;
497 char *_all_args, *all_args;
	// allocate room for output_file plus the ".args" suffix (strdup alone would be too small for strcat)
	char *args_file = (char *) malloc(strlen(output_file) + strlen(".args") + 1);
	strcpy(args_file, output_file);
	strcat(args_file, ".args");
500 FILE *fargs = fopen(args_file, "w");
501 if (fargs == NULL) {
502 printf("Cannot save args to %s.\n", args_file);
503 return;
504 }
505
506 for(i=1; i<argc; i++) {
507 len += strlen(argv[i]);
508 }
509
510 _all_args = all_args = (char *)malloc(len+argc-1);
511
512 for(i=1; i<argc; i++) {
513 memcpy(_all_args, argv[i], strlen(argv[i]));
514 _all_args += strlen(argv[i])+1;
515 *(_all_args-1) = ' ';
516 }
517 *(_all_args-1) = 0;
518
519 fprintf(fargs, "%s\n", all_args);
520 fclose(fargs);
521
522 free(all_args);
523
524 return;
525}
526
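// Dumps the input vectors (syn0) and the per-position output vectors
// (syn1neg_window) as raw binary; only type 3 with negative sampling is supported.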
void SaveNet() {
	if (type != 3 || negative <= 0) {
		fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
		return;
	}

	FILE *fnet = fopen(save_net_file, "wb");
	if (fnet == NULL) {
		printf("Cannot open net parameter file for writing\n");
		exit(1);
	}
	fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
	fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
	fclose(fnet);
}
542
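// Allocates the network parameters. Output-side weights start at zero; the input
// vectors (syn0) are initialized randomly unless they are loaded via -read-net.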
543void InitNet() {
544 long long a, b;
545 unsigned long long next_random = 1;
	long long read;

	window_layer_size = layer1_size * window * 2;
549 a = posix_memalign((void **) &syn0, 128,
550 (long long) vocab_size * layer1_size * sizeof(real));
551 if (syn0 == NULL) {
552 printf("Memory allocation failed\n");
553 exit(1);
554 }
555
556 if (hs) {
557 a = posix_memalign((void **) &syn1, 128,
558 (long long) vocab_size * layer1_size * sizeof(real));
559 if (syn1 == NULL) {
560 printf("Memory allocation failed\n");
561 exit(1);
562 }
563 a = posix_memalign((void **) &syn1_window, 128,
564 (long long) vocab_size * window_layer_size * sizeof(real));
565 if (syn1_window == NULL) {
566 printf("Memory allocation failed\n");
567 exit(1);
568 }
569 a = posix_memalign((void **) &syn_hidden_word, 128,
570 (long long) vocab_size * window_hidden_size * sizeof(real));
571 if (syn_hidden_word == NULL) {
572 printf("Memory allocation failed\n");
573 exit(1);
574 }
575
576 for (a = 0; a < vocab_size; a++)
577 for (b = 0; b < layer1_size; b++)
578 syn1[a * layer1_size + b] = 0;
579 for (a = 0; a < vocab_size; a++)
580 for (b = 0; b < window_layer_size; b++)
581 syn1_window[a * window_layer_size + b] = 0;
582 for (a = 0; a < vocab_size; a++)
583 for (b = 0; b < window_hidden_size; b++)
584 syn_hidden_word[a * window_hidden_size + b] = 0;
585 }
586 if (negative > 0) {
		if (type == 0) {
588 a = posix_memalign((void **) &syn1neg, 128,
589 (long long) vocab_size * layer1_size * sizeof(real));
590 if (syn1neg == NULL) {
591 printf("Memory allocation failed\n");
592 exit(1);
593 }
594 for (a = 0; a < vocab_size; a++)
595 for (b = 0; b < layer1_size; b++)
596 syn1neg[a * layer1_size + b] = 0;
597 } else if (type == 3) {
598 a = posix_memalign((void **) &syn1neg_window, 128,
599 (long long) vocab_size * window_layer_size * sizeof(real));
600 if (syn1neg_window == NULL) {
601 printf("Memory allocation failed\n");
602 exit(1);
603 }
604 for (a = 0; a < vocab_size; a++)
605 for (b = 0; b < window_layer_size; b++)
606 syn1neg_window[a * window_layer_size + b] = 0;
607 } else if (type == 4) {
608 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
609 (long long) vocab_size * window_hidden_size * sizeof(real));
610 if (syn_hidden_word_neg == NULL) {
611 printf("Memory allocation failed\n");
612 exit(1);
613 }
614 for (a = 0; a < vocab_size; a++)
615 for (b = 0; b < window_hidden_size; b++)
616 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
		}
	}
619 if (nce > 0) {
620 a = posix_memalign((void **) &syn1nce, 128,
621 (long long) vocab_size * layer1_size * sizeof(real));
622 if (syn1nce == NULL) {
623 printf("Memory allocation failed\n");
624 exit(1);
625 }
626 a = posix_memalign((void **) &syn1nce_window, 128,
627 (long long) vocab_size * window_layer_size * sizeof(real));
628 if (syn1nce_window == NULL) {
629 printf("Memory allocation failed\n");
630 exit(1);
631 }
632 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
633 (long long) vocab_size * window_hidden_size * sizeof(real));
634 if (syn_hidden_word_nce == NULL) {
635 printf("Memory allocation failed\n");
636 exit(1);
637 }
638
639 for (a = 0; a < vocab_size; a++)
640 for (b = 0; b < layer1_size; b++)
641 syn1nce[a * layer1_size + b] = 0;
642 for (a = 0; a < vocab_size; a++)
643 for (b = 0; b < window_layer_size; b++)
644 syn1nce_window[a * window_layer_size + b] = 0;
645 for (a = 0; a < vocab_size; a++)
646 for (b = 0; b < window_hidden_size; b++)
647 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
648 }

	if (type == 4) {
		a = posix_memalign((void **) &syn_window_hidden, 128,
652 window_hidden_size * window_layer_size * sizeof(real));
653 if (syn_window_hidden == NULL) {
654 printf("Memory allocation failed\n");
655 exit(1);
656 }
657 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
658 next_random = next_random * (unsigned long long) 25214903917 + 11;
659 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
660 - 0.5) / (window_hidden_size * window_layer_size);
661 }
662 }

	if (read_net_file[0] == 0) {
665 for (a = 0; a < vocab_size; a++)
666 for (b = 0; b < layer1_size; b++) {
667 next_random = next_random * (unsigned long long) 25214903917
668 + 11;
669 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
670 / (real) 65536) - 0.5) / layer1_size;
671 }
	} else if (type == 3 && negative > 0) {
		FILE *fnet = fopen(read_net_file, "rb");
674 if (fnet == NULL) {
675 printf("Net parameter file not found\n");
676 exit(1);
677 }
		printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
679 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
680 if(read != vocab_size * layer1_size) {
681 fprintf(stderr, "read-net failed %lld\n", read);
682 exit(-1);
683 }
684 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
685 if(read != (long long) vocab_size * window_layer_size) {
			fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read,
					(long long) vocab_size * window_layer_size);
688 exit(-1);
689 }
690 fgetc(fnet);
691 if(!feof(fnet)) {
692 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
693 exit(-1);
694 }
		fclose(fnet);
	} else {
		fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
		exit(-1);
	}
700
701 CreateBinaryTree();
702}
703
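// Worker thread: each thread reads its own slice of the training file and updates
// the shared parameter matrices asynchronously, without any locking.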
704void *TrainModelThread(void *id) {
705 long long a, b, d, cw, word, last_word, sentence_length = 0,
706 sentence_position = 0;
707 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
708 long long l1, l2, c, target, label, local_iter = iter;
709 unsigned long long next_random = (long long) id;
710 real f, g;
711 clock_t now;
712 int input_len_1 = layer1_size;
713 int window_offset = -1;
714 if (type == 2 || type == 4) {
715 input_len_1 = window_layer_size;
716 }
717 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
718 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
719
720 int input_len_2 = 0;
721 if (type == 4) {
722 input_len_2 = window_hidden_size;
723 }
724 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
725 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
726
727 FILE *fi = fopen(train_file, "rb");
728 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
729 while (1) {
730 if (word_count - last_word_count > 10000) {
731 word_count_actual += word_count - last_word_count;
732 last_word_count = word_count;
733 if ((debug_mode > 1)) {
734 now = clock();
735 printf(
736 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
737 13, alpha,
738 word_count_actual / (real) (iter * train_words + 1)
739 * 100,
740 word_count_actual
741 / ((real) (now - start + 1)
742 / (real) CLOCKS_PER_SEC * 1000));
743 fflush(stdout);
744 }
745 alpha = starting_alpha
746 * (1 - word_count_actual / (real) (iter * train_words + 1));
747 if (alpha < starting_alpha * 0.0001)
748 alpha = starting_alpha * 0.0001;
749 }
750 if (sentence_length == 0) {
751 while (1) {
752 word = ReadWordIndex(fi);
753 if (feof(fi))
754 break;
755 if (word == -1)
756 continue;
757 word_count++;
758 if (word == 0)
759 break;
760 // The subsampling randomly discards frequent words while keeping the ranking same
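				// Keep probability: (sqrt(cn / (sample * train_words)) + 1) * (sample * train_words) / cn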
761 if (sample > 0) {
762 real ran = (sqrt(vocab[word].cn / (sample * train_words))
763 + 1) * (sample * train_words) / vocab[word].cn;
764 next_random = next_random * (unsigned long long) 25214903917
765 + 11;
					if (ran < (next_random & 0xFFFF) / (real) 65536) {
						if (type == 3) // in structured skip-grams
							word = -2; // keep the window position correct
						else
							continue;
					}
				}
773 sen[sentence_length] = word;
774 sentence_length++;
775 if (sentence_length >= MAX_SENTENCE_LENGTH)
776 break;
777 }
778 sentence_position = 0;
779 }
780 if (feof(fi) || (word_count > train_words / num_threads)) {
781 word_count_actual += word_count - last_word_count;
782 local_iter--;
783 if (local_iter == 0)
784 break;
785 word_count = 0;
786 last_word_count = 0;
787 sentence_length = 0;
788 fseek(fi, file_size / (long long) num_threads * (long long) id,
789 SEEK_SET);
790 continue;
791 }
792 word = sen[sentence_position];
		while (word == -2 && sentence_position < sentence_length)
			word = sen[++sentence_position];
		if (sentence_position >= sentence_length) {
			sentence_length = 0;
			continue;
		}
		if (word < 0)
			continue;
801 for (c = 0; c < input_len_1; c++)
802 neu1[c] = 0;
803 for (c = 0; c < input_len_1; c++)
804 neu1e[c] = 0;
805 for (c = 0; c < input_len_2; c++)
806 neu2[c] = 0;
807 for (c = 0; c < input_len_2; c++)
808 neu2e[c] = 0;
809 next_random = next_random * (unsigned long long) 25214903917 + 11;
810 b = next_random % window;
811 if (type == 0) { //train the cbow architecture
812 // in -> hidden
813 cw = 0;
814 for (a = b; a < window * 2 + 1 - b; a++)
815 if (a != window) {
816 c = sentence_position - window + a;
817 if (c < 0)
818 continue;
819 if (c >= sentence_length)
820 continue;
821 last_word = sen[c];
822 if (last_word == -1)
823 continue;
824 for (c = 0; c < layer1_size; c++)
825 neu1[c] += syn0[c + last_word * layer1_size];
826 cw++;
827 }
828 if (cw) {
829 for (c = 0; c < layer1_size; c++)
830 neu1[c] /= cw;
831 if (hs)
832 for (d = 0; d < vocab[word].codelen; d++) {
833 f = 0;
834 l2 = vocab[word].point[d] * layer1_size;
835 // Propagate hidden -> output
836 for (c = 0; c < layer1_size; c++)
837 f += neu1[c] * syn1[c + l2];
838 if (f <= -MAX_EXP)
839 continue;
840 else if (f >= MAX_EXP)
841 continue;
842 else
843 f = expTable[(int) ((f + MAX_EXP)
844 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
845 // 'g' is the gradient multiplied by the learning rate
846 g = (1 - vocab[word].code[d] - f) * alpha;
847 // Propagate errors output -> hidden
848 for (c = 0; c < layer1_size; c++)
849 neu1e[c] += g * syn1[c + l2];
850 // Learn weights hidden -> output
851 for (c = 0; c < layer1_size; c++)
852 syn1[c + l2] += g * neu1[c];
853 if (cap == 1)
854 for (c = 0; c < layer1_size; c++)
855 capParam(syn1, c + l2);
856 }
857 // NEGATIVE SAMPLING
858 if (negative > 0)
859 for (d = 0; d < negative + 1; d++) {
860 if (d == 0) {
861 target = word;
862 label = 1;
863 } else {
864 next_random = next_random
865 * (unsigned long long) 25214903917 + 11;
866 if (word_to_group != NULL
867 && word_to_group[word] != -1) {
868 target = word;
869 while (target == word) {
870 target = group_to_table[word_to_group[word]
871 * table_size
872 + (next_random >> 16) % table_size];
873 next_random = next_random
874 * (unsigned long long) 25214903917
875 + 11;
876 }
877 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
878 } else {
879 target =
880 table[(next_random >> 16) % table_size];
881 }
882 if (target == 0)
883 target = next_random % (vocab_size - 1) + 1;
884 if (target == word)
885 continue;
886 label = 0;
887 }
888 l2 = target * layer1_size;
889 f = 0;
890 for (c = 0; c < layer1_size; c++)
891 f += neu1[c] * syn1neg[c + l2];
892 if (f > MAX_EXP)
893 g = (label - 1) * alpha;
894 else if (f < -MAX_EXP)
895 g = (label - 0) * alpha;
896 else
897 g = (label
898 - expTable[(int) ((f + MAX_EXP)
899 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
900 * alpha;
901 for (c = 0; c < layer1_size; c++)
902 neu1e[c] += g * syn1neg[c + l2];
903 for (c = 0; c < layer1_size; c++)
904 syn1neg[c + l2] += g * neu1[c];
905 if (cap == 1)
906 for (c = 0; c < layer1_size; c++)
907 capParam(syn1neg, c + l2);
908 }
909 // Noise Contrastive Estimation
910 if (nce > 0)
911 for (d = 0; d < nce + 1; d++) {
912 if (d == 0) {
913 target = word;
914 label = 1;
915 } else {
916 next_random = next_random
917 * (unsigned long long) 25214903917 + 11;
918 if (word_to_group != NULL
919 && word_to_group[word] != -1) {
920 target = word;
921 while (target == word) {
922 target = group_to_table[word_to_group[word]
923 * table_size
924 + (next_random >> 16) % table_size];
925 next_random = next_random
926 * (unsigned long long) 25214903917
927 + 11;
928 }
929 } else {
930 target =
931 table[(next_random >> 16) % table_size];
932 }
933 if (target == 0)
934 target = next_random % (vocab_size - 1) + 1;
935 if (target == word)
936 continue;
937 label = 0;
938 }
939 l2 = target * layer1_size;
940 f = 0;
941
942 for (c = 0; c < layer1_size; c++)
943 f += neu1[c] * syn1nce[c + l2];
944 if (f > MAX_EXP)
945 g = (label - 1) * alpha;
946 else if (f < -MAX_EXP)
947 g = (label - 0) * alpha;
948 else {
949 f = exp(f);
950 g =
951 (label
952 - f
953 / (noise_distribution[target]
954 * nce + f)) * alpha;
955 }
956 for (c = 0; c < layer1_size; c++)
957 neu1e[c] += g * syn1nce[c + l2];
958 for (c = 0; c < layer1_size; c++)
959 syn1nce[c + l2] += g * neu1[c];
960 if (cap == 1)
961 for (c = 0; c < layer1_size; c++)
962 capParam(syn1nce, c + l2);
963 }
964 // hidden -> in
965 for (a = b; a < window * 2 + 1 - b; a++)
966 if (a != window) {
967 c = sentence_position - window + a;
968 if (c < 0)
969 continue;
970 if (c >= sentence_length)
971 continue;
972 last_word = sen[c];
973 if (last_word == -1)
974 continue;
975 for (c = 0; c < layer1_size; c++)
976 syn0[c + last_word * layer1_size] += neu1e[c];
977 }
978 }
979 } else if (type == 1) { //train skip-gram
980 for (a = b; a < window * 2 + 1 - b; a++)
981 if (a != window) {
982 c = sentence_position - window + a;
983 if (c < 0)
984 continue;
985 if (c >= sentence_length)
986 continue;
987 last_word = sen[c];
988 if (last_word == -1)
989 continue;
990 l1 = last_word * layer1_size;
991 for (c = 0; c < layer1_size; c++)
992 neu1e[c] = 0;
993 // HIERARCHICAL SOFTMAX
994 if (hs)
995 for (d = 0; d < vocab[word].codelen; d++) {
996 f = 0;
997 l2 = vocab[word].point[d] * layer1_size;
998 // Propagate hidden -> output
999 for (c = 0; c < layer1_size; c++)
1000 f += syn0[c + l1] * syn1[c + l2];
1001 if (f <= -MAX_EXP)
1002 continue;
1003 else if (f >= MAX_EXP)
1004 continue;
1005 else
1006 f = expTable[(int) ((f + MAX_EXP)
1007 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1008 // 'g' is the gradient multiplied by the learning rate
1009 g = (1 - vocab[word].code[d] - f) * alpha;
1010 // Propagate errors output -> hidden
1011 for (c = 0; c < layer1_size; c++)
1012 neu1e[c] += g * syn1[c + l2];
1013 // Learn weights hidden -> output
1014 for (c = 0; c < layer1_size; c++)
1015 syn1[c + l2] += g * syn0[c + l1];
1016 if (cap == 1)
1017 for (c = 0; c < layer1_size; c++)
1018 capParam(syn1, c + l2);
1019 }
1020 // NEGATIVE SAMPLING
1021 if (negative > 0)
1022 for (d = 0; d < negative + 1; d++) {
1023 if (d == 0) {
1024 target = word;
1025 label = 1;
1026 } else {
1027 next_random = next_random
1028 * (unsigned long long) 25214903917 + 11;
1029 if (word_to_group != NULL
1030 && word_to_group[word] != -1) {
1031 target = word;
1032 while (target == word) {
1033 target =
1034 group_to_table[word_to_group[word]
1035 * table_size
1036 + (next_random >> 16)
1037 % table_size];
1038 next_random =
1039 next_random
1040 * (unsigned long long) 25214903917
1041 + 11;
1042 }
1043 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1044 } else {
1045 target = table[(next_random >> 16)
1046 % table_size];
1047 }
1048 if (target == 0)
1049 target = next_random % (vocab_size - 1) + 1;
1050 if (target == word)
1051 continue;
1052 label = 0;
1053 }
1054 l2 = target * layer1_size;
1055 f = 0;
1056 for (c = 0; c < layer1_size; c++)
1057 f += syn0[c + l1] * syn1neg[c + l2];
1058 if (f > MAX_EXP)
1059 g = (label - 1) * alpha;
1060 else if (f < -MAX_EXP)
1061 g = (label - 0) * alpha;
1062 else
1063 g =
1064 (label
1065 - expTable[(int) ((f + MAX_EXP)
1066 * (EXP_TABLE_SIZE
1067 / MAX_EXP / 2))])
1068 * alpha;
1069 for (c = 0; c < layer1_size; c++)
1070 neu1e[c] += g * syn1neg[c + l2];
1071 for (c = 0; c < layer1_size; c++)
1072 syn1neg[c + l2] += g * syn0[c + l1];
1073 if (cap == 1)
1074 for (c = 0; c < layer1_size; c++)
1075 capParam(syn1neg, c + l2);
1076 }
1077 //Noise Contrastive Estimation
1078 if (nce > 0)
1079 for (d = 0; d < nce + 1; d++) {
1080 if (d == 0) {
1081 target = word;
1082 label = 1;
1083 } else {
1084 next_random = next_random
1085 * (unsigned long long) 25214903917 + 11;
1086 if (word_to_group != NULL
1087 && word_to_group[word] != -1) {
1088 target = word;
1089 while (target == word) {
1090 target =
1091 group_to_table[word_to_group[word]
1092 * table_size
1093 + (next_random >> 16)
1094 % table_size];
1095 next_random =
1096 next_random
1097 * (unsigned long long) 25214903917
1098 + 11;
1099 }
1100 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1101 } else {
1102 target = table[(next_random >> 16)
1103 % table_size];
1104 }
1105 if (target == 0)
1106 target = next_random % (vocab_size - 1) + 1;
1107 if (target == word)
1108 continue;
1109 label = 0;
1110 }
1111 l2 = target * layer1_size;
1112 f = 0;
1113 for (c = 0; c < layer1_size; c++)
1114 f += syn0[c + l1] * syn1nce[c + l2];
1115 if (f > MAX_EXP)
1116 g = (label - 1) * alpha;
1117 else if (f < -MAX_EXP)
1118 g = (label - 0) * alpha;
1119 else {
1120 f = exp(f);
1121 g = (label
1122 - f
1123 / (noise_distribution[target]
1124 * nce + f)) * alpha;
1125 }
1126 for (c = 0; c < layer1_size; c++)
1127 neu1e[c] += g * syn1nce[c + l2];
1128 for (c = 0; c < layer1_size; c++)
1129 syn1nce[c + l2] += g * syn0[c + l1];
1130 if (cap == 1)
1131 for (c = 0; c < layer1_size; c++)
1132 capParam(syn1nce, c + l2);
1133 }
1134 // Learn weights input -> hidden
1135 for (c = 0; c < layer1_size; c++)
1136 syn0[c + l1] += neu1e[c];
1137 }
1138 } else if (type == 2) { //train the cwindow architecture
1139 // in -> hidden
1140 cw = 0;
1141 for (a = 0; a < window * 2 + 1; a++)
1142 if (a != window) {
1143 c = sentence_position - window + a;
1144 if (c < 0)
1145 continue;
1146 if (c >= sentence_length)
1147 continue;
1148 last_word = sen[c];
1149 if (last_word == -1)
1150 continue;
1151 window_offset = a * layer1_size;
1152 if (a > window)
1153 window_offset -= layer1_size;
1154 for (c = 0; c < layer1_size; c++)
1155 neu1[c + window_offset] += syn0[c
1156 + last_word * layer1_size];
1157 cw++;
1158 }
1159 if (cw) {
1160 if (hs)
1161 for (d = 0; d < vocab[word].codelen; d++) {
1162 f = 0;
1163 l2 = vocab[word].point[d] * window_layer_size;
1164 // Propagate hidden -> output
1165 for (c = 0; c < window_layer_size; c++)
1166 f += neu1[c] * syn1_window[c + l2];
1167 if (f <= -MAX_EXP)
1168 continue;
1169 else if (f >= MAX_EXP)
1170 continue;
1171 else
1172 f = expTable[(int) ((f + MAX_EXP)
1173 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1174 // 'g' is the gradient multiplied by the learning rate
1175 g = (1 - vocab[word].code[d] - f) * alpha;
1176 // Propagate errors output -> hidden
1177 for (c = 0; c < window_layer_size; c++)
1178 neu1e[c] += g * syn1_window[c + l2];
1179 // Learn weights hidden -> output
1180 for (c = 0; c < window_layer_size; c++)
1181 syn1_window[c + l2] += g * neu1[c];
1182 if (cap == 1)
1183 for (c = 0; c < window_layer_size; c++)
1184 capParam(syn1_window, c + l2);
1185 }
1186 // NEGATIVE SAMPLING
1187 if (negative > 0)
1188 for (d = 0; d < negative + 1; d++) {
1189 if (d == 0) {
1190 target = word;
1191 label = 1;
1192 } else {
1193 next_random = next_random
1194 * (unsigned long long) 25214903917 + 11;
1195 if (word_to_group != NULL
1196 && word_to_group[word] != -1) {
1197 target = word;
1198 while (target == word) {
1199 target = group_to_table[word_to_group[word]
1200 * table_size
1201 + (next_random >> 16) % table_size];
1202 next_random = next_random
1203 * (unsigned long long) 25214903917
1204 + 11;
1205 }
1206 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1207 } else {
1208 target =
1209 table[(next_random >> 16) % table_size];
1210 }
1211 if (target == 0)
1212 target = next_random % (vocab_size - 1) + 1;
1213 if (target == word)
1214 continue;
1215 label = 0;
1216 }
1217 l2 = target * window_layer_size;
1218 f = 0;
1219 for (c = 0; c < window_layer_size; c++)
1220 f += neu1[c] * syn1neg_window[c + l2];
1221 if (f > MAX_EXP)
1222 g = (label - 1) * alpha;
1223 else if (f < -MAX_EXP)
1224 g = (label - 0) * alpha;
1225 else
1226 g = (label
1227 - expTable[(int) ((f + MAX_EXP)
1228 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1229 * alpha;
1230 for (c = 0; c < window_layer_size; c++)
1231 neu1e[c] += g * syn1neg_window[c + l2];
1232 for (c = 0; c < window_layer_size; c++)
1233 syn1neg_window[c + l2] += g * neu1[c];
1234 if (cap == 1)
1235 for (c = 0; c < window_layer_size; c++)
1236 capParam(syn1neg_window, c + l2);
1237 }
1238 // Noise Contrastive Estimation
1239 if (nce > 0)
1240 for (d = 0; d < nce + 1; d++) {
1241 if (d == 0) {
1242 target = word;
1243 label = 1;
1244 } else {
1245 next_random = next_random
1246 * (unsigned long long) 25214903917 + 11;
1247 if (word_to_group != NULL
1248 && word_to_group[word] != -1) {
1249 target = word;
1250 while (target == word) {
1251 target = group_to_table[word_to_group[word]
1252 * table_size
1253 + (next_random >> 16) % table_size];
1254 next_random = next_random
1255 * (unsigned long long) 25214903917
1256 + 11;
1257 }
1258 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1259 } else {
1260 target =
1261 table[(next_random >> 16) % table_size];
1262 }
1263 if (target == 0)
1264 target = next_random % (vocab_size - 1) + 1;
1265 if (target == word)
1266 continue;
1267 label = 0;
1268 }
1269 l2 = target * window_layer_size;
1270 f = 0;
1271 for (c = 0; c < window_layer_size; c++)
1272 f += neu1[c] * syn1nce_window[c + l2];
1273 if (f > MAX_EXP)
1274 g = (label - 1) * alpha;
1275 else if (f < -MAX_EXP)
1276 g = (label - 0) * alpha;
1277 else {
1278 f = exp(f);
1279 g =
1280 (label
1281 - f
1282 / (noise_distribution[target]
1283 * nce + f)) * alpha;
1284 }
1285 for (c = 0; c < window_layer_size; c++)
1286 neu1e[c] += g * syn1nce_window[c + l2];
1287 for (c = 0; c < window_layer_size; c++)
1288 syn1nce_window[c + l2] += g * neu1[c];
1289 if (cap == 1)
1290 for (c = 0; c < window_layer_size; c++)
1291 capParam(syn1nce_window, c + l2);
1292 }
1293 // hidden -> in
1294 for (a = 0; a < window * 2 + 1; a++)
1295 if (a != window) {
1296 c = sentence_position - window + a;
1297 if (c < 0)
1298 continue;
1299 if (c >= sentence_length)
1300 continue;
1301 last_word = sen[c];
1302 if (last_word == -1)
1303 continue;
1304 window_offset = a * layer1_size;
1305 if (a > window)
1306 window_offset -= layer1_size;
1307 for (c = 0; c < layer1_size; c++)
1308 syn0[c + last_word * layer1_size] += neu1e[c
1309 + window_offset];
1310 }
1311 }
1312 } else if (type == 3) { //train structured skip-gram
1313 for (a = 0; a < window * 2 + 1; a++)
1314 if (a != window) {
1315 c = sentence_position - window + a;
1316 if (c < 0)
1317 continue;
1318 if (c >= sentence_length)
1319 continue;
1320 last_word = sen[c];
					if (last_word < 0)
						continue;
1323 l1 = last_word * layer1_size;
1324 window_offset = a * layer1_size;
1325 if (a > window)
1326 window_offset -= layer1_size;
1327 for (c = 0; c < layer1_size; c++)
1328 neu1e[c] = 0;
1329 // HIERARCHICAL SOFTMAX
1330 if (hs)
1331 for (d = 0; d < vocab[word].codelen; d++) {
1332 f = 0;
1333 l2 = vocab[word].point[d] * window_layer_size;
1334 // Propagate hidden -> output
1335 for (c = 0; c < layer1_size; c++)
1336 f += syn0[c + l1]
1337 * syn1_window[c + l2 + window_offset];
1338 if (f <= -MAX_EXP)
1339 continue;
1340 else if (f >= MAX_EXP)
1341 continue;
1342 else
1343 f = expTable[(int) ((f + MAX_EXP)
1344 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1345 // 'g' is the gradient multiplied by the learning rate
1346 g = (1 - vocab[word].code[d] - f) * alpha;
1347 // Propagate errors output -> hidden
1348 for (c = 0; c < layer1_size; c++)
1349 neu1e[c] += g
1350 * syn1_window[c + l2 + window_offset];
1351 // Learn weights hidden -> output
1352 for (c = 0; c < layer1_size; c++)
1353 syn1[c + l2 + window_offset] += g
1354 * syn0[c + l1];
1355 if (cap == 1)
1356 for (c = 0; c < layer1_size; c++)
1357 capParam(syn1, c + l2 + window_offset);
1358 }
1359 // NEGATIVE SAMPLING
1360 if (negative > 0)
1361 for (d = 0; d < negative + 1; d++) {
1362 if (d == 0) {
1363 target = word;
1364 label = 1;
1365 } else {
1366 next_random = next_random
1367 * (unsigned long long) 25214903917 + 11;
1368 if (word_to_group != NULL
1369 && word_to_group[word] != -1) {
1370 target = word;
1371 while (target == word) {
1372 target =
1373 group_to_table[word_to_group[word]
1374 * table_size
1375 + (next_random >> 16)
1376 % table_size];
1377 next_random =
1378 next_random
1379 * (unsigned long long) 25214903917
1380 + 11;
1381 }
1382 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1383 } else {
1384 target = table[(next_random >> 16)
1385 % table_size];
1386 }
1387 if (target == 0)
1388 target = next_random % (vocab_size - 1) + 1;
1389 if (target == word)
1390 continue;
1391 label = 0;
1392 }
1393 l2 = target * window_layer_size;
1394 f = 0;
1395 for (c = 0; c < layer1_size; c++)
1396 f +=
1397 syn0[c + l1]
1398 * syn1neg_window[c + l2
1399 + window_offset];
1400 if (f > MAX_EXP)
1401 g = (label - 1) * alpha;
1402 else if (f < -MAX_EXP)
1403 g = (label - 0) * alpha;
1404 else
1405 g =
1406 (label
1407 - expTable[(int) ((f + MAX_EXP)
1408 * (EXP_TABLE_SIZE
1409 / MAX_EXP / 2))])
1410 * alpha;
							if (debug_mode > 2 && ((long long) id) == 0) {
								printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
								printf("label %lld, a %lld, gain %.4f\n", label, a - window, g);
							}
							for (c = 0; c < layer1_size; c++)
1416 neu1e[c] +=
1417 g
1418 * syn1neg_window[c + l2
1419 + window_offset];
1420 for (c = 0; c < layer1_size; c++)
1421 syn1neg_window[c + l2 + window_offset] += g
1422 * syn0[c + l1];
1423 if (cap == 1)
1424 for (c = 0; c < layer1_size; c++)
1425 capParam(syn1neg_window,
1426 c + l2 + window_offset);
1427 }
					// Noise Contrastive Estimation
1429 if (nce > 0)
1430 for (d = 0; d < nce + 1; d++) {
1431 if (d == 0) {
1432 target = word;
1433 label = 1;
1434 } else {
1435 next_random = next_random
1436 * (unsigned long long) 25214903917 + 11;
1437 if (word_to_group != NULL
1438 && word_to_group[word] != -1) {
1439 target = word;
1440 while (target == word) {
1441 target =
1442 group_to_table[word_to_group[word]
1443 * table_size
1444 + (next_random >> 16)
1445 % table_size];
1446 next_random =
1447 next_random
1448 * (unsigned long long) 25214903917
1449 + 11;
1450 }
1451 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1452 } else {
1453 target = table[(next_random >> 16)
1454 % table_size];
1455 }
1456 if (target == 0)
1457 target = next_random % (vocab_size - 1) + 1;
1458 if (target == word)
1459 continue;
1460 label = 0;
1461 }
1462 l2 = target * window_layer_size;
1463 f = 0;
1464 for (c = 0; c < layer1_size; c++)
1465 f +=
1466 syn0[c + l1]
1467 * syn1nce_window[c + l2
1468 + window_offset];
1469 if (f > MAX_EXP)
1470 g = (label - 1) * alpha;
1471 else if (f < -MAX_EXP)
1472 g = (label - 0) * alpha;
1473 else {
1474 f = exp(f);
1475 g = (label
1476 - f
1477 / (noise_distribution[target]
1478 * nce + f)) * alpha;
1479 }
1480 for (c = 0; c < layer1_size; c++)
1481 neu1e[c] +=
1482 g
1483 * syn1nce_window[c + l2
1484 + window_offset];
1485 for (c = 0; c < layer1_size; c++)
1486 syn1nce_window[c + l2 + window_offset] += g
1487 * syn0[c + l1];
1488 if (cap == 1)
1489 for (c = 0; c < layer1_size; c++)
1490 capParam(syn1nce_window,
1491 c + l2 + window_offset);
1492 }
1493 // Learn weights input -> hidden
1494 for (c = 0; c < layer1_size; c++) {
1495 syn0[c + l1] += neu1e[c];
1496 if (syn0[c + l1] > 50)
1497 syn0[c + l1] = 50;
1498 if (syn0[c + l1] < -50)
1499 syn0[c + l1] = -50;
1500 }
1501 }
1502 } else if (type == 4) { //training senna
1503 // in -> hidden
1504 cw = 0;
1505 for (a = 0; a < window * 2 + 1; a++)
1506 if (a != window) {
1507 c = sentence_position - window + a;
1508 if (c < 0)
1509 continue;
1510 if (c >= sentence_length)
1511 continue;
1512 last_word = sen[c];
1513 if (last_word == -1)
1514 continue;
1515 window_offset = a * layer1_size;
1516 if (a > window)
1517 window_offset -= layer1_size;
1518 for (c = 0; c < layer1_size; c++)
1519 neu1[c + window_offset] += syn0[c
1520 + last_word * layer1_size];
1521 cw++;
1522 }
1523 if (cw) {
1524 for (a = 0; a < window_hidden_size; a++) {
1525 c = a * window_layer_size;
1526 for (b = 0; b < window_layer_size; b++) {
1527 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1528 }
1529 }
1530 if (hs)
1531 for (d = 0; d < vocab[word].codelen; d++) {
1532 f = 0;
1533 l2 = vocab[word].point[d] * window_hidden_size;
1534 // Propagate hidden -> output
1535 for (c = 0; c < window_hidden_size; c++)
1536 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1537 if (f <= -MAX_EXP)
1538 continue;
1539 else if (f >= MAX_EXP)
1540 continue;
1541 else
1542 f = expTable[(int) ((f + MAX_EXP)
1543 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1544 // 'g' is the gradient multiplied by the learning rate
1545 g = (1 - vocab[word].code[d] - f) * alpha;
1546 // Propagate errors output -> hidden
1547 for (c = 0; c < window_hidden_size; c++)
1548 neu2e[c] += dHardTanh(neu2[c], g) * g
1549 * syn_hidden_word[c + l2];
1550 // Learn weights hidden -> output
1551 for (c = 0; c < window_hidden_size; c++)
1552 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1553 * neu2[c];
1554 }
1555 // NEGATIVE SAMPLING
1556 if (negative > 0)
1557 for (d = 0; d < negative + 1; d++) {
1558 if (d == 0) {
1559 target = word;
1560 label = 1;
1561 } else {
1562 next_random = next_random
1563 * (unsigned long long) 25214903917 + 11;
1564 if (word_to_group != NULL
1565 && word_to_group[word] != -1) {
1566 target = word;
1567 while (target == word) {
1568 target = group_to_table[word_to_group[word]
1569 * table_size
1570 + (next_random >> 16) % table_size];
1571 next_random = next_random
1572 * (unsigned long long) 25214903917
1573 + 11;
1574 }
1575 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1576 } else {
1577 target =
1578 table[(next_random >> 16) % table_size];
1579 }
1580 if (target == 0)
1581 target = next_random % (vocab_size - 1) + 1;
1582 if (target == word)
1583 continue;
1584 label = 0;
1585 }
1586 l2 = target * window_hidden_size;
1587 f = 0;
1588 for (c = 0; c < window_hidden_size; c++)
1589 f += hardTanh(neu2[c])
1590 * syn_hidden_word_neg[c + l2];
1591 if (f > MAX_EXP)
1592 g = (label - 1) * alpha / negative;
1593 else if (f < -MAX_EXP)
1594 g = (label - 0) * alpha / negative;
1595 else
1596 g = (label
1597 - expTable[(int) ((f + MAX_EXP)
1598 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1599 * alpha / negative;
1600 for (c = 0; c < window_hidden_size; c++)
1601 neu2e[c] += dHardTanh(neu2[c], g) * g
1602 * syn_hidden_word_neg[c + l2];
1603 for (c = 0; c < window_hidden_size; c++)
1604 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1605 * g * neu2[c];
1606 }
1607 for (a = 0; a < window_hidden_size; a++)
1608 for (b = 0; b < window_layer_size; b++)
1609 neu1e[b] += neu2e[a]
1610 * syn_window_hidden[a * window_layer_size + b];
1611 for (a = 0; a < window_hidden_size; a++)
1612 for (b = 0; b < window_layer_size; b++)
1613 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1614 * neu1[b];
1615 // hidden -> in
1616 for (a = 0; a < window * 2 + 1; a++)
1617 if (a != window) {
1618 c = sentence_position - window + a;
1619 if (c < 0)
1620 continue;
1621 if (c >= sentence_length)
1622 continue;
1623 last_word = sen[c];
1624 if (last_word == -1)
1625 continue;
1626 window_offset = a * layer1_size;
1627 if (a > window)
1628 window_offset -= layer1_size;
1629 for (c = 0; c < layer1_size; c++)
1630 syn0[c + last_word * layer1_size] += neu1e[c
1631 + window_offset];
1632 }
1633 }
1634 } else {
			printf("unknown type %i\n", type);
1636 exit(0);
1637 }
1638 sentence_position++;
1639 if (sentence_position >= sentence_length) {
1640 sentence_length = 0;
1641 continue;
1642 }
1643 }
1644 fclose(fi);
1645 free(neu1);
1646 free(neu1e);
1647 pthread_exit(NULL);
1648}
1649
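// For each word starting at vocabulary rank cc, prints the most strongly associated
// collocate at every window position (computed from syn0 and syn1neg_window) plus the
// top-N collocates over all positions; requires -read-vocab and -read-net (type 3).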
void ShowCollocations() {
	long a, b, c, d, e, window_offset, target, max_target = 0, maxmax_target;
	real f, max_f, maxmax_f;
	real *target_sums, bestf[MAX_CC], worstbest;
	long besti[MAX_CC];
	int N = 10, bestp[MAX_CC];
	a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1657
1658 for (d = cc; d < vocab_size; d++) {
1659 for (b = 0; b < vocab_size; b++)
1660 target_sums[b]=0;
		for (b = 0; b < N; b++)
			bestf[b] = -1;
		worstbest = -1;

		maxmax_f = -1;
		maxmax_target = 0;
		for (a = window * 2 + 1; a >= 0; a--) {
			if (a != window) {
1669 max_f = -1;
1670 window_offset = a * layer1_size;
1671 if (a > window)
1672 window_offset -= layer1_size;
1673 for(target = 0; target < vocab_size; target ++) {
1674 if(target == d)
1675 continue;
1676 f = 0;
1677 for (c = 0; c < layer1_size; c++)
1678 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1679 if (f < -MAX_EXP)
1680 continue;
1681 else if (f > MAX_EXP)
1682 continue;
1683 else
1684 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1685 if(f > max_f) {
1686 max_f = f;
1687 max_target = target;
1688 }
					target_sums[target] += (1 - target_sums[target]) * f;
					if (f > worstbest) {
						for (b = 0; b < N; b++) {
							if (f > bestf[b]) {
								for (e = N - 1; e > b; e--) {
									bestf[e] = bestf[e - 1];
									besti[e] = besti[e - 1];
									bestp[e] = bestp[e - 1];
								}
								bestf[b] = f;
								besti[b] = target;
								bestp[b] = window - a;
								break;
							}
						}
						worstbest = bestf[N - 1];
					}
				}
1707 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1708 if(max_f > maxmax_f) {
1709 maxmax_f = max_f;
1710 maxmax_target = max_target;
1711 }
1712 } else {
1713 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1714 }
1715 }
1716 max_f = -1;
1717 for (b = 0; b < vocab_size; b++) {
1718 if(target_sums[b] > max_f) {
1719 max_f = target_sums[b];
1720 max_target = b;
1721 }
1722 }
1723 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
				vocab[max_target].word, max_f,
				vocab[maxmax_target].word, maxmax_f);
		for (b = 0; b < N && bestf[b] > -1; b++)
			printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
		printf("\n");
	}
1730}
1731
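// Top-level driver: builds or loads the vocabulary, initializes the network, runs
// the training threads, and writes word vectors or K-means word classes to output_file.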
void TrainModel() {
1733 long a, b, c, d;
1734 FILE *fo;
1735 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1736 printf("Starting training using file %s\n", train_file);
1737 starting_alpha = alpha;
1738 if (read_vocab_file[0] != 0)
1739 ReadVocab();
1740 else
1741 LearnVocabFromTrainFile();
1742 if (save_vocab_file[0] != 0)
1743 SaveVocab();
1744 if (output_file[0] == 0)
1745 return;
1746 InitNet();
	if (cc > 0)
		ShowCollocations();
	if (negative > 0 || nce > 0)
1750 InitUnigramTable();
1751 if (negative_classes_file[0] != 0)
1752 InitClassUnigramTable();
1753 start = clock();
1754 for (a = 0; a < num_threads; a++)
1755 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1756 for (a = 0; a < num_threads; a++)
1757 pthread_join(pt[a], NULL);
1758 fo = fopen(output_file, "wb");
1759 if (classes == 0) {
1760 // Save the word vectors
1761 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1762 for (a = 0; a < vocab_size; a++) {
1763 fprintf(fo, "%s ", vocab[a].word);
1764 if (binary)
1765 for (b = 0; b < layer1_size; b++)
1766 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1767 else
1768 for (b = 0; b < layer1_size; b++)
1769 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1770 fprintf(fo, "\n");
1771 }
1772 } else {
1773 // Run K-means on the word vectors
1774 int clcn = classes, iter = 10, closeid;
1775 int *centcn = (int *) malloc(classes * sizeof(int));
1776 int *cl = (int *) calloc(vocab_size, sizeof(int));
1777 real closev, x;
1778 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1779 for (a = 0; a < vocab_size; a++)
1780 cl[a] = a % clcn;
1781 for (a = 0; a < iter; a++) {
1782 for (b = 0; b < clcn * layer1_size; b++)
1783 cent[b] = 0;
1784 for (b = 0; b < clcn; b++)
1785 centcn[b] = 1;
1786 for (c = 0; c < vocab_size; c++) {
1787 for (d = 0; d < layer1_size; d++)
1788 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1789 centcn[cl[c]]++;
1790 }
1791 for (b = 0; b < clcn; b++) {
1792 closev = 0;
1793 for (c = 0; c < layer1_size; c++) {
1794 cent[layer1_size * b + c] /= centcn[b];
1795 closev += cent[layer1_size * b + c]
1796 * cent[layer1_size * b + c];
1797 }
1798 closev = sqrt(closev);
1799 for (c = 0; c < layer1_size; c++)
1800 cent[layer1_size * b + c] /= closev;
1801 }
1802 for (c = 0; c < vocab_size; c++) {
1803 closev = -10;
1804 closeid = 0;
1805 for (d = 0; d < clcn; d++) {
1806 x = 0;
1807 for (b = 0; b < layer1_size; b++)
1808 x += cent[layer1_size * d + b]
1809 * syn0[c * layer1_size + b];
1810 if (x > closev) {
1811 closev = x;
1812 closeid = d;
1813 }
1814 }
1815 cl[c] = closeid;
1816 }
1817 }
1818 // Save the K-means classes
1819 for (a = 0; a < vocab_size; a++)
1820 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1821 free(centcn);
1822 free(cent);
1823 free(cl);
1824 }
1825 fclose(fo);
1826 if (save_net_file[0] != 0)
1827 SaveNet();
1828}
1829
1830int ArgPos(char *str, int argc, char **argv) {
1831 int a;
1832 for (a = 1; a < argc; a++)
1833 if (!strcmp(str, argv[a])) {
1834 if (a == argc - 1) {
1835 printf("Argument missing for %s\n", str);
1836 exit(1);
1837 }
1838 return a;
1839 }
1840 return -1;
1841}
1842
1843int main(int argc, char **argv) {
1844 int i;
1845 if (argc == 1) {
1846 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1847 printf("Options:\n");
1848 printf("Parameters for training:\n");
1849 printf("\t-train <file>\n");
1850 printf("\t\tUse text data from <file> to train the model\n");
1851 printf("\t-output <file>\n");
1852 printf(
1853 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1854 printf("\t-size <int>\n");
1855 printf("\t\tSet size of word vectors; default is 100\n");
1856 printf("\t-window <int>\n");
1857 printf("\t\tSet max skip length between words; default is 5\n");
1858 printf("\t-sample <float>\n");
1859 printf(
1860 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1861 printf(
1862 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1863 printf("\t-hs <int>\n");
1864 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1865 printf("\t-negative <int>\n");
1866 printf(
1867 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1868 printf("\t-negative-classes <file>\n");
1869 printf("\t\tNegative classes to sample from\n");
1870 printf("\t-nce <int>\n");
1871 printf(
1872 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1873 printf("\t-threads <int>\n");
1874 printf("\t\tUse <int> threads (default 12)\n");
1875 printf("\t-iter <int>\n");
1876 printf("\t\tRun more training iterations (default 5)\n");
1877 printf("\t-min-count <int>\n");
1878 printf(
1879 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1880 printf("\t-alpha <float>\n");
1881 printf(
1882 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1883 printf("\t-classes <int>\n");
1884 printf(
1885 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1886 printf("\t-debug <int>\n");
1887 printf(
1888 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1889 printf("\t-binary <int>\n");
1890 printf(
				"\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1892 printf("\t-save-vocab <file>\n");
1893 printf("\t\tThe vocabulary will be saved to <file>\n");
1894 printf("\t-read-vocab <file>\n");
1895 printf(
1896 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1897 printf("\t-read-net <file>\n");
1898 printf(
1899 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1900 printf("\t-save-net <file>\n");
1901 printf("\t\tThe net parameters will be saved to <file>\n");
		printf("\t-show-cc <int>\n");
		printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
		printf("\t-type <int>\n");
		printf(
				"\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
1907 printf("\t-cap <int>\n");
1908 printf(
1909 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1910 printf("\nExamples:\n");
1911 printf(
1912 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1913 return 0;
1914 }
1915 output_file[0] = 0;
1916 save_vocab_file[0] = 0;
1917 read_vocab_file[0] = 0;
1918 save_net_file[0] = 0;
1919 read_net_file[0] = 0;
1920 negative_classes_file[0] = 0;
1921 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1922 layer1_size = atoi(argv[i + 1]);
1923 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1924 strcpy(train_file, argv[i + 1]);
1925 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1926 strcpy(save_vocab_file, argv[i + 1]);
1927 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1928 strcpy(read_vocab_file, argv[i + 1]);
1929 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1930 strcpy(save_net_file, argv[i + 1]);
1931 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1932 strcpy(read_net_file, argv[i + 1]);
1933 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1934 debug_mode = atoi(argv[i + 1]);
1935 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1936 binary = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
		cc = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1940 type = atoi(argv[i + 1]);
1941 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1942 strcpy(output_file, argv[i + 1]);
1943 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1944 window = atoi(argv[i + 1]);
1945 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1946 sample = atof(argv[i + 1]);
1947 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1948 hs = atoi(argv[i + 1]);
1949 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1950 negative = atoi(argv[i + 1]);
1951 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1952 strcpy(negative_classes_file, argv[i + 1]);
1953 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1954 nce = atoi(argv[i + 1]);
1955 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1956 num_threads = atoi(argv[i + 1]);
1957 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1958 iter = atoi(argv[i + 1]);
1959 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1960 min_count = atoi(argv[i + 1]);
1961 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1962 classes = atoi(argv[i + 1]);
1963 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1964 cap = atoi(argv[i + 1]);
1965 if (type == 0 || type == 2 || type == 4)
1966 alpha = 0.05;
1967 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1968 alpha = atof(argv[i + 1]);
1969 vocab = (struct vocab_word *) calloc(vocab_max_size,
1970 sizeof(struct vocab_word));
1971 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1972 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1973 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1974 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1975 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1976 }
	SaveArgs(argc, argv);
	TrainModel();
1979 return 0;
1980}
1981