1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CC 100
26#define MAX_CODE_LENGTH 40
27
28const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
29
30typedef float real; // Precision of float numbers
31
32struct vocab_word {
33 long long cn;
34 int *point;
35 char *word, *code, codelen;
36};
37
38char train_file[MAX_STRING], output_file[MAX_STRING];
39char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
40char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
41struct vocab_word *vocab;
42int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
43 num_threads = 12, min_reduce = 1;
44int *vocab_hash;
45long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
46long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
47 classes = 0;
48real alpha = 0.025, starting_alpha, sample = 1e-3;
49real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
50clock_t start;
51
52real *syn1_window, *syn1neg_window, *syn1nce_window;
53int w_offset, window_layer_size;
54
55int window_hidden_size = 500;
56real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
57 *syn_hidden_word_nce;
58
59int hs = 0, negative = 5;
60const int table_size = 1e8;
61int *table;
62
63long cc = 0;
64
65//contrastive negative sampling
66char negative_classes_file[MAX_STRING];
67int *word_to_group;
68int *group_to_table; //group_size*table_size
69int class_number;
70
71//nce
72real* noise_distribution;
73int nce = 0;
74
75//param caps
76real CAP_VALUE = 50;
77int cap = 0;
78
79void capParam(real* array, int index) {
80 if (array[index] > CAP_VALUE)
81 array[index] = CAP_VALUE;
82 else if (array[index] < -CAP_VALUE)
83 array[index] = -CAP_VALUE;
84}
85
86real hardTanh(real x) {
87 if (x >= 1) {
88 return 1;
89 } else if (x <= -1) {
90 return -1;
91 } else {
92 return x;
93 }
94}
95
96real dHardTanh(real x, real g) {
97 if (x > 1 && g > 0) {
98 return 0;
99 }
100 if (x < -1 && g < 0) {
101 return 0;
102 }
103 return 1;
104}
105
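// Negative-sampling lookup table: each word w gets a run of slots in table[]
// proportional to count(w)^0.75, so a uniformly random index draws w with
// probability count(w)^0.75 / sum_v count(v)^0.75. The same smoothed unigram
// distribution is also stored in noise_distribution[] for the NCE objective.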
106void InitUnigramTable() {
107 int a, i;
108 long long train_words_pow = 0;
109 real d1, power = 0.75;
110 table = (int *) malloc(table_size * sizeof(int));
111 for (a = 0; a < vocab_size; a++)
112 train_words_pow += pow(vocab[a].cn, power);
113 i = 0;
114 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
115 for (a = 0; a < table_size; a++) {
116 table[a] = i;
117 if (a / (real) table_size > d1) {
118 i++;
119 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
120 }
121 if (i >= vocab_size)
122 i = vocab_size - 1;
123 }
124
125 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
126 for (a = 0; a < vocab_size; a++)
127 noise_distribution[a] = pow(vocab[a].cn, power)
128 / (real) train_words_pow;
129}
130
131// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
132void ReadWord(char *word, FILE *fin) {
133 int a = 0, ch;
134 while (!feof(fin)) {
135 ch = fgetc(fin);
136 if (ch == 13)
137 continue;
138 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
139 if (a > 0) {
140 if (ch == '\n')
141 ungetc(ch, fin);
142 break;
143 }
144 if (ch == '\n') {
145 strcpy(word, (char *) "</s>");
146 return;
147 } else
148 continue;
149 }
150 word[a] = ch;
151 a++;
152 if (a >= MAX_STRING - 1)
153 a--; // Truncate too long words
154 }
155 word[a] = 0;
156}
157
158// Returns hash value of a word
159int GetWordHash(char *word) {
160 unsigned long long a, hash = 0;
161 for (a = 0; a < strlen(word); a++)
162 hash = hash * 257 + word[a];
163 hash = hash % vocab_hash_size;
164 return hash;
165}
166
167// Returns position of a word in the vocabulary; if the word is not found, returns -1
168int SearchVocab(char *word) {
169 unsigned int hash = GetWordHash(word);
170 while (1) {
171 if (vocab_hash[hash] == -1)
172 return -1;
173 if (!strcmp(word, vocab[vocab_hash[hash]].word))
174 return vocab_hash[hash];
175 hash = (hash + 1) % vocab_hash_size;
176 }
177 return -1;
178}
179
180// Reads a word and returns its index in the vocabulary
181int ReadWordIndex(FILE *fin) {
182 char word[MAX_STRING];
183 ReadWord(word, fin);
184 if (feof(fin))
185 return -1;
186 return SearchVocab(word);
187}
188
189// Adds a word to the vocabulary
190int AddWordToVocab(char *word) {
191 unsigned int hash, length = strlen(word) + 1;
192 if (length > MAX_STRING)
193 length = MAX_STRING;
194 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
195 strcpy(vocab[vocab_size].word, word);
196 vocab[vocab_size].cn = 0;
197 vocab_size++;
198 // Reallocate memory if needed
199 if (vocab_size + 2 >= vocab_max_size) {
200 vocab_max_size += 1000;
201 vocab = (struct vocab_word *) realloc(vocab,
202 vocab_max_size * sizeof(struct vocab_word));
203 }
204 hash = GetWordHash(word);
205 while (vocab_hash[hash] != -1)
206 hash = (hash + 1) % vocab_hash_size;
207 vocab_hash[hash] = vocab_size - 1;
208 return vocab_size - 1;
209}
210
211// Used later for sorting by word counts
212int VocabCompare(const void *a, const void *b) {
213 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
214}
215
216// Sorts the vocabulary by frequency using word counts
217void SortVocab() {
218 int a, size;
219 unsigned int hash;
220 // Sort the vocabulary and keep </s> at the first position
221 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
222 for (a = 0; a < vocab_hash_size; a++)
223 vocab_hash[a] = -1;
224 size = vocab_size;
225 train_words = 0;
226 for (a = 0; a < size; a++) {
227 // Words occurring less than min_count times will be discarded from the vocab
228 if ((vocab[a].cn < min_count) && (a != 0)) {
229 vocab_size--;
230 free(vocab[a].word);
231 } else {
232 // Hash will be re-computed, as after the sorting it is not actual
233 hash = GetWordHash(vocab[a].word);
234 while (vocab_hash[hash] != -1)
235 hash = (hash + 1) % vocab_hash_size;
236 vocab_hash[hash] = a;
237 train_words += vocab[a].cn;
238 }
239 }
240 vocab = (struct vocab_word *) realloc(vocab,
241 (vocab_size + 1) * sizeof(struct vocab_word));
242 // Allocate memory for the binary tree construction
243 for (a = 0; a < vocab_size; a++) {
244 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
245 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
246 }
247}
248
249// Reduces the vocabulary by removing infrequent tokens
250void ReduceVocab() {
251 int a, b = 0;
252 unsigned int hash;
253 for (a = 0; a < vocab_size; a++)
254 if (vocab[a].cn > min_reduce) {
255 vocab[b].cn = vocab[a].cn;
256 vocab[b].word = vocab[a].word;
257 b++;
258 } else
259 free(vocab[a].word);
260 vocab_size = b;
261 for (a = 0; a < vocab_hash_size; a++)
262 vocab_hash[a] = -1;
263 for (a = 0; a < vocab_size; a++) {
264 // Hash will be re-computed, as it is not actual
265 hash = GetWordHash(vocab[a].word);
266 while (vocab_hash[hash] != -1)
267 hash = (hash + 1) % vocab_hash_size;
268 vocab_hash[hash] = a;
269 }
270 fflush(stdout);
271 min_reduce++;
272}
273
274// Create binary Huffman tree using the word counts
275 // Frequent words will have short unique binary codes
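// The arrays filled here drive hierarchical softmax: vocab[w].point[] lists the
// inner tree nodes on the path from the root to w, vocab[w].code[] the 0/1
// branch decisions, and vocab[w].codelen the path length, so each update only
// touches O(log V) output vectors instead of all V.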
276void CreateBinaryTree() {
277 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
278 char code[MAX_CODE_LENGTH];
279 long long *count = (long long *) calloc(vocab_size * 2 + 1,
280 sizeof(long long));
281 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
282 sizeof(long long));
283 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
284 sizeof(long long));
285 for (a = 0; a < vocab_size; a++)
286 count[a] = vocab[a].cn;
287 for (a = vocab_size; a < vocab_size * 2; a++)
288 count[a] = 1e15;
289 pos1 = vocab_size - 1;
290 pos2 = vocab_size;
291 // Following algorithm constructs the Huffman tree by adding one node at a time
292 for (a = 0; a < vocab_size - 1; a++) {
293 // First, find two smallest nodes 'min1, min2'
294 if (pos1 >= 0) {
295 if (count[pos1] < count[pos2]) {
296 min1i = pos1;
297 pos1--;
298 } else {
299 min1i = pos2;
300 pos2++;
301 }
302 } else {
303 min1i = pos2;
304 pos2++;
305 }
306 if (pos1 >= 0) {
307 if (count[pos1] < count[pos2]) {
308 min2i = pos1;
309 pos1--;
310 } else {
311 min2i = pos2;
312 pos2++;
313 }
314 } else {
315 min2i = pos2;
316 pos2++;
317 }
318 count[vocab_size + a] = count[min1i] + count[min2i];
319 parent_node[min1i] = vocab_size + a;
320 parent_node[min2i] = vocab_size + a;
321 binary[min2i] = 1;
322 }
323 // Now assign binary code to each vocabulary word
324 for (a = 0; a < vocab_size; a++) {
325 b = a;
326 i = 0;
327 while (1) {
328 code[i] = binary[b];
329 point[i] = b;
330 i++;
331 b = parent_node[b];
332 if (b == vocab_size * 2 - 2)
333 break;
334 }
335 vocab[a].codelen = i;
336 vocab[a].point[0] = vocab_size - 2;
337 for (b = 0; b < i; b++) {
338 vocab[a].code[i - b - 1] = code[b];
339 vocab[a].point[i - b] = point[b] - vocab_size;
340 }
341 }
342 free(count);
343 free(binary);
344 free(parent_node);
345}
346
347void LearnVocabFromTrainFile() {
348 char word[MAX_STRING];
349 FILE *fin;
350 long long a, i;
351 for (a = 0; a < vocab_hash_size; a++)
352 vocab_hash[a] = -1;
353 fin = fopen(train_file, "rb");
354 if (fin == NULL) {
355 printf("ERROR: training data file not found!\n");
356 exit(1);
357 }
358 vocab_size = 0;
359 AddWordToVocab((char *) "</s>");
360 while (1) {
361 ReadWord(word, fin);
362 if (feof(fin))
363 break;
364 train_words++;
365 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
366 printf("%lldK%c", train_words / 1000, 13);
367 fflush(stdout);
368 }
369 i = SearchVocab(word);
370 if (i == -1) {
371 a = AddWordToVocab(word);
372 vocab[a].cn = 1;
373 } else
374 vocab[i].cn++;
375 if (vocab_size > vocab_hash_size * 0.7)
376 ReduceVocab();
377 }
378 SortVocab();
379 if (debug_mode > 0) {
380 printf("Vocab size: %lld\n", vocab_size);
381 printf("Words in train file: %lld\n", train_words);
382 }
383 file_size = ftell(fin);
384 fclose(fin);
385}
386
387void SaveVocab() {
388 long long i;
389 FILE *fo = fopen(save_vocab_file, "wb");
390 for (i = 0; i < vocab_size; i++)
391 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
392 fclose(fo);
393}
394
395void ReadVocab() {
396 long long a, i = 0;
397 char c;
398 char word[MAX_STRING];
399 FILE *fin = fopen(read_vocab_file, "rb");
400 if (fin == NULL) {
401 printf("Vocabulary file not found\n");
402 exit(1);
403 }
404 for (a = 0; a < vocab_hash_size; a++)
405 vocab_hash[a] = -1;
406 vocab_size = 0;
407 while (1) {
408 ReadWord(word, fin);
409 if (feof(fin))
410 break;
411 a = AddWordToVocab(word);
412 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
413 i++;
414 }
415 SortVocab();
416 if (debug_mode > 0) {
417 printf("Vocab size: %lld\n", vocab_size);
418 printf("Words in train file: %lld\n", train_words);
419 }
420 fin = fopen(train_file, "rb");
421 if (fin == NULL) {
422 printf("ERROR: training data file not found!\n");
423 exit(1);
424 }
425 fseek(fin, 0, SEEK_END);
426 file_size = ftell(fin);
427 fclose(fin);
428}
429
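// Reads class/word token pairs from negative_classes_file (consecutive entries
// with the same class form one group), maps each known word to a class id, and
// builds one count^0.75 sampling table per class so that negative samples can
// be drawn from the same class as the current word.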
430void InitClassUnigramTable() {
431 long long a, c;
432 printf("loading class unigrams \n");
433 FILE *fin = fopen(negative_classes_file, "rb");
434 if (fin == NULL) {
435 printf("ERROR: class file not found!\n");
436 exit(1);
437 }
438 word_to_group = (int *) malloc(vocab_size * sizeof(int));
439 for (a = 0; a < vocab_size; a++)
440 word_to_group[a] = -1;
441 char class[MAX_STRING];
442 char prev_class[MAX_STRING];
443 prev_class[0] = 0;
444 char word[MAX_STRING];
445 class_number = -1;
446 while (1) {
447 if (feof(fin))
448 break;
449 ReadWord(class, fin);
450 ReadWord(word, fin);
451 int word_index = SearchVocab(word);
452 if (word_index != -1) {
453 if (strcmp(class, prev_class) != 0) {
454 class_number++;
455 strcpy(prev_class, class);
456 }
457 word_to_group[word_index] = class_number;
458 }
459 ReadWord(word, fin);
460 }
461 class_number++;
462 fclose(fin);
463
464 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
465 long long train_words_pow = 0;
466 real d1, power = 0.75;
467
468 for (c = 0; c < class_number; c++) {
469 long long offset = c * table_size;
470 train_words_pow = 0;
471 for (a = 0; a < vocab_size; a++)
472 if (word_to_group[a] == c)
473 train_words_pow += pow(vocab[a].cn, power);
474 int i = 0;
475 while (i < vocab_size && word_to_group[i] != c)
476 i++;
477 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
478 for (a = 0; a < table_size; a++) {
479 //printf("index %lld , word %d\n", a, i);
480 group_to_table[offset + a] = i;
481 if (a / (real) table_size > d1) {
482 i++;
483 while (i < vocab_size && word_to_group[i] != c)
484 i++;
485 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
486 }
487 if (i >= vocab_size)
488 while (i > 0 && (i >= vocab_size || word_to_group[i] != c))
489 i--;
490 }
491 }
492}
493
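// Net file layout written here (and expected by -read-net in InitNet): the raw
// syn0 matrix (vocab_size * layer1_size reals) followed immediately by
// syn1neg_window (vocab_size * window_layer_size reals), with no header.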
494void SaveNet() {
495 if(type != 3 || negative <= 0) {
496 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
497 return;
498 }
499
500 FILE *fnet = fopen(save_net_file, "wb");
501 if (fnet == NULL) {
502 printf("Net parameter file not found\n");
503 exit(1);
504 }
505 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
506 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
507 fclose(fnet);
508}
509
510void InitNet() {
511 long long a, b;
512 unsigned long long next_random = 1;
513 long long read;
514
515 window_layer_size = layer1_size * window * 2;
516 a = posix_memalign((void **) &syn0, 128,
517 (long long) vocab_size * layer1_size * sizeof(real));
518 if (syn0 == NULL) {
519 printf("Memory allocation failed\n");
520 exit(1);
521 }
522
523 if (hs) {
524 a = posix_memalign((void **) &syn1, 128,
525 (long long) vocab_size * layer1_size * sizeof(real));
526 if (syn1 == NULL) {
527 printf("Memory allocation failed\n");
528 exit(1);
529 }
530 a = posix_memalign((void **) &syn1_window, 128,
531 (long long) vocab_size * window_layer_size * sizeof(real));
532 if (syn1_window == NULL) {
533 printf("Memory allocation failed\n");
534 exit(1);
535 }
536 a = posix_memalign((void **) &syn_hidden_word, 128,
537 (long long) vocab_size * window_hidden_size * sizeof(real));
538 if (syn_hidden_word == NULL) {
539 printf("Memory allocation failed\n");
540 exit(1);
541 }
542
543 for (a = 0; a < vocab_size; a++)
544 for (b = 0; b < layer1_size; b++)
545 syn1[a * layer1_size + b] = 0;
546 for (a = 0; a < vocab_size; a++)
547 for (b = 0; b < window_layer_size; b++)
548 syn1_window[a * window_layer_size + b] = 0;
549 for (a = 0; a < vocab_size; a++)
550 for (b = 0; b < window_hidden_size; b++)
551 syn_hidden_word[a * window_hidden_size + b] = 0;
552 }
553 if (negative > 0) {
554 if(type == 0) {
555 a = posix_memalign((void **) &syn1neg, 128,
556 (long long) vocab_size * layer1_size * sizeof(real));
557 if (syn1neg == NULL) {
558 printf("Memory allocation failed\n");
559 exit(1);
560 }
561 for (a = 0; a < vocab_size; a++)
562 for (b = 0; b < layer1_size; b++)
563 syn1neg[a * layer1_size + b] = 0;
564 } else if (type == 3) {
565 a = posix_memalign((void **) &syn1neg_window, 128,
566 (long long) vocab_size * window_layer_size * sizeof(real));
567 if (syn1neg_window == NULL) {
568 printf("Memory allocation failed\n");
569 exit(1);
570 }
571 for (a = 0; a < vocab_size; a++)
572 for (b = 0; b < window_layer_size; b++)
573 syn1neg_window[a * window_layer_size + b] = 0;
574 } else if (type == 4) {
575 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
576 (long long) vocab_size * window_hidden_size * sizeof(real));
577 if (syn_hidden_word_neg == NULL) {
578 printf("Memory allocation failed\n");
579 exit(1);
580 }
581 for (a = 0; a < vocab_size; a++)
582 for (b = 0; b < window_hidden_size; b++)
583 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
584 }
585 }
586 if (nce > 0) {
587 a = posix_memalign((void **) &syn1nce, 128,
588 (long long) vocab_size * layer1_size * sizeof(real));
589 if (syn1nce == NULL) {
590 printf("Memory allocation failed\n");
591 exit(1);
592 }
593 a = posix_memalign((void **) &syn1nce_window, 128,
594 (long long) vocab_size * window_layer_size * sizeof(real));
595 if (syn1nce_window == NULL) {
596 printf("Memory allocation failed\n");
597 exit(1);
598 }
599 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
600 (long long) vocab_size * window_hidden_size * sizeof(real));
601 if (syn_hidden_word_nce == NULL) {
602 printf("Memory allocation failed\n");
603 exit(1);
604 }
605
606 for (a = 0; a < vocab_size; a++)
607 for (b = 0; b < layer1_size; b++)
608 syn1nce[a * layer1_size + b] = 0;
609 for (a = 0; a < vocab_size; a++)
610 for (b = 0; b < window_layer_size; b++)
611 syn1nce_window[a * window_layer_size + b] = 0;
612 for (a = 0; a < vocab_size; a++)
613 for (b = 0; b < window_hidden_size; b++)
614 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
615 }
616
617 if(type == 4) {
618 a = posix_memalign((void **) &syn_window_hidden, 128,
619 window_hidden_size * window_layer_size * sizeof(real));
620 if (syn_window_hidden == NULL) {
621 printf("Memory allocation failed\n");
622 exit(1);
623 }
624 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
625 next_random = next_random * (unsigned long long) 25214903917 + 11;
626 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
627 - 0.5) / (window_hidden_size * window_layer_size);
628 }
629 }
630
631 if (read_net_file[0] == 0) {
632 for (a = 0; a < vocab_size; a++)
633 for (b = 0; b < layer1_size; b++) {
634 next_random = next_random * (unsigned long long) 25214903917
635 + 11;
636 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
637 / (real) 65536) - 0.5) / layer1_size;
638 }
639 } else if(type == 3 && negative > 0) {
640 FILE *fnet = fopen(read_net_file, "rb");
641 if (fnet == NULL) {
642 printf("Net parameter file not found\n");
643 exit(1);
644 }
645 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
646 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
647 if(read != vocab_size * layer1_size) {
648 fprintf(stderr, "read-net failed %lld\n", read);
649 exit(-1);
650 }
651 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
652 if(read != (long long) vocab_size * window_layer_size) {
653 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read,
654 (long long) vocab_size * window_layer_size);
655 exit(-1);
656 }
657 fgetc(fnet);
658 if(!feof(fnet)) {
659 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
660 exit(-1);
661 }
662 fclose(fnet);
663 } else {
664 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
665 exit(-1);
666 }
667
668 CreateBinaryTree();
669}
670
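// One worker thread: it seeks to its own slice of the training file
// (file_size / num_threads * id), makes `iter` passes over that slice, and
// linearly decays the shared learning rate `alpha` with overall progress.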
671void *TrainModelThread(void *id) {
672 long long a, b, d, cw, word, last_word, sentence_length = 0,
673 sentence_position = 0;
674 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
675 long long l1, l2, c, target, label, local_iter = iter;
676 unsigned long long next_random = (long long) id;
677 real f, g;
678 clock_t now;
679 int input_len_1 = layer1_size;
680 int window_offset = -1;
681 if (type == 2 || type == 4) {
682 input_len_1 = window_layer_size;
683 }
684 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
685 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
686
687 int input_len_2 = 0;
688 if (type == 4) {
689 input_len_2 = window_hidden_size;
690 }
691 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
692 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
693
694 FILE *fi = fopen(train_file, "rb");
695 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
696 while (1) {
697 if (word_count - last_word_count > 10000) {
698 word_count_actual += word_count - last_word_count;
699 last_word_count = word_count;
700 if ((debug_mode > 1)) {
701 now = clock();
702 printf(
703 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
704 13, alpha,
705 word_count_actual / (real) (iter * train_words + 1)
706 * 100,
707 word_count_actual
708 / ((real) (now - start + 1)
709 / (real) CLOCKS_PER_SEC * 1000));
710 fflush(stdout);
711 }
712 alpha = starting_alpha
713 * (1 - word_count_actual / (real) (iter * train_words + 1));
714 if (alpha < starting_alpha * 0.0001)
715 alpha = starting_alpha * 0.0001;
716 }
717 if (sentence_length == 0) {
718 while (1) {
719 word = ReadWordIndex(fi);
720 if (feof(fi))
721 break;
722 if (word == -1)
723 continue;
724 word_count++;
725 if (word == 0)
726 break;
727 // The subsampling randomly discards frequent words while keeping the ranking same
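// With f = vocab[word].cn / train_words, the keep probability computed below
// is (sqrt(f / sample) + 1) * sample / f: words with frequency <= sample are
// always kept, while very frequent words are mostly skipped.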
728 if (sample > 0) {
729 real ran = (sqrt(vocab[word].cn / (sample * train_words))
730 + 1) * (sample * train_words) / vocab[word].cn;
731 next_random = next_random * (unsigned long long) 25214903917
732 + 11;
733 if (ran < (next_random & 0xFFFF) / (real) 65536) {
734 if(type == 3) // in structured skipgrams
735 word = -2; // keep the window position correct
736 else
737 continue;
738 }
739 }
740 sen[sentence_length] = word;
741 sentence_length++;
742 if (sentence_length >= MAX_SENTENCE_LENGTH)
743 break;
744 }
745 sentence_position = 0;
746 }
747 if (feof(fi) || (word_count > train_words / num_threads)) {
748 word_count_actual += word_count - last_word_count;
749 local_iter--;
750 if (local_iter == 0)
751 break;
752 word_count = 0;
753 last_word_count = 0;
754 sentence_length = 0;
755 fseek(fi, file_size / (long long) num_threads * (long long) id,
756 SEEK_SET);
757 continue;
758 }
759 word = sen[sentence_position];
760 if (word == -2)
761 word = sen[++sentence_position];
762 if (word == -1)
763 continue;
764 for (c = 0; c < input_len_1; c++)
765 neu1[c] = 0;
766 for (c = 0; c < input_len_1; c++)
767 neu1e[c] = 0;
768 for (c = 0; c < input_len_2; c++)
769 neu2[c] = 0;
770 for (c = 0; c < input_len_2; c++)
771 neu2e[c] = 0;
772 next_random = next_random * (unsigned long long) 25214903917 + 11;
773 b = next_random % window;
774 if (type == 0) { //train the cbow architecture
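// CBOW: average the embeddings of the surrounding words into neu1, score the
// centre word against that average, and collect the error in neu1e, which is
// pushed back onto every context embedding at the end of the block.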
775 // in -> hidden
776 cw = 0;
777 for (a = b; a < window * 2 + 1 - b; a++)
778 if (a != window) {
779 c = sentence_position - window + a;
780 if (c < 0)
781 continue;
782 if (c >= sentence_length)
783 continue;
784 last_word = sen[c];
785 if (last_word == -1)
786 continue;
787 for (c = 0; c < layer1_size; c++)
788 neu1[c] += syn0[c + last_word * layer1_size];
789 cw++;
790 }
791 if (cw) {
792 for (c = 0; c < layer1_size; c++)
793 neu1[c] /= cw;
794 if (hs)
795 for (d = 0; d < vocab[word].codelen; d++) {
796 f = 0;
797 l2 = vocab[word].point[d] * layer1_size;
798 // Propagate hidden -> output
799 for (c = 0; c < layer1_size; c++)
800 f += neu1[c] * syn1[c + l2];
801 if (f <= -MAX_EXP)
802 continue;
803 else if (f >= MAX_EXP)
804 continue;
805 else
806 f = expTable[(int) ((f + MAX_EXP)
807 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
808 // 'g' is the gradient multiplied by the learning rate
809 g = (1 - vocab[word].code[d] - f) * alpha;
810 // Propagate errors output -> hidden
811 for (c = 0; c < layer1_size; c++)
812 neu1e[c] += g * syn1[c + l2];
813 // Learn weights hidden -> output
814 for (c = 0; c < layer1_size; c++)
815 syn1[c + l2] += g * neu1[c];
816 if (cap == 1)
817 for (c = 0; c < layer1_size; c++)
818 capParam(syn1, c + l2);
819 }
820 // NEGATIVE SAMPLING
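// One positive example (the centre word, label 1) plus `negative` words drawn
// from the unigram^0.75 table (label 0); the update is
// g = (label - sigmoid(neu1 . syn1neg[target])) * alpha, with the sigmoid read
// from the precomputed expTable. The other architectures repeat this pattern.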
821 if (negative > 0)
822 for (d = 0; d < negative + 1; d++) {
823 if (d == 0) {
824 target = word;
825 label = 1;
826 } else {
827 next_random = next_random
828 * (unsigned long long) 25214903917 + 11;
829 if (word_to_group != NULL
830 && word_to_group[word] != -1) {
831 target = word;
832 while (target == word) {
833 target = group_to_table[word_to_group[word]
834 * table_size
835 + (next_random >> 16) % table_size];
836 next_random = next_random
837 * (unsigned long long) 25214903917
838 + 11;
839 }
840 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
841 } else {
842 target =
843 table[(next_random >> 16) % table_size];
844 }
845 if (target == 0)
846 target = next_random % (vocab_size - 1) + 1;
847 if (target == word)
848 continue;
849 label = 0;
850 }
851 l2 = target * layer1_size;
852 f = 0;
853 for (c = 0; c < layer1_size; c++)
854 f += neu1[c] * syn1neg[c + l2];
855 if (f > MAX_EXP)
856 g = (label - 1) * alpha;
857 else if (f < -MAX_EXP)
858 g = (label - 0) * alpha;
859 else
860 g = (label
861 - expTable[(int) ((f + MAX_EXP)
862 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
863 * alpha;
864 for (c = 0; c < layer1_size; c++)
865 neu1e[c] += g * syn1neg[c + l2];
866 for (c = 0; c < layer1_size; c++)
867 syn1neg[c + l2] += g * neu1[c];
868 if (cap == 1)
869 for (c = 0; c < layer1_size; c++)
870 capParam(syn1neg, c + l2);
871 }
872 // Noise Contrastive Estimation
873 if (nce > 0)
874 for (d = 0; d < nce + 1; d++) {
875 if (d == 0) {
876 target = word;
877 label = 1;
878 } else {
879 next_random = next_random
880 * (unsigned long long) 25214903917 + 11;
881 if (word_to_group != NULL
882 && word_to_group[word] != -1) {
883 target = word;
884 while (target == word) {
885 target = group_to_table[word_to_group[word]
886 * table_size
887 + (next_random >> 16) % table_size];
888 next_random = next_random
889 * (unsigned long long) 25214903917
890 + 11;
891 }
892 } else {
893 target =
894 table[(next_random >> 16) % table_size];
895 }
896 if (target == 0)
897 target = next_random % (vocab_size - 1) + 1;
898 if (target == word)
899 continue;
900 label = 0;
901 }
902 l2 = target * layer1_size;
903 f = 0;
904
905 for (c = 0; c < layer1_size; c++)
906 f += neu1[c] * syn1nce[c + l2];
907 if (f > MAX_EXP)
908 g = (label - 1) * alpha;
909 else if (f < -MAX_EXP)
910 g = (label - 0) * alpha;
911 else {
912 f = exp(f);
913 g =
914 (label
915 - f
916 / (noise_distribution[target]
917 * nce + f)) * alpha;
918 }
919 for (c = 0; c < layer1_size; c++)
920 neu1e[c] += g * syn1nce[c + l2];
921 for (c = 0; c < layer1_size; c++)
922 syn1nce[c + l2] += g * neu1[c];
923 if (cap == 1)
924 for (c = 0; c < layer1_size; c++)
925 capParam(syn1nce, c + l2);
926 }
927 // hidden -> in
928 for (a = b; a < window * 2 + 1 - b; a++)
929 if (a != window) {
930 c = sentence_position - window + a;
931 if (c < 0)
932 continue;
933 if (c >= sentence_length)
934 continue;
935 last_word = sen[c];
936 if (last_word == -1)
937 continue;
938 for (c = 0; c < layer1_size; c++)
939 syn0[c + last_word * layer1_size] += neu1e[c];
940 }
941 }
942 } else if (type == 1) { //train skip-gram
943 for (a = b; a < window * 2 + 1 - b; a++)
944 if (a != window) {
945 c = sentence_position - window + a;
946 if (c < 0)
947 continue;
948 if (c >= sentence_length)
949 continue;
950 last_word = sen[c];
951 if (last_word == -1)
952 continue;
953 l1 = last_word * layer1_size;
954 for (c = 0; c < layer1_size; c++)
955 neu1e[c] = 0;
956 // HIERARCHICAL SOFTMAX
957 if (hs)
958 for (d = 0; d < vocab[word].codelen; d++) {
959 f = 0;
960 l2 = vocab[word].point[d] * layer1_size;
961 // Propagate hidden -> output
962 for (c = 0; c < layer1_size; c++)
963 f += syn0[c + l1] * syn1[c + l2];
964 if (f <= -MAX_EXP)
965 continue;
966 else if (f >= MAX_EXP)
967 continue;
968 else
969 f = expTable[(int) ((f + MAX_EXP)
970 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
971 // 'g' is the gradient multiplied by the learning rate
972 g = (1 - vocab[word].code[d] - f) * alpha;
973 // Propagate errors output -> hidden
974 for (c = 0; c < layer1_size; c++)
975 neu1e[c] += g * syn1[c + l2];
976 // Learn weights hidden -> output
977 for (c = 0; c < layer1_size; c++)
978 syn1[c + l2] += g * syn0[c + l1];
979 if (cap == 1)
980 for (c = 0; c < layer1_size; c++)
981 capParam(syn1, c + l2);
982 }
983 // NEGATIVE SAMPLING
984 if (negative > 0)
985 for (d = 0; d < negative + 1; d++) {
986 if (d == 0) {
987 target = word;
988 label = 1;
989 } else {
990 next_random = next_random
991 * (unsigned long long) 25214903917 + 11;
992 if (word_to_group != NULL
993 && word_to_group[word] != -1) {
994 target = word;
995 while (target == word) {
996 target =
997 group_to_table[word_to_group[word]
998 * table_size
999 + (next_random >> 16)
1000 % table_size];
1001 next_random =
1002 next_random
1003 * (unsigned long long) 25214903917
1004 + 11;
1005 }
1006 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1007 } else {
1008 target = table[(next_random >> 16)
1009 % table_size];
1010 }
1011 if (target == 0)
1012 target = next_random % (vocab_size - 1) + 1;
1013 if (target == word)
1014 continue;
1015 label = 0;
1016 }
1017 l2 = target * layer1_size;
1018 f = 0;
1019 for (c = 0; c < layer1_size; c++)
1020 f += syn0[c + l1] * syn1neg[c + l2];
1021 if (f > MAX_EXP)
1022 g = (label - 1) * alpha;
1023 else if (f < -MAX_EXP)
1024 g = (label - 0) * alpha;
1025 else
1026 g =
1027 (label
1028 - expTable[(int) ((f + MAX_EXP)
1029 * (EXP_TABLE_SIZE
1030 / MAX_EXP / 2))])
1031 * alpha;
1032 for (c = 0; c < layer1_size; c++)
1033 neu1e[c] += g * syn1neg[c + l2];
1034 for (c = 0; c < layer1_size; c++)
1035 syn1neg[c + l2] += g * syn0[c + l1];
1036 if (cap == 1)
1037 for (c = 0; c < layer1_size; c++)
1038 capParam(syn1neg, c + l2);
1039 }
1040 //Noise Contrastive Estimation
1041 if (nce > 0)
1042 for (d = 0; d < nce + 1; d++) {
1043 if (d == 0) {
1044 target = word;
1045 label = 1;
1046 } else {
1047 next_random = next_random
1048 * (unsigned long long) 25214903917 + 11;
1049 if (word_to_group != NULL
1050 && word_to_group[word] != -1) {
1051 target = word;
1052 while (target == word) {
1053 target =
1054 group_to_table[word_to_group[word]
1055 * table_size
1056 + (next_random >> 16)
1057 % table_size];
1058 next_random =
1059 next_random
1060 * (unsigned long long) 25214903917
1061 + 11;
1062 }
1063 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1064 } else {
1065 target = table[(next_random >> 16)
1066 % table_size];
1067 }
1068 if (target == 0)
1069 target = next_random % (vocab_size - 1) + 1;
1070 if (target == word)
1071 continue;
1072 label = 0;
1073 }
1074 l2 = target * layer1_size;
1075 f = 0;
1076 for (c = 0; c < layer1_size; c++)
1077 f += syn0[c + l1] * syn1nce[c + l2];
1078 if (f > MAX_EXP)
1079 g = (label - 1) * alpha;
1080 else if (f < -MAX_EXP)
1081 g = (label - 0) * alpha;
1082 else {
1083 f = exp(f);
1084 g = (label
1085 - f
1086 / (noise_distribution[target]
1087 * nce + f)) * alpha;
1088 }
1089 for (c = 0; c < layer1_size; c++)
1090 neu1e[c] += g * syn1nce[c + l2];
1091 for (c = 0; c < layer1_size; c++)
1092 syn1nce[c + l2] += g * syn0[c + l1];
1093 if (cap == 1)
1094 for (c = 0; c < layer1_size; c++)
1095 capParam(syn1nce, c + l2);
1096 }
1097 // Learn weights input -> hidden
1098 for (c = 0; c < layer1_size; c++)
1099 syn0[c + l1] += neu1e[c];
1100 }
1101 } else if (type == 2) { //train the cwindow architecture
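// CWINDOW: unlike CBOW, the context embeddings are concatenated (not averaged)
// into neu1, one layer1_size block per window position, giving an input of
// window_layer_size = 2 * window * layer1_size values.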
1102 // in -> hidden
1103 cw = 0;
1104 for (a = 0; a < window * 2 + 1; a++)
1105 if (a != window) {
1106 c = sentence_position - window + a;
1107 if (c < 0)
1108 continue;
1109 if (c >= sentence_length)
1110 continue;
1111 last_word = sen[c];
1112 if (last_word == -1)
1113 continue;
1114 window_offset = a * layer1_size;
1115 if (a > window)
1116 window_offset -= layer1_size;
1117 for (c = 0; c < layer1_size; c++)
1118 neu1[c + window_offset] += syn0[c
1119 + last_word * layer1_size];
1120 cw++;
1121 }
1122 if (cw) {
1123 if (hs)
1124 for (d = 0; d < vocab[word].codelen; d++) {
1125 f = 0;
1126 l2 = vocab[word].point[d] * window_layer_size;
1127 // Propagate hidden -> output
1128 for (c = 0; c < window_layer_size; c++)
1129 f += neu1[c] * syn1_window[c + l2];
1130 if (f <= -MAX_EXP)
1131 continue;
1132 else if (f >= MAX_EXP)
1133 continue;
1134 else
1135 f = expTable[(int) ((f + MAX_EXP)
1136 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1137 // 'g' is the gradient multiplied by the learning rate
1138 g = (1 - vocab[word].code[d] - f) * alpha;
1139 // Propagate errors output -> hidden
1140 for (c = 0; c < window_layer_size; c++)
1141 neu1e[c] += g * syn1_window[c + l2];
1142 // Learn weights hidden -> output
1143 for (c = 0; c < window_layer_size; c++)
1144 syn1_window[c + l2] += g * neu1[c];
1145 if (cap == 1)
1146 for (c = 0; c < window_layer_size; c++)
1147 capParam(syn1_window, c + l2);
1148 }
1149 // NEGATIVE SAMPLING
1150 if (negative > 0)
1151 for (d = 0; d < negative + 1; d++) {
1152 if (d == 0) {
1153 target = word;
1154 label = 1;
1155 } else {
1156 next_random = next_random
1157 * (unsigned long long) 25214903917 + 11;
1158 if (word_to_group != NULL
1159 && word_to_group[word] != -1) {
1160 target = word;
1161 while (target == word) {
1162 target = group_to_table[word_to_group[word]
1163 * table_size
1164 + (next_random >> 16) % table_size];
1165 next_random = next_random
1166 * (unsigned long long) 25214903917
1167 + 11;
1168 }
1169 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1170 } else {
1171 target =
1172 table[(next_random >> 16) % table_size];
1173 }
1174 if (target == 0)
1175 target = next_random % (vocab_size - 1) + 1;
1176 if (target == word)
1177 continue;
1178 label = 0;
1179 }
1180 l2 = target * window_layer_size;
1181 f = 0;
1182 for (c = 0; c < window_layer_size; c++)
1183 f += neu1[c] * syn1neg_window[c + l2];
1184 if (f > MAX_EXP)
1185 g = (label - 1) * alpha;
1186 else if (f < -MAX_EXP)
1187 g = (label - 0) * alpha;
1188 else
1189 g = (label
1190 - expTable[(int) ((f + MAX_EXP)
1191 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1192 * alpha;
1193 for (c = 0; c < window_layer_size; c++)
1194 neu1e[c] += g * syn1neg_window[c + l2];
1195 for (c = 0; c < window_layer_size; c++)
1196 syn1neg_window[c + l2] += g * neu1[c];
1197 if (cap == 1)
1198 for (c = 0; c < window_layer_size; c++)
1199 capParam(syn1neg_window, c + l2);
1200 }
1201 // Noise Contrastive Estimation
1202 if (nce > 0)
1203 for (d = 0; d < nce + 1; d++) {
1204 if (d == 0) {
1205 target = word;
1206 label = 1;
1207 } else {
1208 next_random = next_random
1209 * (unsigned long long) 25214903917 + 11;
1210 if (word_to_group != NULL
1211 && word_to_group[word] != -1) {
1212 target = word;
1213 while (target == word) {
1214 target = group_to_table[word_to_group[word]
1215 * table_size
1216 + (next_random >> 16) % table_size];
1217 next_random = next_random
1218 * (unsigned long long) 25214903917
1219 + 11;
1220 }
1221 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1222 } else {
1223 target =
1224 table[(next_random >> 16) % table_size];
1225 }
1226 if (target == 0)
1227 target = next_random % (vocab_size - 1) + 1;
1228 if (target == word)
1229 continue;
1230 label = 0;
1231 }
1232 l2 = target * window_layer_size;
1233 f = 0;
1234 for (c = 0; c < window_layer_size; c++)
1235 f += neu1[c] * syn1nce_window[c + l2];
1236 if (f > MAX_EXP)
1237 g = (label - 1) * alpha;
1238 else if (f < -MAX_EXP)
1239 g = (label - 0) * alpha;
1240 else {
1241 f = exp(f);
1242 g =
1243 (label
1244 - f
1245 / (noise_distribution[target]
1246 * nce + f)) * alpha;
1247 }
1248 for (c = 0; c < window_layer_size; c++)
1249 neu1e[c] += g * syn1nce_window[c + l2];
1250 for (c = 0; c < window_layer_size; c++)
1251 syn1nce_window[c + l2] += g * neu1[c];
1252 if (cap == 1)
1253 for (c = 0; c < window_layer_size; c++)
1254 capParam(syn1nce_window, c + l2);
1255 }
1256 // hidden -> in
1257 for (a = 0; a < window * 2 + 1; a++)
1258 if (a != window) {
1259 c = sentence_position - window + a;
1260 if (c < 0)
1261 continue;
1262 if (c >= sentence_length)
1263 continue;
1264 last_word = sen[c];
1265 if (last_word == -1)
1266 continue;
1267 window_offset = a * layer1_size;
1268 if (a > window)
1269 window_offset -= layer1_size;
1270 for (c = 0; c < layer1_size; c++)
1271 syn0[c + last_word * layer1_size] += neu1e[c
1272 + window_offset];
1273 }
1274 }
1275 } else if (type == 3) { //train structured skip-gram
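// Structured skip-gram: every relative window position keeps its own block of
// output vectors, selected via window_offset, so the model learns
// position-dependent context representations. The -2 placeholders inserted by
// subsampling are skipped here but keep the window positions aligned.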
1276 for (a = 0; a < window * 2 + 1; a++)
1277 if (a != window) {
1278 c = sentence_position - window + a;
1279 if (c < 0)
1280 continue;
1281 if(sen[c] == -2)
1282 continue;
1283 if (c >= sentence_length)
1284 continue;
1285 last_word = sen[c];
1286 if (last_word == -1)
1287 continue;
1288 l1 = last_word * layer1_size;
1289 window_offset = a * layer1_size;
1290 if (a > window)
1291 window_offset -= layer1_size;
1292 for (c = 0; c < layer1_size; c++)
1293 neu1e[c] = 0;
1294 // HIERARCHICAL SOFTMAX
1295 if (hs)
1296 for (d = 0; d < vocab[word].codelen; d++) {
1297 f = 0;
1298 l2 = vocab[word].point[d] * window_layer_size;
1299 // Propagate hidden -> output
1300 for (c = 0; c < layer1_size; c++)
1301 f += syn0[c + l1]
1302 * syn1_window[c + l2 + window_offset];
1303 if (f <= -MAX_EXP)
1304 continue;
1305 else if (f >= MAX_EXP)
1306 continue;
1307 else
1308 f = expTable[(int) ((f + MAX_EXP)
1309 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1310 // 'g' is the gradient multiplied by the learning rate
1311 g = (1 - vocab[word].code[d] - f) * alpha;
1312 // Propagate errors output -> hidden
1313 for (c = 0; c < layer1_size; c++)
1314 neu1e[c] += g
1315 * syn1_window[c + l2 + window_offset];
1316 // Learn weights hidden -> output
1317 for (c = 0; c < layer1_size; c++)
1318 syn1_window[c + l2 + window_offset] += g
1319 * syn0[c + l1];
1320 if (cap == 1)
1321 for (c = 0; c < layer1_size; c++)
1322 capParam(syn1_window, c + l2 + window_offset);
1323 }
1324 // NEGATIVE SAMPLING
1325 if (negative > 0)
1326 for (d = 0; d < negative + 1; d++) {
1327 if (d == 0) {
1328 target = word;
1329 label = 1;
1330 } else {
1331 next_random = next_random
1332 * (unsigned long long) 25214903917 + 11;
1333 if (word_to_group != NULL
1334 && word_to_group[word] != -1) {
1335 target = word;
1336 while (target == word) {
1337 target =
1338 group_to_table[word_to_group[word]
1339 * table_size
1340 + (next_random >> 16)
1341 % table_size];
1342 next_random =
1343 next_random
1344 * (unsigned long long) 25214903917
1345 + 11;
1346 }
1347 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1348 } else {
1349 target = table[(next_random >> 16)
1350 % table_size];
1351 }
1352 if (target == 0)
1353 target = next_random % (vocab_size - 1) + 1;
1354 if (target == word)
1355 continue;
1356 label = 0;
1357 }
1358 l2 = target * window_layer_size;
1359 f = 0;
1360 for (c = 0; c < layer1_size; c++)
1361 f +=
1362 syn0[c + l1]
1363 * syn1neg_window[c + l2
1364 + window_offset];
1365 if (f > MAX_EXP)
1366 g = (label - 1) * alpha;
1367 else if (f < -MAX_EXP)
1368 g = (label - 0) * alpha;
1369 else
1370 g =
1371 (label
1372 - expTable[(int) ((f + MAX_EXP)
1373 * (EXP_TABLE_SIZE
1374 / MAX_EXP / 2))])
1375 * alpha;
1376 if(debug_mode > 2 && ((long long) id) == 0) {
1377 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1378 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1379 }
1380 for (c = 0; c < layer1_size; c++)
1381 neu1e[c] +=
1382 g
1383 * syn1neg_window[c + l2
1384 + window_offset];
1385 for (c = 0; c < layer1_size; c++)
1386 syn1neg_window[c + l2 + window_offset] += g
1387 * syn0[c + l1];
1388 if (cap == 1)
1389 for (c = 0; c < layer1_size; c++)
1390 capParam(syn1neg_window,
1391 c + l2 + window_offset);
1392 }
1393 // Noise Contrastive Estimation
1394 if (nce > 0)
1395 for (d = 0; d < nce + 1; d++) {
1396 if (d == 0) {
1397 target = word;
1398 label = 1;
1399 } else {
1400 next_random = next_random
1401 * (unsigned long long) 25214903917 + 11;
1402 if (word_to_group != NULL
1403 && word_to_group[word] != -1) {
1404 target = word;
1405 while (target == word) {
1406 target =
1407 group_to_table[word_to_group[word]
1408 * table_size
1409 + (next_random >> 16)
1410 % table_size];
1411 next_random =
1412 next_random
1413 * (unsigned long long) 25214903917
1414 + 11;
1415 }
1416 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1417 } else {
1418 target = table[(next_random >> 16)
1419 % table_size];
1420 }
1421 if (target == 0)
1422 target = next_random % (vocab_size - 1) + 1;
1423 if (target == word)
1424 continue;
1425 label = 0;
1426 }
1427 l2 = target * window_layer_size;
1428 f = 0;
1429 for (c = 0; c < layer1_size; c++)
1430 f +=
1431 syn0[c + l1]
1432 * syn1nce_window[c + l2
1433 + window_offset];
1434 if (f > MAX_EXP)
1435 g = (label - 1) * alpha;
1436 else if (f < -MAX_EXP)
1437 g = (label - 0) * alpha;
1438 else {
1439 f = exp(f);
1440 g = (label
1441 - f
1442 / (noise_distribution[target]
1443 * nce + f)) * alpha;
1444 }
1445 for (c = 0; c < layer1_size; c++)
1446 neu1e[c] +=
1447 g
1448 * syn1nce_window[c + l2
1449 + window_offset];
1450 for (c = 0; c < layer1_size; c++)
1451 syn1nce_window[c + l2 + window_offset] += g
1452 * syn0[c + l1];
1453 if (cap == 1)
1454 for (c = 0; c < layer1_size; c++)
1455 capParam(syn1nce_window,
1456 c + l2 + window_offset);
1457 }
1458 // Learn weights input -> hidden
1459 for (c = 0; c < layer1_size; c++) {
1460 syn0[c + l1] += neu1e[c];
1461 if (syn0[c + l1] > 50)
1462 syn0[c + l1] = 50;
1463 if (syn0[c + l1] < -50)
1464 syn0[c + l1] = -50;
1465 }
1466 }
1467 } else if (type == 4) { //training senna
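// Senna-style model: the concatenated window (neu1) is fed through a hidden
// layer of window_hidden_size units (syn_window_hidden) with a hardTanh
// non-linearity, and the result is scored against per-word output vectors.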
1468 // in -> hidden
1469 cw = 0;
1470 for (a = 0; a < window * 2 + 1; a++)
1471 if (a != window) {
1472 c = sentence_position - window + a;
1473 if (c < 0)
1474 continue;
1475 if (c >= sentence_length)
1476 continue;
1477 last_word = sen[c];
1478 if (last_word == -1)
1479 continue;
1480 window_offset = a * layer1_size;
1481 if (a > window)
1482 window_offset -= layer1_size;
1483 for (c = 0; c < layer1_size; c++)
1484 neu1[c + window_offset] += syn0[c
1485 + last_word * layer1_size];
1486 cw++;
1487 }
1488 if (cw) {
1489 for (a = 0; a < window_hidden_size; a++) {
1490 c = a * window_layer_size;
1491 for (b = 0; b < window_layer_size; b++) {
1492 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1493 }
1494 }
1495 if (hs)
1496 for (d = 0; d < vocab[word].codelen; d++) {
1497 f = 0;
1498 l2 = vocab[word].point[d] * window_hidden_size;
1499 // Propagate hidden -> output
1500 for (c = 0; c < window_hidden_size; c++)
1501 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1502 if (f <= -MAX_EXP)
1503 continue;
1504 else if (f >= MAX_EXP)
1505 continue;
1506 else
1507 f = expTable[(int) ((f + MAX_EXP)
1508 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1509 // 'g' is the gradient multiplied by the learning rate
1510 g = (1 - vocab[word].code[d] - f) * alpha;
1511 // Propagate errors output -> hidden
1512 for (c = 0; c < window_hidden_size; c++)
1513 neu2e[c] += dHardTanh(neu2[c], g) * g
1514 * syn_hidden_word[c + l2];
1515 // Learn weights hidden -> output
1516 for (c = 0; c < window_hidden_size; c++)
1517 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1518 * neu2[c];
1519 }
1520 // NEGATIVE SAMPLING
1521 if (negative > 0)
1522 for (d = 0; d < negative + 1; d++) {
1523 if (d == 0) {
1524 target = word;
1525 label = 1;
1526 } else {
1527 next_random = next_random
1528 * (unsigned long long) 25214903917 + 11;
1529 if (word_to_group != NULL
1530 && word_to_group[word] != -1) {
1531 target = word;
1532 while (target == word) {
1533 target = group_to_table[word_to_group[word]
1534 * table_size
1535 + (next_random >> 16) % table_size];
1536 next_random = next_random
1537 * (unsigned long long) 25214903917
1538 + 11;
1539 }
1540 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1541 } else {
1542 target =
1543 table[(next_random >> 16) % table_size];
1544 }
1545 if (target == 0)
1546 target = next_random % (vocab_size - 1) + 1;
1547 if (target == word)
1548 continue;
1549 label = 0;
1550 }
1551 l2 = target * window_hidden_size;
1552 f = 0;
1553 for (c = 0; c < window_hidden_size; c++)
1554 f += hardTanh(neu2[c])
1555 * syn_hidden_word_neg[c + l2];
1556 if (f > MAX_EXP)
1557 g = (label - 1) * alpha / negative;
1558 else if (f < -MAX_EXP)
1559 g = (label - 0) * alpha / negative;
1560 else
1561 g = (label
1562 - expTable[(int) ((f + MAX_EXP)
1563 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1564 * alpha / negative;
1565 for (c = 0; c < window_hidden_size; c++)
1566 neu2e[c] += dHardTanh(neu2[c], g) * g
1567 * syn_hidden_word_neg[c + l2];
1568 for (c = 0; c < window_hidden_size; c++)
1569 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1570 * g * neu2[c];
1571 }
1572 for (a = 0; a < window_hidden_size; a++)
1573 for (b = 0; b < window_layer_size; b++)
1574 neu1e[b] += neu2e[a]
1575 * syn_window_hidden[a * window_layer_size + b];
1576 for (a = 0; a < window_hidden_size; a++)
1577 for (b = 0; b < window_layer_size; b++)
1578 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1579 * neu1[b];
1580 // hidden -> in
1581 for (a = 0; a < window * 2 + 1; a++)
1582 if (a != window) {
1583 c = sentence_position - window + a;
1584 if (c < 0)
1585 continue;
1586 if (c >= sentence_length)
1587 continue;
1588 last_word = sen[c];
1589 if (last_word == -1)
1590 continue;
1591 window_offset = a * layer1_size;
1592 if (a > window)
1593 window_offset -= layer1_size;
1594 for (c = 0; c < layer1_size; c++)
1595 syn0[c + last_word * layer1_size] += neu1e[c
1596 + window_offset];
1597 }
1598 }
1599 } else {
1600 printf("unknown type %i", type);
1601 exit(0);
1602 }
1603 sentence_position++;
1604 if (sentence_position >= sentence_length) {
1605 sentence_length = 0;
1606 continue;
1607 }
1608 }
1609 fclose(fi);
1610 free(neu1);
1611 free(neu1e);
1612 pthread_exit(NULL);
1613}
1614
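// For every vocabulary word from rank `cc` upwards, score all candidate words
// at each window position with sigmoid(syn0[d] . syn1neg_window[target, pos]),
// print the best collocate per position, a running per-target sum, and the
// overall top-10 with their positions. Only meaningful for a structured
// skip-gram (type 3) net with negative sampling, typically loaded via -read-net.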
1615void ShowCollocations() {
1616 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
1617 real f, max_f, maxmax_f;
1618 real *target_sums, bestf[MAX_CC], worstbest;
1619 long besti[MAX_CC];
1620 int N = 10, bestp[MAX_CC];
1621 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1622
1623 for (d = cc; d < vocab_size; d++) {
1624 for (b = 0; b < vocab_size; b++)
1625 target_sums[b]=0;
1626 for (b = 0; b < N; b++)
1627 bestf[b]=-1;
1628 worstbest = -1;
1629
1630 maxmax_f = -1;
1631 maxmax_target = 0;
1632 for (a = window * 2; a >= 0; a--) {
1633 if (a != window) {
1634 max_f = -1;
1635 window_offset = a * layer1_size;
1636 if (a > window)
1637 window_offset -= layer1_size;
1638 for(target = 0; target < vocab_size; target ++) {
1639 if(target == d)
1640 continue;
1641 f = 0;
1642 for (c = 0; c < layer1_size; c++)
1643 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1644 if (f < -MAX_EXP)
1645 continue;
1646 else if (f > MAX_EXP)
1647 continue;
1648 else
1649 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1650 if(f > max_f) {
1651 max_f = f;
1652 max_target = target;
1653 }
1654 target_sums[target] += (1-target_sums[target]) * f;
1655 if(f > worstbest) {
1656 for (b = 0; b < N; b++) {
1657 if (f > bestf[b]) {
1658 for (e = N - 1; e > b; e--) {
1659 bestf[e] = bestf[e - 1];
1660 besti[e] = besti[e - 1];
1661 bestp[e] = bestp[e - 1];
1662 }
1663 bestf[b] = f;
1664 besti[b] = target;
1665 bestp[b] = window-a;
1666 break;
1667 }
1668 }
1669 worstbest = bestf[N-1];
1670 }
1671 }
1672 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1673 if(max_f > maxmax_f) {
1674 maxmax_f = max_f;
1675 maxmax_target = max_target;
1676 }
1677 } else {
1678 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1679 }
1680 }
1681 max_f = -1;
1682 for (b = 0; b < vocab_size; b++) {
1683 if(target_sums[b] > max_f) {
1684 max_f = target_sums[b];
1685 max_target = b;
1686 }
1687 }
1688 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1689 vocab[max_target].word, max_f,
1690 vocab[maxmax_target].word, maxmax_f);
1691 for(b=0; b<N && bestf[b]>-1; b++)
1692 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1693 printf("\n");
1694 }
1695}
1696
1697void TrainModel() {
1698 long a, b, c, d;
1699 FILE *fo;
1700 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1701 printf("Starting training using file %s\n", train_file);
1702 starting_alpha = alpha;
1703 if (read_vocab_file[0] != 0)
1704 ReadVocab();
1705 else
1706 LearnVocabFromTrainFile();
1707 if (save_vocab_file[0] != 0)
1708 SaveVocab();
1709 if (output_file[0] == 0)
1710 return;
1711 InitNet();
1712 if(cc > 0)
1713 ShowCollocations();
1714 if (negative > 0 || nce > 0)
1715 InitUnigramTable();
1716 if (negative_classes_file[0] != 0)
1717 InitClassUnigramTable();
1718 start = clock();
1719 for (a = 0; a < num_threads; a++)
1720 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1721 for (a = 0; a < num_threads; a++)
1722 pthread_join(pt[a], NULL);
1723 fo = fopen(output_file, "wb");
1724 if (classes == 0) {
1725 // Save the word vectors
1726 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1727 for (a = 0; a < vocab_size; a++) {
1728 fprintf(fo, "%s ", vocab[a].word);
1729 if (binary)
1730 for (b = 0; b < layer1_size; b++)
1731 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1732 else
1733 for (b = 0; b < layer1_size; b++)
1734 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1735 fprintf(fo, "\n");
1736 }
1737 } else {
1738 // Run K-means on the word vectors
1739 int clcn = classes, iter = 10, closeid;
1740 int *centcn = (int *) malloc(classes * sizeof(int));
1741 int *cl = (int *) calloc(vocab_size, sizeof(int));
1742 real closev, x;
1743 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1744 for (a = 0; a < vocab_size; a++)
1745 cl[a] = a % clcn;
1746 for (a = 0; a < iter; a++) {
1747 for (b = 0; b < clcn * layer1_size; b++)
1748 cent[b] = 0;
1749 for (b = 0; b < clcn; b++)
1750 centcn[b] = 1;
1751 for (c = 0; c < vocab_size; c++) {
1752 for (d = 0; d < layer1_size; d++)
1753 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1754 centcn[cl[c]]++;
1755 }
1756 for (b = 0; b < clcn; b++) {
1757 closev = 0;
1758 for (c = 0; c < layer1_size; c++) {
1759 cent[layer1_size * b + c] /= centcn[b];
1760 closev += cent[layer1_size * b + c]
1761 * cent[layer1_size * b + c];
1762 }
1763 closev = sqrt(closev);
1764 for (c = 0; c < layer1_size; c++)
1765 cent[layer1_size * b + c] /= closev;
1766 }
1767 for (c = 0; c < vocab_size; c++) {
1768 closev = -10;
1769 closeid = 0;
1770 for (d = 0; d < clcn; d++) {
1771 x = 0;
1772 for (b = 0; b < layer1_size; b++)
1773 x += cent[layer1_size * d + b]
1774 * syn0[c * layer1_size + b];
1775 if (x > closev) {
1776 closev = x;
1777 closeid = d;
1778 }
1779 }
1780 cl[c] = closeid;
1781 }
1782 }
1783 // Save the K-means classes
1784 for (a = 0; a < vocab_size; a++)
1785 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1786 free(centcn);
1787 free(cent);
1788 free(cl);
1789 }
1790 fclose(fo);
1791 if (save_net_file[0] != 0)
1792 SaveNet();
1793}
1794
1795int ArgPos(char *str, int argc, char **argv) {
1796 int a;
1797 for (a = 1; a < argc; a++)
1798 if (!strcmp(str, argv[a])) {
1799 if (a == argc - 1) {
1800 printf("Argument missing for %s\n", str);
1801 exit(1);
1802 }
1803 return a;
1804 }
1805 return -1;
1806}
1807
1808int main(int argc, char **argv) {
1809 int i;
1810 if (argc == 1) {
1811 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1812 printf("Options:\n");
1813 printf("Parameters for training:\n");
1814 printf("\t-train <file>\n");
1815 printf("\t\tUse text data from <file> to train the model\n");
1816 printf("\t-output <file>\n");
1817 printf(
1818 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1819 printf("\t-size <int>\n");
1820 printf("\t\tSet size of word vectors; default is 100\n");
1821 printf("\t-window <int>\n");
1822 printf("\t\tSet max skip length between words; default is 5\n");
1823 printf("\t-sample <float>\n");
1824 printf(
1825 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1826 printf(
1827 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1828 printf("\t-hs <int>\n");
1829 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1830 printf("\t-negative <int>\n");
1831 printf(
1832 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1833 printf("\t-negative-classes <file>\n");
1834 printf("\t\tNegative classes to sample from\n");
1835 printf("\t-nce <int>\n");
1836 printf(
1837 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1838 printf("\t-threads <int>\n");
1839 printf("\t\tUse <int> threads (default 12)\n");
1840 printf("\t-iter <int>\n");
1841 printf("\t\tRun more training iterations (default 5)\n");
1842 printf("\t-min-count <int>\n");
1843 printf(
1844 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1845 printf("\t-alpha <float>\n");
1846 printf(
1847 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1848 printf("\t-classes <int>\n");
1849 printf(
1850 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1851 printf("\t-debug <int>\n");
1852 printf(
1853 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1854 printf("\t-binary <int>\n");
1855 printf(
1856 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1857 printf("\t-save-vocab <file>\n");
1858 printf("\t\tThe vocabulary will be saved to <file>\n");
1859 printf("\t-read-vocab <file>\n");
1860 printf(
1861 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1862 printf("\t-read-net <file>\n");
1863 printf(
1864 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1865 printf("\t-save-net <file>\n");
1866 printf("\t\tThe net parameters will be saved to <file>\n");
1867 printf("\t-show-cc <int>\n");
1868 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
1869 printf("\t-type <int>\n");
1870 printf(
1871 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1872 printf("\t-cap <int>\n");
1873 printf(
1874 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1875 printf("\nExamples:\n");
1876 printf(
1877 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1878 return 0;
1879 }
1880 output_file[0] = 0;
1881 save_vocab_file[0] = 0;
1882 read_vocab_file[0] = 0;
1883 save_net_file[0] = 0;
1884 read_net_file[0] = 0;
1885 negative_classes_file[0] = 0;
1886 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1887 layer1_size = atoi(argv[i + 1]);
1888 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1889 strcpy(train_file, argv[i + 1]);
1890 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1891 strcpy(save_vocab_file, argv[i + 1]);
1892 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1893 strcpy(read_vocab_file, argv[i + 1]);
1894 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1895 strcpy(save_net_file, argv[i + 1]);
1896 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1897 strcpy(read_net_file, argv[i + 1]);
1898 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1899 debug_mode = atoi(argv[i + 1]);
1900 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1901 binary = atoi(argv[i + 1]);
1902 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
1903 cc = atoi(argv[i + 1]);
1904 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1905 type = atoi(argv[i + 1]);
1906 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1907 strcpy(output_file, argv[i + 1]);
1908 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1909 window = atoi(argv[i + 1]);
1910 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1911 sample = atof(argv[i + 1]);
1912 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1913 hs = atoi(argv[i + 1]);
1914 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1915 negative = atoi(argv[i + 1]);
1916 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1917 strcpy(negative_classes_file, argv[i + 1]);
1918 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1919 nce = atoi(argv[i + 1]);
1920 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1921 num_threads = atoi(argv[i + 1]);
1922 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1923 iter = atoi(argv[i + 1]);
1924 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1925 min_count = atoi(argv[i + 1]);
1926 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1927 classes = atoi(argv[i + 1]);
1928 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1929 cap = atoi(argv[i + 1]);
1930 if (type == 0 || type == 2 || type == 4)
1931 alpha = 0.05;
1932 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1933 alpha = atof(argv[i + 1]);
1934 vocab = (struct vocab_word *) calloc(vocab_max_size,
1935 sizeof(struct vocab_word));
1936 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1937 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
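// expTable[i] caches sigmoid(x) for x evenly spaced in [-MAX_EXP, MAX_EXP);
// the training code looks values up with the index
// (f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2).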
1938 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1939 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1940 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1941 }
1942 TrainModel();
1943 return 0;
1944}
1945