1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
#include <time.h> // clock(), clock_t and CLOCKS_PER_SEC are used in TrainModelThread; often pulled in indirectly, but required for portability
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CC 100
26#define MAX_CODE_LENGTH 40
27
28const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
29
30typedef float real; // Precision of float numbers
31
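// One vocabulary entry: cn is the raw corpus count and word the zero-terminated
// string; point, code and codelen are filled in by CreateBinaryTree() and hold
// the Huffman path (inner-node indices), the binary code and its length, which
// the hierarchical-softmax branches below rely on.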
32struct vocab_word {
33 long long cn;
34 int *point;
35 char *word, *code, codelen;
36};
37
38char train_file[MAX_STRING], output_file[MAX_STRING];
39char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
40char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
41struct vocab_word *vocab;
42int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
43 num_threads = 12, min_reduce = 1;
44int *vocab_hash;
45long long *threadPos;
46 int *threadIters;
47long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
48long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
49 classes = 0;
50real alpha = 0.025, starting_alpha, sample = 1e-3;
51real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
52real avgWordLength=0;
53clock_t start;
54
55real *syn1_window, *syn1neg_window, *syn1nce_window;
56int w_offset, window_layer_size;
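// For the window-based models (types 2 and 3) every word has one output vector
// per context position: window_layer_size = layer1_size * window * 2 (set in
// InitNet), and syn1_window / syn1neg_window are indexed as
// word * window_layer_size + position_offset + component.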
57
58int window_hidden_size = 500;
59real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
60 *syn_hidden_word_nce;
61
62int hs = 0, negative = 5;
63const int table_size = 1e8;
64int *table;
65
66long cc = 0;
67
68// contrastive negative sampling
69char negative_classes_file[MAX_STRING];
70int *word_to_group;
71int *group_to_table; //group_size*table_size
72int class_number;
73
74//nce
75real* noise_distribution;
76int nce = 0;
77
78//param caps
79real CAP_VALUE = 50;
80int cap = 0;
81
82void capParam(real* array, int index) {
83 if (array[index] > CAP_VALUE)
84 array[index] = CAP_VALUE;
85 else if (array[index] < -CAP_VALUE)
86 array[index] = -CAP_VALUE;
87}
88
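// hardTanh/dHardTanh implement the clipped-linear activation used by the
// senna-style model (type 4): the forward pass clamps to [-1, 1], and the
// "derivative" below simply gates the gradient to 0 whenever the unit is
// saturated in the direction the gradient would push it further.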
89real hardTanh(real x) {
90 if (x >= 1) {
91 return 1;
92 } else if (x <= -1) {
93 return -1;
94 } else {
95 return x;
96 }
97}
98
99real dHardTanh(real x, real g) {
100 if (x > 1 && g > 0) {
101 return 0;
102 }
103 if (x < -1 && g < 0) {
104 return 0;
105 }
106 return 1;
107}
108
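// Builds the table used for negative sampling: each word w gets a share of the
// table_size (1e8) slots proportional to cn(w)^0.75, so a draw
//   target = table[(next_random >> 16) % table_size];
// samples from the smoothed unigram distribution P(w) = cn(w)^0.75 / sum_v cn(v)^0.75.
// noise_distribution stores the same probabilities in normalized form for NCE.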
109void InitUnigramTable() {
110 int a, i;
111 long long train_words_pow = 0;
112 real d1, power = 0.75;
113 table = (int *) malloc(table_size * sizeof(int));
114 for (a = 0; a < vocab_size; a++)
115 train_words_pow += pow(vocab[a].cn, power);
116 i = 0;
117 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
118 for (a = 0; a < table_size; a++) {
119 table[a] = i;
120 if (a / (real) table_size > d1) {
121 i++;
122 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
123 }
124 if (i >= vocab_size)
125 i = vocab_size - 1;
126 }
127
128 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
129 for (a = 0; a < vocab_size; a++)
130 noise_distribution[a] = pow(vocab[a].cn, power)
131 / (real) train_words_pow;
132}
133
134// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
135void ReadWord(char *word, FILE *fin) {
136 int a = 0, ch;
137 while (!feof(fin)) {
138 ch = fgetc(fin);
139 if (ch == 13)
140 continue;
141 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
142 if (a > 0) {
143 if (ch == '\n')
144 ungetc(ch, fin);
145 break;
146 }
147 if (ch == '\n') {
148 strcpy(word, (char *) "</s>");
149 return;
150 } else
151 continue;
152 }
153 word[a] = ch;
154 a++;
155 if (a >= MAX_STRING - 1)
156 a--; // Truncate too long words
157 }
158 word[a] = 0;
159}
160
161// Returns hash value of a word
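// (Simple polynomial rolling hash with base 257; collisions in vocab_hash are
// resolved by linear probing, see SearchVocab and AddWordToVocab.)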
162int GetWordHash(char *word) {
163 unsigned long long a, hash = 0;
164 for (a = 0; a < strlen(word); a++)
165 hash = hash * 257 + word[a];
166 hash = hash % vocab_hash_size;
167 return hash;
168}
169
170// Returns position of a word in the vocabulary; if the word is not found, returns -1
171int SearchVocab(char *word) {
172 unsigned int hash = GetWordHash(word);
173 while (1) {
174 if (vocab_hash[hash] == -1)
175 return -1;
176 if (!strcmp(word, vocab[vocab_hash[hash]].word))
177 return vocab_hash[hash];
178 hash = (hash + 1) % vocab_hash_size;
179 }
180 return -1;
181}
182
183// Reads a word and returns its index in the vocabulary
184int ReadWordIndex(FILE *fin) {
185 char word[MAX_STRING];
186 ReadWord(word, fin);
187 if (feof(fin))
188 return -1;
189 return SearchVocab(word);
190}
191
192// Adds a word to the vocabulary
193int AddWordToVocab(char *word) {
194 unsigned int hash, length = strlen(word) + 1;
195 if (length > MAX_STRING)
196 length = MAX_STRING;
197 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
198 strcpy(vocab[vocab_size].word, word);
199 vocab[vocab_size].cn = 0;
200 vocab_size++;
201 // Reallocate memory if needed
202 if (vocab_size + 2 >= vocab_max_size) {
203 vocab_max_size += 1000;
204 vocab = (struct vocab_word *) realloc(vocab,
205 vocab_max_size * sizeof(struct vocab_word));
206 }
207 hash = GetWordHash(word);
208 while (vocab_hash[hash] != -1)
209 hash = (hash + 1) % vocab_hash_size;
210 vocab_hash[hash] = vocab_size - 1;
211 return vocab_size - 1;
212}
213
214// Used later for sorting by word counts
215int VocabCompare(const void *a, const void *b) {
216 return (((struct vocab_word *) b)->cn > ((struct vocab_word *) a)->cn) - (((struct vocab_word *) b)->cn < ((struct vocab_word *) a)->cn); // sign comparison avoids truncating the long long difference to int
217}
218
219// Sorts the vocabulary by frequency using word counts
220void SortVocab() {
221 int a, size;
222 unsigned int hash;
223 // Sort the vocabulary and keep </s> at the first position
224 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
225 for (a = 0; a < vocab_hash_size; a++)
226 vocab_hash[a] = -1;
227 size = vocab_size;
228 train_words = 0;
229 for (a = 0; a < size; a++) {
230 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
231 // Words occurring less than min_count times will be discarded from the vocab
232 if ((vocab[a].cn < min_count) && (a != 0)) {
233 vocab_size--;
234 free(vocab[a].word);
235 } else {
236 // Hash has to be re-computed, as it is no longer valid after sorting
237 hash = GetWordHash(vocab[a].word);
238 while (vocab_hash[hash] != -1)
239 hash = (hash + 1) % vocab_hash_size;
240 vocab_hash[hash] = a;
241 train_words += vocab[a].cn;
242 }
243 }
244 avgWordLength /= train_words;
245 vocab = (struct vocab_word *) realloc(vocab,
246 (vocab_size + 1) * sizeof(struct vocab_word));
247 // Allocate memory for the binary tree construction
248 for (a = 0; a < vocab_size; a++) {
249 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
250 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
251 }
252}
253
254// Reduces the vocabulary by removing infrequent tokens
255void ReduceVocab() {
256 int a, b = 0;
257 unsigned int hash;
258 for (a = 0; a < vocab_size; a++)
259 if (vocab[a].cn > min_reduce) {
260 vocab[b].cn = vocab[a].cn;
261 vocab[b].word = vocab[a].word;
262 b++;
263 } else
264 free(vocab[a].word);
265 vocab_size = b;
266 for (a = 0; a < vocab_hash_size; a++)
267 vocab_hash[a] = -1;
268 for (a = 0; a < vocab_size; a++) {
269 // Hash has to be re-computed, as it is no longer valid
270 hash = GetWordHash(vocab[a].word);
271 while (vocab_hash[hash] != -1)
272 hash = (hash + 1) % vocab_hash_size;
273 vocab_hash[hash] = a;
274 }
275 fflush(stdout);
276 min_reduce++;
277}
278
279// Create binary Huffman tree using the word counts
280 // Frequent words will have short unique binary codes
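// The two cheapest remaining nodes are merged repeatedly (pos1 walks the sorted
// word counts, pos2 the newly created inner nodes); each word then reads off its
// code (left/right decisions) and point (inner-node indices) along the path to
// the root, so frequent words end up with short codes and few hierarchical-softmax
// updates per occurrence.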
281void CreateBinaryTree() {
282 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
283 char code[MAX_CODE_LENGTH];
284 long long *count = (long long *) calloc(vocab_size * 2 + 1,
285 sizeof(long long));
286 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
287 sizeof(long long));
288 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
289 sizeof(long long));
290 for (a = 0; a < vocab_size; a++)
291 count[a] = vocab[a].cn;
292 for (a = vocab_size; a < vocab_size * 2; a++)
293 count[a] = 1e15;
294 pos1 = vocab_size - 1;
295 pos2 = vocab_size;
296 // Following algorithm constructs the Huffman tree by adding one node at a time
297 for (a = 0; a < vocab_size - 1; a++) {
298 // First, find two smallest nodes 'min1, min2'
299 if (pos1 >= 0) {
300 if (count[pos1] < count[pos2]) {
301 min1i = pos1;
302 pos1--;
303 } else {
304 min1i = pos2;
305 pos2++;
306 }
307 } else {
308 min1i = pos2;
309 pos2++;
310 }
311 if (pos1 >= 0) {
312 if (count[pos1] < count[pos2]) {
313 min2i = pos1;
314 pos1--;
315 } else {
316 min2i = pos2;
317 pos2++;
318 }
319 } else {
320 min2i = pos2;
321 pos2++;
322 }
323 count[vocab_size + a] = count[min1i] + count[min2i];
324 parent_node[min1i] = vocab_size + a;
325 parent_node[min2i] = vocab_size + a;
326 binary[min2i] = 1;
327 }
328 // Now assign binary code to each vocabulary word
329 for (a = 0; a < vocab_size; a++) {
330 b = a;
331 i = 0;
332 while (1) {
333 code[i] = binary[b];
334 point[i] = b;
335 i++;
336 b = parent_node[b];
337 if (b == vocab_size * 2 - 2)
338 break;
339 }
340 vocab[a].codelen = i;
341 vocab[a].point[0] = vocab_size - 2;
342 for (b = 0; b < i; b++) {
343 vocab[a].code[i - b - 1] = code[b];
344 vocab[a].point[i - b] = point[b] - vocab_size;
345 }
346 }
347 free(count);
348 free(binary);
349 free(parent_node);
350}
351
352void LearnVocabFromTrainFile() {
353 char word[MAX_STRING];
354 FILE *fin;
355 long long a, i;
356 for (a = 0; a < vocab_hash_size; a++)
357 vocab_hash[a] = -1;
358 fin = fopen(train_file, "rb");
359 if (fin == NULL) {
360 printf("ERROR: training data file not found!\n");
361 exit(1);
362 }
363 vocab_size = 0;
364 AddWordToVocab((char *) "</s>");
365 while (1) {
366 ReadWord(word, fin);
367 if (feof(fin))
368 break;
369 train_words++;
370 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
371 printf("%lldK%c", train_words / 1000, 13);
372 fflush(stdout);
373 }
374 i = SearchVocab(word);
375 if (i == -1) {
376 a = AddWordToVocab(word);
377 vocab[a].cn = 1;
378 } else
379 vocab[i].cn++;
380 if (vocab_size > vocab_hash_size * 0.7)
381 ReduceVocab();
382 }
383 SortVocab();
384 if (debug_mode > 0) {
385 printf("Vocab size: %lld\n", vocab_size);
386 printf("Words in train file: %lld\n", train_words);
387 }
388 file_size = ftell(fin);
389 fclose(fin);
390}
391
392void SaveVocab() {
393 long long i;
394 FILE *fo = fopen(save_vocab_file, "wb");
395 for (i = 0; i < vocab_size; i++)
396 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
397 fclose(fo);
398}
399
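// Reads "<word> <count>" pairs from -read-vocab. Because that vocabulary was
// presumably built on a (possibly different or larger) corpus than -train, the
// number of tokens to process is afterwards re-estimated as
// file_size / avgWordLength, where avgWordLength is the count-weighted average
// token length (including the separator) computed in SortVocab.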
400void ReadVocab() {
401 long long a, i = 0;
402 char c;
403 char word[MAX_STRING];
404 FILE *fin = fopen(read_vocab_file, "rb");
405 if (fin == NULL) {
406 printf("Vocabulary file not found\n");
407 exit(1);
408 }
409 for (a = 0; a < vocab_hash_size; a++)
410 vocab_hash[a] = -1;
411 vocab_size = 0;
412 while (1) {
413 ReadWord(word, fin);
414 if (feof(fin))
415 break;
416 a = AddWordToVocab(word);
417 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
418 i++;
419 }
420 fclose(fin);
421 fin = fopen(train_file, "rb");
422 if (fin == NULL) {
423 printf("ERROR: training data file not found!\n");
424 exit(1);
425 }
426 fseek(fin, 0, SEEK_END);
427 file_size = ftell(fin);
428 fclose(fin);
429 SortVocab();
430 if (debug_mode > 0) {
431 printf("Vocab size: %lld\n", vocab_size);
432 if(*read_vocab_file) {
433 printf("Words in vocab's train file: %lld\n", train_words);
434 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
435 } else {
436 printf("Words in train file: %lld\n", train_words);
437 }
438 }
439 if(*read_vocab_file) {
440 train_words = file_size / avgWordLength;
441 if(debug_mode > 0)
442 printf("Estimated words in train file: %lld\n", train_words);
443 }
444}
445
446void InitClassUnigramTable() {
447 long long a, c;
448 printf("loading class unigrams \n");
449 FILE *fin = fopen(negative_classes_file, "rb");
450 if (fin == NULL) {
451 printf("ERROR: class file not found!\n");
452 exit(1);
453 }
454 word_to_group = (int *) malloc(vocab_size * sizeof(int));
455 for (a = 0; a < vocab_size; a++)
456 word_to_group[a] = -1;
457 char class[MAX_STRING];
458 char prev_class[MAX_STRING];
459 prev_class[0] = 0;
460 char word[MAX_STRING];
461 class_number = -1;
462 while (1) {
463 if (feof(fin))
464 break;
465 ReadWord(class, fin);
466 ReadWord(word, fin);
467 int word_index = SearchVocab(word);
468 if (word_index != -1) {
469 if (strcmp(class, prev_class) != 0) {
470 class_number++;
471 strcpy(prev_class, class);
472 }
473 word_to_group[word_index] = class_number;
474 }
475 ReadWord(word, fin);
476 }
477 class_number++;
478 fclose(fin);
479
480 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
481 long long train_words_pow = 0;
482 real d1, power = 0.75;
483
484 for (c = 0; c < class_number; c++) {
485 long long offset = c * table_size;
486 train_words_pow = 0;
487 for (a = 0; a < vocab_size; a++)
488 if (word_to_group[a] == c)
489 train_words_pow += pow(vocab[a].cn, power);
490 int i = 0;
491 while (i < vocab_size && word_to_group[i] != c) // check the bound first to avoid reading past the array
492 i++;
493 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
494 for (a = 0; a < table_size; a++) {
495 //printf("index %lld , word %d\n", a, i);
496 group_to_table[offset + a] = i;
497 if (a / (real) table_size > d1) {
498 i++;
499 while (i < vocab_size && word_to_group[i] != c)
500 i++;
501 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
502 }
503 if (i >= vocab_size) {
 i = vocab_size - 1; // step back inside the vocabulary before scanning downwards
504 while (i >= 0 && word_to_group[i] != c)
505 i--;
 }
506 }
507 }
508}
509
510void SaveArgs(int argc, char **argv) {
511 unsigned int i;
512 size_t len = 0;
513 char *_all_args, *all_args;
514 char *args_file = (char *) malloc(strlen(output_file) + 6); // room for ".args" and '\0' (strdup alone would overflow on strcat)
515 strcpy(args_file, output_file); strcat(args_file, ".args");
516 FILE *fargs = fopen(args_file, "w");
517 if (fargs == NULL) {
518 printf("Cannot save args to %s.\n", args_file);
519 return;
520 }
521
522 for(i=1; i<argc; i++) {
523 len += strlen(argv[i]);
524 }
525
526 _all_args = all_args = (char *)malloc(len+argc-1);
527
528 for(i=1; i<argc; i++) {
529 memcpy(_all_args, argv[i], strlen(argv[i]));
530 _all_args += strlen(argv[i])+1;
531 *(_all_args-1) = ' ';
532 }
533 *(_all_args-1) = 0;
534
535 fprintf(fargs, "%s\n", all_args);
536 fclose(fargs);
537
538 free(all_args);
539
540 return;
541}
542
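// Dumps the raw weights (only supported for type 3 with negative sampling):
// syn0 as vocab_size * layer1_size reals, followed by syn1neg_window as
// vocab_size * window_layer_size reals. No header is written, so the file can
// only be loaded again via -read-net with the same vocabulary, -size and
// -window settings.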
543void SaveNet() {
544 if(type != 3 || negative <= 0) {
545 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
546 return;
547 }
548
549 FILE *fnet = fopen(save_net_file, "wb");
550 if (fnet == NULL) {
551 printf("Cannot open net parameter file for writing\n");
552 exit(1);
553 }
554 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
555 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
556 fclose(fnet);
557}
558
559void InitNet() {
560 long long a, b;
561 unsigned long long next_random = 1;
562 long long read;
563
564 window_layer_size = layer1_size * window * 2;
565 a = posix_memalign((void **) &syn0, 128,
566 (long long) vocab_size * layer1_size * sizeof(real));
567 if (syn0 == NULL) {
568 printf("Memory allocation failed\n");
569 exit(1);
570 }
571
572 if (hs) {
573 a = posix_memalign((void **) &syn1, 128,
574 (long long) vocab_size * layer1_size * sizeof(real));
575 if (syn1 == NULL) {
576 printf("Memory allocation failed\n");
577 exit(1);
578 }
579 a = posix_memalign((void **) &syn1_window, 128,
580 (long long) vocab_size * window_layer_size * sizeof(real));
581 if (syn1_window == NULL) {
582 printf("Memory allocation failed\n");
583 exit(1);
584 }
585 a = posix_memalign((void **) &syn_hidden_word, 128,
586 (long long) vocab_size * window_hidden_size * sizeof(real));
587 if (syn_hidden_word == NULL) {
588 printf("Memory allocation failed\n");
589 exit(1);
590 }
591
592 for (a = 0; a < vocab_size; a++)
593 for (b = 0; b < layer1_size; b++)
594 syn1[a * layer1_size + b] = 0;
595 for (a = 0; a < vocab_size; a++)
596 for (b = 0; b < window_layer_size; b++)
597 syn1_window[a * window_layer_size + b] = 0;
598 for (a = 0; a < vocab_size; a++)
599 for (b = 0; b < window_hidden_size; b++)
600 syn_hidden_word[a * window_hidden_size + b] = 0;
601 }
602 if (negative > 0) {
603 if(type == 0) {
604 a = posix_memalign((void **) &syn1neg, 128,
605 (long long) vocab_size * layer1_size * sizeof(real));
606 if (syn1neg == NULL) {
607 printf("Memory allocation failed\n");
608 exit(1);
609 }
610 for (a = 0; a < vocab_size; a++)
611 for (b = 0; b < layer1_size; b++)
612 syn1neg[a * layer1_size + b] = 0;
613 } else if (type == 3) {
614 a = posix_memalign((void **) &syn1neg_window, 128,
615 (long long) vocab_size * window_layer_size * sizeof(real));
616 if (syn1neg_window == NULL) {
617 printf("Memory allocation failed\n");
618 exit(1);
619 }
620 for (a = 0; a < vocab_size; a++)
621 for (b = 0; b < window_layer_size; b++)
622 syn1neg_window[a * window_layer_size + b] = 0;
623 } else if (type == 4) {
624 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
625 (long long) vocab_size * window_hidden_size * sizeof(real));
626 if (syn_hidden_word_neg == NULL) {
627 printf("Memory allocation failed\n");
628 exit(1);
629 }
630 for (a = 0; a < vocab_size; a++)
631 for (b = 0; b < window_hidden_size; b++)
632 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
633 }
634 }
635 if (nce > 0) {
636 a = posix_memalign((void **) &syn1nce, 128,
637 (long long) vocab_size * layer1_size * sizeof(real));
638 if (syn1nce == NULL) {
639 printf("Memory allocation failed\n");
640 exit(1);
641 }
642 a = posix_memalign((void **) &syn1nce_window, 128,
643 (long long) vocab_size * window_layer_size * sizeof(real));
644 if (syn1nce_window == NULL) {
645 printf("Memory allocation failed\n");
646 exit(1);
647 }
648 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
649 (long long) vocab_size * window_hidden_size * sizeof(real));
650 if (syn_hidden_word_nce == NULL) {
651 printf("Memory allocation failed\n");
652 exit(1);
653 }
654
655 for (a = 0; a < vocab_size; a++)
656 for (b = 0; b < layer1_size; b++)
657 syn1nce[a * layer1_size + b] = 0;
658 for (a = 0; a < vocab_size; a++)
659 for (b = 0; b < window_layer_size; b++)
660 syn1nce_window[a * window_layer_size + b] = 0;
661 for (a = 0; a < vocab_size; a++)
662 for (b = 0; b < window_hidden_size; b++)
663 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
664 }
665
666 if(type == 4) {
667 a = posix_memalign((void **) &syn_window_hidden, 128,
668 window_hidden_size * window_layer_size * sizeof(real));
669 if (syn_window_hidden == NULL) {
670 printf("Memory allocation failed\n");
671 exit(1);
672 }
673 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
674 next_random = next_random * (unsigned long long) 25214903917 + 11;
675 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
676 - 0.5) / (window_hidden_size * window_layer_size);
677 }
678 }
679
680 if (read_net_file[0] == 0) {
681 for (a = 0; a < vocab_size; a++)
682 for (b = 0; b < layer1_size; b++) {
683 next_random = next_random * (unsigned long long) 25214903917
684 + 11;
685 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
686 / (real) 65536) - 0.5) / layer1_size;
687 }
688 } else if(type == 3 && negative > 0) {
689 FILE *fnet = fopen(read_net_file, "rb");
690 if (fnet == NULL) {
691 printf("Net parameter file not found\n");
692 exit(1);
693 }
694 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
695 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
696 if(read != vocab_size * layer1_size) {
697 fprintf(stderr, "read-net failed %lld\n", read);
698 exit(-1);
699 }
700 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
701 if(read != (long long) vocab_size * window_layer_size) {
702 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
703 (long long) sizeof(real) * vocab_size * window_layer_size);
704 exit(-1);
705 }
706 fgetc(fnet);
707 if(!feof(fnet)) {
708 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
709 exit(-1);
710 }
711 fclose(fnet);
712 } else {
713 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
714 exit(-1);
715 }
716
717 CreateBinaryTree();
718}
719
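// One training thread: thread i starts reading at byte offset
// file_size / num_threads * i and processes roughly train_words / num_threads
// tokens per iteration. Sub-sampled tokens are stored as -2 in the sentence
// buffer for the structured skip-gram (type 3) so that window positions stay
// aligned; they are skipped again when they become the centre word.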
720void *TrainModelThread(void *id) {
721 long long a, b, d, cw, word, last_word, sentence_length = 0,
722 sentence_position = 0;
723 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
724 long long l1, l2, c, target, label, local_iter = iter;
725 unsigned long long next_random = (long long) id;
726 real f, g;
727 clock_t now;
728 int input_len_1 = layer1_size;
729 int window_offset = -1;
730 if (type == 2 || type == 4) {
731 input_len_1 = window_layer_size;
732 }
733 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
734 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
735
736 int input_len_2 = 0;
737 if (type == 4) {
738 input_len_2 = window_hidden_size;
739 }
740 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
741 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
742
743 FILE *fi = fopen(train_file, "rb");
744 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
745 while (1) {
746 if (word_count - last_word_count > 10000) {
747 word_count_actual += word_count - last_word_count;
748 last_word_count = word_count;
749 if ((debug_mode > 1)) {
750 now = clock();
751 printf(
752 "%cCycles ahead: %lld, Alpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
753 13, local_iter, alpha,
754 word_count_actual / (real) (iter * train_words + 1)
755 * 100,
756 word_count_actual
757 / ((real) (now - start + 1)
758 / (real) CLOCKS_PER_SEC * 1000));
759 fflush(stdout);
760 }
761 alpha = starting_alpha
762 * (1 - word_count_actual / (real) (iter * train_words + 1));
763 if (alpha < starting_alpha * 0.0001)
764 alpha = starting_alpha * 0.0001;
765 }
766 if (sentence_length == 0) {
767 while (1) {
768 word = ReadWordIndex(fi);
769 if (feof(fi))
770 break;
771 if (word == -1)
772 continue;
773 word_count++;
774 if (word == 0)
775 break;
776 // The subsampling randomly discards frequent words while keeping the ranking same
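// With f = cn(word) / train_words and t = sample, the keep probability below
// works out to min(1, sqrt(t/f) + t/f): very frequent words are discarded most
// of the time, while words with f <= t are always kept.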
777 if (sample > 0) {
778 real ran = (sqrt(vocab[word].cn / (sample * train_words))
779 + 1) * (sample * train_words) / vocab[word].cn;
780 next_random = next_random * (unsigned long long) 25214903917
781 + 11;
782 if (ran < (next_random & 0xFFFF) / (real) 65536) {
783 if(type == 3) // in structured skipgrams
784 word = -2; // keep the window position correct
785 else
786 continue;
787 }
788 }
789 sen[sentence_length] = word;
790 sentence_length++;
791 if (sentence_length >= MAX_SENTENCE_LENGTH)
792 break;
793 }
794 sentence_position = 0;
795 }
796 if (feof(fi) || (word_count > train_words / num_threads)) {
797 word_count_actual += word_count - last_word_count;
798 local_iter--;
799 if (local_iter == 0)
800 break;
801 word_count = 0;
802 last_word_count = 0;
803 sentence_length = 0;
804 fseek(fi, file_size / (long long) num_threads * (long long) id,
805 SEEK_SET);
806 continue;
807 }
808 word = sen[sentence_position];
809 while (word == -2 && sentence_position<sentence_length)
810 word = sen[++sentence_position];
811 if (sentence_position>=sentence_length) {
812 sentence_length=0;
813 continue;
814 }
815 if (word < 0)
816 continue;
817 for (c = 0; c < input_len_1; c++)
818 neu1[c] = 0;
819 for (c = 0; c < input_len_1; c++)
820 neu1e[c] = 0;
821 for (c = 0; c < input_len_2; c++)
822 neu2[c] = 0;
823 for (c = 0; c < input_len_2; c++)
824 neu2e[c] = 0;
825 next_random = next_random * (unsigned long long) 25214903917 + 11;
826 b = next_random % window;
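// b randomly shrinks the effective window for this centre word: only positions
// within window - b on either side are used, so nearby context words are
// sampled more often than distant ones (the same trick as in the original word2vec).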
827 if (type == 0) { //train the cbow architecture
828 // in -> hidden
829 cw = 0;
830 for (a = b; a < window * 2 + 1 - b; a++)
831 if (a != window) {
832 c = sentence_position - window + a;
833 if (c < 0)
834 continue;
835 if (c >= sentence_length)
836 continue;
837 last_word = sen[c];
838 if (last_word == -1)
839 continue;
840 for (c = 0; c < layer1_size; c++)
841 neu1[c] += syn0[c + last_word * layer1_size];
842 cw++;
843 }
844 if (cw) {
845 for (c = 0; c < layer1_size; c++)
846 neu1[c] /= cw;
847 if (hs)
848 for (d = 0; d < vocab[word].codelen; d++) {
849 f = 0;
850 l2 = vocab[word].point[d] * layer1_size;
851 // Propagate hidden -> output
852 for (c = 0; c < layer1_size; c++)
853 f += neu1[c] * syn1[c + l2];
854 if (f <= -MAX_EXP)
855 continue;
856 else if (f >= MAX_EXP)
857 continue;
858 else
859 f = expTable[(int) ((f + MAX_EXP)
860 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
861 // 'g' is the gradient multiplied by the learning rate
862 g = (1 - vocab[word].code[d] - f) * alpha;
863 // Propagate errors output -> hidden
864 for (c = 0; c < layer1_size; c++)
865 neu1e[c] += g * syn1[c + l2];
866 // Learn weights hidden -> output
867 for (c = 0; c < layer1_size; c++)
868 syn1[c + l2] += g * neu1[c];
869 if (cap == 1)
870 for (c = 0; c < layer1_size; c++)
871 capParam(syn1, c + l2);
872 }
873 // NEGATIVE SAMPLING
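// One positive update (d == 0: target is the current word, label 1) plus
// 'negative' draws from the unigram^0.75 table with label 0; if a class file
// was given, the noise word is drawn from the current word's own class table.
// Each pair is updated with g = (label - sigmoid(f)) * alpha, the gradient of
// the logistic classifier that separates the observed word from the noise samples.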
874 if (negative > 0)
875 for (d = 0; d < negative + 1; d++) {
876 if (d == 0) {
877 target = word;
878 label = 1;
879 } else {
880 next_random = next_random
881 * (unsigned long long) 25214903917 + 11;
882 if (word_to_group != NULL
883 && word_to_group[word] != -1) {
884 target = word;
885 while (target == word) {
886 target = group_to_table[word_to_group[word]
887 * table_size
888 + (next_random >> 16) % table_size];
889 next_random = next_random
890 * (unsigned long long) 25214903917
891 + 11;
892 }
893 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
894 } else {
895 target =
896 table[(next_random >> 16) % table_size];
897 }
898 if (target == 0)
899 target = next_random % (vocab_size - 1) + 1;
900 if (target == word)
901 continue;
902 label = 0;
903 }
904 l2 = target * layer1_size;
905 f = 0;
906 for (c = 0; c < layer1_size; c++)
907 f += neu1[c] * syn1neg[c + l2];
908 if (f > MAX_EXP)
909 g = (label - 1) * alpha;
910 else if (f < -MAX_EXP)
911 g = (label - 0) * alpha;
912 else
913 g = (label
914 - expTable[(int) ((f + MAX_EXP)
915 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
916 * alpha;
917 for (c = 0; c < layer1_size; c++)
918 neu1e[c] += g * syn1neg[c + l2];
919 for (c = 0; c < layer1_size; c++)
920 syn1neg[c + l2] += g * neu1[c];
921 if (cap == 1)
922 for (c = 0; c < layer1_size; c++)
923 capParam(syn1neg, c + l2);
924 }
925 // Noise Contrastive Estimation
926 if (nce > 0)
927 for (d = 0; d < nce + 1; d++) {
928 if (d == 0) {
929 target = word;
930 label = 1;
931 } else {
932 next_random = next_random
933 * (unsigned long long) 25214903917 + 11;
934 if (word_to_group != NULL
935 && word_to_group[word] != -1) {
936 target = word;
937 while (target == word) {
938 target = group_to_table[word_to_group[word]
939 * table_size
940 + (next_random >> 16) % table_size];
941 next_random = next_random
942 * (unsigned long long) 25214903917
943 + 11;
944 }
945 } else {
946 target =
947 table[(next_random >> 16) % table_size];
948 }
949 if (target == 0)
950 target = next_random % (vocab_size - 1) + 1;
951 if (target == word)
952 continue;
953 label = 0;
954 }
955 l2 = target * layer1_size;
956 f = 0;
957
958 for (c = 0; c < layer1_size; c++)
959 f += neu1[c] * syn1nce[c + l2];
960 if (f > MAX_EXP)
961 g = (label - 1) * alpha;
962 else if (f < -MAX_EXP)
963 g = (label - 0) * alpha;
964 else {
965 f = exp(f);
966 g =
967 (label
968 - f
969 / (noise_distribution[target]
970 * nce + f)) * alpha;
971 }
972 for (c = 0; c < layer1_size; c++)
973 neu1e[c] += g * syn1nce[c + l2];
974 for (c = 0; c < layer1_size; c++)
975 syn1nce[c + l2] += g * neu1[c];
976 if (cap == 1)
977 for (c = 0; c < layer1_size; c++)
978 capParam(syn1nce, c + l2);
979 }
980 // hidden -> in
981 for (a = b; a < window * 2 + 1 - b; a++)
982 if (a != window) {
983 c = sentence_position - window + a;
984 if (c < 0)
985 continue;
986 if (c >= sentence_length)
987 continue;
988 last_word = sen[c];
989 if (last_word == -1)
990 continue;
991 for (c = 0; c < layer1_size; c++)
992 syn0[c + last_word * layer1_size] += neu1e[c];
993 }
994 }
995 } else if (type == 1) { //train skip-gram
996 for (a = b; a < window * 2 + 1 - b; a++)
997 if (a != window) {
998 c = sentence_position - window + a;
999 if (c < 0)
1000 continue;
1001 if (c >= sentence_length)
1002 continue;
1003 last_word = sen[c];
1004 if (last_word == -1)
1005 continue;
1006 l1 = last_word * layer1_size;
1007 for (c = 0; c < layer1_size; c++)
1008 neu1e[c] = 0;
1009 // HIERARCHICAL SOFTMAX
1010 if (hs)
1011 for (d = 0; d < vocab[word].codelen; d++) {
1012 f = 0;
1013 l2 = vocab[word].point[d] * layer1_size;
1014 // Propagate hidden -> output
1015 for (c = 0; c < layer1_size; c++)
1016 f += syn0[c + l1] * syn1[c + l2];
1017 if (f <= -MAX_EXP)
1018 continue;
1019 else if (f >= MAX_EXP)
1020 continue;
1021 else
1022 f = expTable[(int) ((f + MAX_EXP)
1023 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1024 // 'g' is the gradient multiplied by the learning rate
1025 g = (1 - vocab[word].code[d] - f) * alpha;
1026 // Propagate errors output -> hidden
1027 for (c = 0; c < layer1_size; c++)
1028 neu1e[c] += g * syn1[c + l2];
1029 // Learn weights hidden -> output
1030 for (c = 0; c < layer1_size; c++)
1031 syn1[c + l2] += g * syn0[c + l1];
1032 if (cap == 1)
1033 for (c = 0; c < layer1_size; c++)
1034 capParam(syn1, c + l2);
1035 }
1036 // NEGATIVE SAMPLING
1037 if (negative > 0)
1038 for (d = 0; d < negative + 1; d++) {
1039 if (d == 0) {
1040 target = word;
1041 label = 1;
1042 } else {
1043 next_random = next_random
1044 * (unsigned long long) 25214903917 + 11;
1045 if (word_to_group != NULL
1046 && word_to_group[word] != -1) {
1047 target = word;
1048 while (target == word) {
1049 target =
1050 group_to_table[word_to_group[word]
1051 * table_size
1052 + (next_random >> 16)
1053 % table_size];
1054 next_random =
1055 next_random
1056 * (unsigned long long) 25214903917
1057 + 11;
1058 }
1059 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1060 } else {
1061 target = table[(next_random >> 16)
1062 % table_size];
1063 }
1064 if (target == 0)
1065 target = next_random % (vocab_size - 1) + 1;
1066 if (target == word)
1067 continue;
1068 label = 0;
1069 }
1070 l2 = target * layer1_size;
1071 f = 0;
1072 for (c = 0; c < layer1_size; c++)
1073 f += syn0[c + l1] * syn1neg[c + l2];
1074 if (f > MAX_EXP)
1075 g = (label - 1) * alpha;
1076 else if (f < -MAX_EXP)
1077 g = (label - 0) * alpha;
1078 else
1079 g =
1080 (label
1081 - expTable[(int) ((f + MAX_EXP)
1082 * (EXP_TABLE_SIZE
1083 / MAX_EXP / 2))])
1084 * alpha;
1085 for (c = 0; c < layer1_size; c++)
1086 neu1e[c] += g * syn1neg[c + l2];
1087 for (c = 0; c < layer1_size; c++)
1088 syn1neg[c + l2] += g * syn0[c + l1];
1089 if (cap == 1)
1090 for (c = 0; c < layer1_size; c++)
1091 capParam(syn1neg, c + l2);
1092 }
1093 //Noise Contrastive Estimation
1094 if (nce > 0)
1095 for (d = 0; d < nce + 1; d++) {
1096 if (d == 0) {
1097 target = word;
1098 label = 1;
1099 } else {
1100 next_random = next_random
1101 * (unsigned long long) 25214903917 + 11;
1102 if (word_to_group != NULL
1103 && word_to_group[word] != -1) {
1104 target = word;
1105 while (target == word) {
1106 target =
1107 group_to_table[word_to_group[word]
1108 * table_size
1109 + (next_random >> 16)
1110 % table_size];
1111 next_random =
1112 next_random
1113 * (unsigned long long) 25214903917
1114 + 11;
1115 }
1116 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1117 } else {
1118 target = table[(next_random >> 16)
1119 % table_size];
1120 }
1121 if (target == 0)
1122 target = next_random % (vocab_size - 1) + 1;
1123 if (target == word)
1124 continue;
1125 label = 0;
1126 }
1127 l2 = target * layer1_size;
1128 f = 0;
1129 for (c = 0; c < layer1_size; c++)
1130 f += syn0[c + l1] * syn1nce[c + l2];
1131 if (f > MAX_EXP)
1132 g = (label - 1) * alpha;
1133 else if (f < -MAX_EXP)
1134 g = (label - 0) * alpha;
1135 else {
1136 f = exp(f);
1137 g = (label
1138 - f
1139 / (noise_distribution[target]
1140 * nce + f)) * alpha;
1141 }
1142 for (c = 0; c < layer1_size; c++)
1143 neu1e[c] += g * syn1nce[c + l2];
1144 for (c = 0; c < layer1_size; c++)
1145 syn1nce[c + l2] += g * syn0[c + l1];
1146 if (cap == 1)
1147 for (c = 0; c < layer1_size; c++)
1148 capParam(syn1nce, c + l2);
1149 }
1150 // Learn weights input -> hidden
1151 for (c = 0; c < layer1_size; c++)
1152 syn0[c + l1] += neu1e[c];
1153 }
1154 } else if (type == 2) { //train the cwindow architecture
1155 // in -> hidden
1156 cw = 0;
1157 for (a = 0; a < window * 2 + 1; a++)
1158 if (a != window) {
1159 c = sentence_position - window + a;
1160 if (c < 0)
1161 continue;
1162 if (c >= sentence_length)
1163 continue;
1164 last_word = sen[c];
1165 if (last_word == -1)
1166 continue;
1167 window_offset = a * layer1_size;
1168 if (a > window)
1169 window_offset -= layer1_size;
1170 for (c = 0; c < layer1_size; c++)
1171 neu1[c + window_offset] += syn0[c
1172 + last_word * layer1_size];
1173 cw++;
1174 }
1175 if (cw) {
1176 if (hs)
1177 for (d = 0; d < vocab[word].codelen; d++) {
1178 f = 0;
1179 l2 = vocab[word].point[d] * window_layer_size;
1180 // Propagate hidden -> output
1181 for (c = 0; c < window_layer_size; c++)
1182 f += neu1[c] * syn1_window[c + l2];
1183 if (f <= -MAX_EXP)
1184 continue;
1185 else if (f >= MAX_EXP)
1186 continue;
1187 else
1188 f = expTable[(int) ((f + MAX_EXP)
1189 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1190 // 'g' is the gradient multiplied by the learning rate
1191 g = (1 - vocab[word].code[d] - f) * alpha;
1192 // Propagate errors output -> hidden
1193 for (c = 0; c < window_layer_size; c++)
1194 neu1e[c] += g * syn1_window[c + l2];
1195 // Learn weights hidden -> output
1196 for (c = 0; c < window_layer_size; c++)
1197 syn1_window[c + l2] += g * neu1[c];
1198 if (cap == 1)
1199 for (c = 0; c < window_layer_size; c++)
1200 capParam(syn1_window, c + l2);
1201 }
1202 // NEGATIVE SAMPLING
1203 if (negative > 0)
1204 for (d = 0; d < negative + 1; d++) {
1205 if (d == 0) {
1206 target = word;
1207 label = 1;
1208 } else {
1209 next_random = next_random
1210 * (unsigned long long) 25214903917 + 11;
1211 if (word_to_group != NULL
1212 && word_to_group[word] != -1) {
1213 target = word;
1214 while (target == word) {
1215 target = group_to_table[word_to_group[word]
1216 * table_size
1217 + (next_random >> 16) % table_size];
1218 next_random = next_random
1219 * (unsigned long long) 25214903917
1220 + 11;
1221 }
1222 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1223 } else {
1224 target =
1225 table[(next_random >> 16) % table_size];
1226 }
1227 if (target == 0)
1228 target = next_random % (vocab_size - 1) + 1;
1229 if (target == word)
1230 continue;
1231 label = 0;
1232 }
1233 l2 = target * window_layer_size;
1234 f = 0;
1235 for (c = 0; c < window_layer_size; c++)
1236 f += neu1[c] * syn1neg_window[c + l2];
1237 if (f > MAX_EXP)
1238 g = (label - 1) * alpha;
1239 else if (f < -MAX_EXP)
1240 g = (label - 0) * alpha;
1241 else
1242 g = (label
1243 - expTable[(int) ((f + MAX_EXP)
1244 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1245 * alpha;
1246 for (c = 0; c < window_layer_size; c++)
1247 neu1e[c] += g * syn1neg_window[c + l2];
1248 for (c = 0; c < window_layer_size; c++)
1249 syn1neg_window[c + l2] += g * neu1[c];
1250 if (cap == 1)
1251 for (c = 0; c < window_layer_size; c++)
1252 capParam(syn1neg_window, c + l2);
1253 }
1254 // Noise Contrastive Estimation
1255 if (nce > 0)
1256 for (d = 0; d < nce + 1; d++) {
1257 if (d == 0) {
1258 target = word;
1259 label = 1;
1260 } else {
1261 next_random = next_random
1262 * (unsigned long long) 25214903917 + 11;
1263 if (word_to_group != NULL
1264 && word_to_group[word] != -1) {
1265 target = word;
1266 while (target == word) {
1267 target = group_to_table[word_to_group[word]
1268 * table_size
1269 + (next_random >> 16) % table_size];
1270 next_random = next_random
1271 * (unsigned long long) 25214903917
1272 + 11;
1273 }
1274 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1275 } else {
1276 target =
1277 table[(next_random >> 16) % table_size];
1278 }
1279 if (target == 0)
1280 target = next_random % (vocab_size - 1) + 1;
1281 if (target == word)
1282 continue;
1283 label = 0;
1284 }
1285 l2 = target * window_layer_size;
1286 f = 0;
1287 for (c = 0; c < window_layer_size; c++)
1288 f += neu1[c] * syn1nce_window[c + l2];
1289 if (f > MAX_EXP)
1290 g = (label - 1) * alpha;
1291 else if (f < -MAX_EXP)
1292 g = (label - 0) * alpha;
1293 else {
1294 f = exp(f);
1295 g =
1296 (label
1297 - f
1298 / (noise_distribution[target]
1299 * nce + f)) * alpha;
1300 }
1301 for (c = 0; c < window_layer_size; c++)
1302 neu1e[c] += g * syn1nce_window[c + l2];
1303 for (c = 0; c < window_layer_size; c++)
1304 syn1nce_window[c + l2] += g * neu1[c];
1305 if (cap == 1)
1306 for (c = 0; c < window_layer_size; c++)
1307 capParam(syn1nce_window, c + l2);
1308 }
1309 // hidden -> in
1310 for (a = 0; a < window * 2 + 1; a++)
1311 if (a != window) {
1312 c = sentence_position - window + a;
1313 if (c < 0)
1314 continue;
1315 if (c >= sentence_length)
1316 continue;
1317 last_word = sen[c];
1318 if (last_word == -1)
1319 continue;
1320 window_offset = a * layer1_size;
1321 if (a > window)
1322 window_offset -= layer1_size;
1323 for (c = 0; c < layer1_size; c++)
1324 syn0[c + last_word * layer1_size] += neu1e[c
1325 + window_offset];
1326 }
1327 }
1328 } else if (type == 3) { //train structured skip-gram
1329 for (a = 0; a < window * 2 + 1; a++)
1330 if (a != window) {
1331 c = sentence_position - window + a;
1332 if (c < 0)
1333 continue;
1334 if (c >= sentence_length)
1335 continue;
1336 last_word = sen[c];
1337 if (last_word < 0)
1338 continue;
1339 l1 = last_word * layer1_size;
1340 window_offset = a * layer1_size;
1341 if (a > window)
1342 window_offset -= layer1_size;
1343 for (c = 0; c < layer1_size; c++)
1344 neu1e[c] = 0;
1345 // HIERARCHICAL SOFTMAX
1346 if (hs)
1347 for (d = 0; d < vocab[word].codelen; d++) {
1348 f = 0;
1349 l2 = vocab[word].point[d] * window_layer_size;
1350 // Propagate hidden -> output
1351 for (c = 0; c < layer1_size; c++)
1352 f += syn0[c + l1]
1353 * syn1_window[c + l2 + window_offset];
1354 if (f <= -MAX_EXP)
1355 continue;
1356 else if (f >= MAX_EXP)
1357 continue;
1358 else
1359 f = expTable[(int) ((f + MAX_EXP)
1360 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1361 // 'g' is the gradient multiplied by the learning rate
1362 g = (1 - vocab[word].code[d] - f) * alpha;
1363 // Propagate errors output -> hidden
1364 for (c = 0; c < layer1_size; c++)
1365 neu1e[c] += g
1366 * syn1_window[c + l2 + window_offset];
1367 // Learn weights hidden -> output
// (the matrix updated here must be syn1_window: plain syn1 has no per-position
// columns, so indexing it with l2 + window_offset would run past its end)
1368 for (c = 0; c < layer1_size; c++)
1369 syn1_window[c + l2 + window_offset] += g
1370 * syn0[c + l1];
1371 if (cap == 1)
1372 for (c = 0; c < layer1_size; c++)
1373 capParam(syn1_window, c + l2 + window_offset);
1374 }
1375 // NEGATIVE SAMPLING
1376 if (negative > 0)
1377 for (d = 0; d < negative + 1; d++) {
1378 if (d == 0) {
1379 target = word;
1380 label = 1;
1381 } else {
1382 next_random = next_random
1383 * (unsigned long long) 25214903917 + 11;
1384 if (word_to_group != NULL
1385 && word_to_group[word] != -1) {
1386 target = word;
1387 while (target == word) {
1388 target =
1389 group_to_table[word_to_group[word]
1390 * table_size
1391 + (next_random >> 16)
1392 % table_size];
1393 next_random =
1394 next_random
1395 * (unsigned long long) 25214903917
1396 + 11;
1397 }
1398 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1399 } else {
1400 target = table[(next_random >> 16)
1401 % table_size];
1402 }
1403 if (target == 0)
1404 target = next_random % (vocab_size - 1) + 1;
1405 if (target == word)
1406 continue;
1407 label = 0;
1408 }
1409 l2 = target * window_layer_size;
1410 f = 0;
1411 for (c = 0; c < layer1_size; c++)
1412 f +=
1413 syn0[c + l1]
1414 * syn1neg_window[c + l2
1415 + window_offset];
1416 if (f > MAX_EXP)
1417 g = (label - 1) * alpha;
1418 else if (f < -MAX_EXP)
1419 g = (label - 0) * alpha;
1420 else
1421 g =
1422 (label
1423 - expTable[(int) ((f + MAX_EXP)
1424 * (EXP_TABLE_SIZE
1425 / MAX_EXP / 2))])
1426 * alpha;
1427 if(debug_mode > 2 && ((long long) id) == 0) {
1428 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1429 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1430 }
1431 for (c = 0; c < layer1_size; c++)
1432 neu1e[c] +=
1433 g
1434 * syn1neg_window[c + l2
1435 + window_offset];
1436 for (c = 0; c < layer1_size; c++)
1437 syn1neg_window[c + l2 + window_offset] += g
1438 * syn0[c + l1];
1439 if (cap == 1)
1440 for (c = 0; c < layer1_size; c++)
1441 capParam(syn1neg_window,
1442 c + l2 + window_offset);
1443 }
1444 // Noise Contrastive Estimation
1445 if (nce > 0)
1446 for (d = 0; d < nce + 1; d++) {
1447 if (d == 0) {
1448 target = word;
1449 label = 1;
1450 } else {
1451 next_random = next_random
1452 * (unsigned long long) 25214903917 + 11;
1453 if (word_to_group != NULL
1454 && word_to_group[word] != -1) {
1455 target = word;
1456 while (target == word) {
1457 target =
1458 group_to_table[word_to_group[word]
1459 * table_size
1460 + (next_random >> 16)
1461 % table_size];
1462 next_random =
1463 next_random
1464 * (unsigned long long) 25214903917
1465 + 11;
1466 }
1467 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1468 } else {
1469 target = table[(next_random >> 16)
1470 % table_size];
1471 }
1472 if (target == 0)
1473 target = next_random % (vocab_size - 1) + 1;
1474 if (target == word)
1475 continue;
1476 label = 0;
1477 }
1478 l2 = target * window_layer_size;
1479 f = 0;
1480 for (c = 0; c < layer1_size; c++)
1481 f +=
1482 syn0[c + l1]
1483 * syn1nce_window[c + l2
1484 + window_offset];
1485 if (f > MAX_EXP)
1486 g = (label - 1) * alpha;
1487 else if (f < -MAX_EXP)
1488 g = (label - 0) * alpha;
1489 else {
1490 f = exp(f);
1491 g = (label
1492 - f
1493 / (noise_distribution[target]
1494 * nce + f)) * alpha;
1495 }
1496 for (c = 0; c < layer1_size; c++)
1497 neu1e[c] +=
1498 g
1499 * syn1nce_window[c + l2
1500 + window_offset];
1501 for (c = 0; c < layer1_size; c++)
1502 syn1nce_window[c + l2 + window_offset] += g
1503 * syn0[c + l1];
1504 if (cap == 1)
1505 for (c = 0; c < layer1_size; c++)
1506 capParam(syn1nce_window,
1507 c + l2 + window_offset);
1508 }
1509 // Learn weights input -> hidden
1510 for (c = 0; c < layer1_size; c++) {
1511 syn0[c + l1] += neu1e[c];
1512 if (syn0[c + l1] > 50)
1513 syn0[c + l1] = 50;
1514 if (syn0[c + l1] < -50)
1515 syn0[c + l1] = -50;
1516 }
1517 }
1518 } else if (type == 4) { //training senna
1519 // in -> hidden
1520 cw = 0;
1521 for (a = 0; a < window * 2 + 1; a++)
1522 if (a != window) {
1523 c = sentence_position - window + a;
1524 if (c < 0)
1525 continue;
1526 if (c >= sentence_length)
1527 continue;
1528 last_word = sen[c];
1529 if (last_word == -1)
1530 continue;
1531 window_offset = a * layer1_size;
1532 if (a > window)
1533 window_offset -= layer1_size;
1534 for (c = 0; c < layer1_size; c++)
1535 neu1[c + window_offset] += syn0[c
1536 + last_word * layer1_size];
1537 cw++;
1538 }
1539 if (cw) {
1540 for (a = 0; a < window_hidden_size; a++) {
1541 c = a * window_layer_size;
1542 for (b = 0; b < window_layer_size; b++) {
1543 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1544 }
1545 }
1546 if (hs)
1547 for (d = 0; d < vocab[word].codelen; d++) {
1548 f = 0;
1549 l2 = vocab[word].point[d] * window_hidden_size;
1550 // Propagate hidden -> output
1551 for (c = 0; c < window_hidden_size; c++)
1552 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1553 if (f <= -MAX_EXP)
1554 continue;
1555 else if (f >= MAX_EXP)
1556 continue;
1557 else
1558 f = expTable[(int) ((f + MAX_EXP)
1559 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1560 // 'g' is the gradient multiplied by the learning rate
1561 g = (1 - vocab[word].code[d] - f) * alpha;
1562 // Propagate errors output -> hidden
1563 for (c = 0; c < window_hidden_size; c++)
1564 neu2e[c] += dHardTanh(neu2[c], g) * g
1565 * syn_hidden_word[c + l2];
1566 // Learn weights hidden -> output
1567 for (c = 0; c < window_hidden_size; c++)
1568 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1569 * neu2[c];
1570 }
1571 // NEGATIVE SAMPLING
1572 if (negative > 0)
1573 for (d = 0; d < negative + 1; d++) {
1574 if (d == 0) {
1575 target = word;
1576 label = 1;
1577 } else {
1578 next_random = next_random
1579 * (unsigned long long) 25214903917 + 11;
1580 if (word_to_group != NULL
1581 && word_to_group[word] != -1) {
1582 target = word;
1583 while (target == word) {
1584 target = group_to_table[word_to_group[word]
1585 * table_size
1586 + (next_random >> 16) % table_size];
1587 next_random = next_random
1588 * (unsigned long long) 25214903917
1589 + 11;
1590 }
1591 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1592 } else {
1593 target =
1594 table[(next_random >> 16) % table_size];
1595 }
1596 if (target == 0)
1597 target = next_random % (vocab_size - 1) + 1;
1598 if (target == word)
1599 continue;
1600 label = 0;
1601 }
1602 l2 = target * window_hidden_size;
1603 f = 0;
1604 for (c = 0; c < window_hidden_size; c++)
1605 f += hardTanh(neu2[c])
1606 * syn_hidden_word_neg[c + l2];
1607 if (f > MAX_EXP)
1608 g = (label - 1) * alpha / negative;
1609 else if (f < -MAX_EXP)
1610 g = (label - 0) * alpha / negative;
1611 else
1612 g = (label
1613 - expTable[(int) ((f + MAX_EXP)
1614 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1615 * alpha / negative;
1616 for (c = 0; c < window_hidden_size; c++)
1617 neu2e[c] += dHardTanh(neu2[c], g) * g
1618 * syn_hidden_word_neg[c + l2];
1619 for (c = 0; c < window_hidden_size; c++)
1620 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1621 * g * neu2[c];
1622 }
1623 for (a = 0; a < window_hidden_size; a++)
1624 for (b = 0; b < window_layer_size; b++)
1625 neu1e[b] += neu2e[a]
1626 * syn_window_hidden[a * window_layer_size + b];
1627 for (a = 0; a < window_hidden_size; a++)
1628 for (b = 0; b < window_layer_size; b++)
1629 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1630 * neu1[b];
1631 // hidden -> in
1632 for (a = 0; a < window * 2 + 1; a++)
1633 if (a != window) {
1634 c = sentence_position - window + a;
1635 if (c < 0)
1636 continue;
1637 if (c >= sentence_length)
1638 continue;
1639 last_word = sen[c];
1640 if (last_word == -1)
1641 continue;
1642 window_offset = a * layer1_size;
1643 if (a > window)
1644 window_offset -= layer1_size;
1645 for (c = 0; c < layer1_size; c++)
1646 syn0[c + last_word * layer1_size] += neu1e[c
1647 + window_offset];
1648 }
1649 }
1650 } else {
1651 printf("unknown type %i\n", type);
1652 exit(1);
1653 }
1654 sentence_position++;
1655 if (sentence_position >= sentence_length) {
1656 sentence_length = 0;
1657 continue;
1658 }
1659 }
1660 fclose(fi);
1661 free(neu1);
1662 free(neu1e);
1663 pthread_exit(NULL);
1664}
1665
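// Debug/inspection mode (-show-cc): for every word starting at vocabulary rank
// cc, the dot product of its input vector with every word's position-specific
// output vector (syn1neg_window) is pushed through the sigmoid table; the
// strongest collocate per window position is printed, target_sums accumulates
// the per-word responses across positions in a noisy-or fashion
// (s += (1 - s) * f), and the N = 10 highest single responses are listed with
// their relative positions. Requires -read-vocab and -read-net.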
1666void ShowCollocations() {
1667 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
1668 real f, max_f, maxmax_f;
1669 real *target_sums, bestf[MAX_CC], worstbest;
1670 long besti[MAX_CC];
1671 int N = 10, bestp[MAX_CC];
1672 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1673
1674 for (d = cc; d < vocab_size; d++) {
1675 for (b = 0; b < vocab_size; b++)
1676 target_sums[b]=0;
1677 for (b = 0; b < N; b++)
1678 bestf[b]=-1;
1679 worstbest = -1;
1680
1681 maxmax_f = -1;
1682 maxmax_target = 0;
1683 for (a = window * 2; a >= 0; a--) { // only 2*window + 1 positions exist (a == window is the word itself)
1684 if (a != window) {
1685 max_f = -1;
1686 window_offset = a * layer1_size;
1687 if (a > window)
1688 window_offset -= layer1_size;
1689 for(target = 0; target < vocab_size; target ++) {
1690 if(target == d)
1691 continue;
1692 f = 0;
1693 for (c = 0; c < layer1_size; c++)
1694 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1695 if (f < -MAX_EXP)
1696 continue;
1697 else if (f > MAX_EXP)
1698 continue;
1699 else
1700 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1701 if(f > max_f) {
1702 max_f = f;
1703 max_target = target;
1704 }
1705 target_sums[target] += (1-target_sums[target]) * f;
1706 if(f > worstbest) {
1707 for (b = 0; b < N; b++) {
1708 if (f > bestf[b]) {
1709 for (e = N - 1; e > b; e--) {
1710 bestf[e] = bestf[e - 1];
1711 besti[e] = besti[e - 1];
1712 bestp[e] = bestp[e - 1];
1713 }
1714 bestf[b] = f;
1715 besti[b] = target;
1716 bestp[b] = window-a;
1717 break;
1718 }
1719 }
1720 worstbest = bestf[N-1];
1721 }
1722 }
1723 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1724 if(max_f > maxmax_f) {
1725 maxmax_f = max_f;
1726 maxmax_target = max_target;
1727 }
1728 } else {
1729 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1730 }
1731 }
1732 max_f = -1;
1733 for (b = 0; b < vocab_size; b++) {
1734 if(target_sums[b] > max_f) {
1735 max_f = target_sums[b];
1736 max_target = b;
1737 }
1738 }
1739 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1740 vocab[max_target].word, max_f,
1741 vocab[maxmax_target].word, maxmax_f);
1742 for(b=0; b<N && bestf[b]>-1; b++)
1743 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1744 printf("\n");
1745 }
1746}
1747
1748void TrainModel() {
1749 long a, b, c, d;
1750 FILE *fo;
1751 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1752 printf("Starting training using file %s\n", train_file);
1753 starting_alpha = alpha;
1754 if (read_vocab_file[0] != 0)
1755 ReadVocab();
1756 else
1757 LearnVocabFromTrainFile();
1758 if (save_vocab_file[0] != 0)
1759 SaveVocab();
1760 if (output_file[0] == 0)
1761 return;
1762 InitNet();
1763 if(cc > 0)
1764 ShowCollocations();
1765 if (negative > 0 || nce > 0)
1766 InitUnigramTable();
1767 if (negative_classes_file[0] != 0)
1768 InitClassUnigramTable();
1769 start = clock();
1770 for (a = 0; a < num_threads; a++)
1771 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1772 for (a = 0; a < num_threads; a++)
1773 pthread_join(pt[a], NULL);
1774 fo = fopen(output_file, "wb");
1775 if (classes == 0) {
1776 // Save the word vectors
1777 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1778 for (a = 0; a < vocab_size; a++) {
1779 fprintf(fo, "%s ", vocab[a].word);
1780 if (binary)
1781 for (b = 0; b < layer1_size; b++)
1782 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1783 else
1784 for (b = 0; b < layer1_size; b++)
1785 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1786 fprintf(fo, "\n");
1787 }
1788 } else {
1789 // Run K-means on the word vectors
1790 int clcn = classes, iter = 10, closeid;
1791 int *centcn = (int *) malloc(classes * sizeof(int));
1792 int *cl = (int *) calloc(vocab_size, sizeof(int));
1793 real closev, x;
1794 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1795 for (a = 0; a < vocab_size; a++)
1796 cl[a] = a % clcn;
1797 for (a = 0; a < iter; a++) {
1798 for (b = 0; b < clcn * layer1_size; b++)
1799 cent[b] = 0;
1800 for (b = 0; b < clcn; b++)
1801 centcn[b] = 1;
1802 for (c = 0; c < vocab_size; c++) {
1803 for (d = 0; d < layer1_size; d++)
1804 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1805 centcn[cl[c]]++;
1806 }
1807 for (b = 0; b < clcn; b++) {
1808 closev = 0;
1809 for (c = 0; c < layer1_size; c++) {
1810 cent[layer1_size * b + c] /= centcn[b];
1811 closev += cent[layer1_size * b + c]
1812 * cent[layer1_size * b + c];
1813 }
1814 closev = sqrt(closev);
1815 for (c = 0; c < layer1_size; c++)
1816 cent[layer1_size * b + c] /= closev;
1817 }
1818 for (c = 0; c < vocab_size; c++) {
1819 closev = -10;
1820 closeid = 0;
1821 for (d = 0; d < clcn; d++) {
1822 x = 0;
1823 for (b = 0; b < layer1_size; b++)
1824 x += cent[layer1_size * d + b]
1825 * syn0[c * layer1_size + b];
1826 if (x > closev) {
1827 closev = x;
1828 closeid = d;
1829 }
1830 }
1831 cl[c] = closeid;
1832 }
1833 }
1834 // Save the K-means classes
1835 for (a = 0; a < vocab_size; a++)
1836 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1837 free(centcn);
1838 free(cent);
1839 free(cl);
1840 }
1841 fclose(fo);
1842 if (save_net_file[0] != 0)
1843 SaveNet();
1844}
1845
1846int ArgPos(char *str, int argc, char **argv) {
1847 int a;
1848 for (a = 1; a < argc; a++)
1849 if (!strcmp(str, argv[a])) {
1850 if (a == argc - 1) {
1851 printf("Argument missing for %s\n", str);
1852 exit(1);
1853 }
1854 return a;
1855 }
1856 return -1;
1857}
1858
1859int main(int argc, char **argv) {
1860 int i;
1861 if (argc == 1) {
1862 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1863 printf("Options:\n");
1864 printf("Parameters for training:\n");
1865 printf("\t-train <file>\n");
1866 printf("\t\tUse text data from <file> to train the model\n");
1867 printf("\t-output <file>\n");
1868 printf(
1869 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1870 printf("\t-size <int>\n");
1871 printf("\t\tSet size of word vectors; default is 100\n");
1872 printf("\t-window <int>\n");
1873 printf("\t\tSet max skip length between words; default is 5\n");
1874 printf("\t-sample <float>\n");
1875 printf(
1876 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1877 printf(
1878 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1879 printf("\t-hs <int>\n");
1880 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1881 printf("\t-negative <int>\n");
1882 printf(
1883 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1884 printf("\t-negative-classes <file>\n");
1885 printf("\t\tNegative classes to sample from\n");
1886 printf("\t-nce <int>\n");
1887 printf(
1888 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1889 printf("\t-threads <int>\n");
1890 printf("\t\tUse <int> threads (default 12)\n");
1891 printf("\t-iter <int>\n");
1892 printf("\t\tRun more training iterations (default 5)\n");
1893 printf("\t-min-count <int>\n");
1894 printf(
1895 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1896 printf("\t-alpha <float>\n");
1897 printf(
1898 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1899 printf("\t-classes <int>\n");
1900 printf(
1901 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1902 printf("\t-debug <int>\n");
1903 printf(
1904 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1905 printf("\t-binary <int>\n");
1906 printf(
1907 "\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1908 printf("\t-save-vocab <file>\n");
1909 printf("\t\tThe vocabulary will be saved to <file>\n");
1910 printf("\t-read-vocab <file>\n");
1911 printf(
1912 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1913 printf("\t-read-net <file>\n");
1914 printf(
1915 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1916 printf("\t-save-net <file>\n");
1917 printf("\t\tThe net parameters will be saved to <file>\n");
1918 printf("\t-show-cc <int>\n");
1919 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
1920 printf("\t-type <int>\n");
1921 printf(
1922 "\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
1923 printf("\t-cap <int>\n");
1924 printf(
1925 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1926 printf("\nExamples:\n");
1927 printf(
1928 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1929 return 0;
1930 }
1931 output_file[0] = 0;
1932 save_vocab_file[0] = 0;
1933 read_vocab_file[0] = 0;
1934 save_net_file[0] = 0;
1935 read_net_file[0] = 0;
1936 negative_classes_file[0] = 0;
1937 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1938 layer1_size = atoi(argv[i + 1]);
1939 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1940 strcpy(train_file, argv[i + 1]);
1941 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1942 strcpy(save_vocab_file, argv[i + 1]);
1943 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1944 strcpy(read_vocab_file, argv[i + 1]);
1945 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1946 strcpy(save_net_file, argv[i + 1]);
1947 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1948 strcpy(read_net_file, argv[i + 1]);
1949 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1950 debug_mode = atoi(argv[i + 1]);
1951 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1952 binary = atoi(argv[i + 1]);
1953 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
1954 cc = atoi(argv[i + 1]);
1955 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1956 type = atoi(argv[i + 1]);
1957 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1958 strcpy(output_file, argv[i + 1]);
1959 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1960 window = atoi(argv[i + 1]);
1961 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1962 sample = atof(argv[i + 1]);
1963 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1964 hs = atoi(argv[i + 1]);
1965 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1966 negative = atoi(argv[i + 1]);
1967 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1968 strcpy(negative_classes_file, argv[i + 1]);
1969 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1970 nce = atoi(argv[i + 1]);
1971 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1972 num_threads = atoi(argv[i + 1]);
1973 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1974 iter = atoi(argv[i + 1]);
1975 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1976 min_count = atoi(argv[i + 1]);
1977 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1978 classes = atoi(argv[i + 1]);
1979 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1980 cap = atoi(argv[i + 1]);
1981 if (type == 0 || type == 2 || type == 4)
1982 alpha = 0.05;
1983 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1984 alpha = atof(argv[i + 1]);
1985 vocab = (struct vocab_word *) calloc(vocab_max_size,
1986 sizeof(struct vocab_word));
1987 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1988 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1989 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1990 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1991 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1992 }
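// expTable[i] holds sigmoid(x) for x = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP,
// i.e. 1000 samples of 1 / (1 + exp(-x)) on [-6, 6]. During training the lookup
// index is (int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)); values of f
// outside +-MAX_EXP are treated as 0 or 1.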
1993 SaveArgs(argc, argv);
1994 TrainModel();
1995 return 0;
1996}
1997