1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <locale.h>
16#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
19#include <unistd.h>
20#include <math.h>
21#include <pthread.h>
22#include <collocatordb.h>
23
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
28#define MAX_CC 100
29#define MAX_CODE_LENGTH 40
30
31const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
32
33typedef float real; // Precision of float numbers
34
35struct vocab_word {
36 long long cn;
37 int *point;
38 char *word, *code, codelen;
39};
40
41char train_file[MAX_STRING], output_file[MAX_STRING];
42char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
43char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
44char magic_stop_file[MAX_STRING];
45
46struct vocab_word *vocab;
47int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
48 num_threads = 12, min_reduce = 1;
49int *vocab_hash;
50long long *threadPos;
51int *threadIters;
52long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
53long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
54 classes = 0;
55real alpha = 0.025, starting_alpha, sample = 1e-3;
56real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
57real avgWordLength=0;
58clock_t start, start_clock;
59
60real *syn1_window, *syn1neg_window, *syn1nce_window;
61int w_offset, window_layer_size;
62
63int window_hidden_size = 500;
64real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
65 *syn_hidden_word_nce;
66
67int hs = 0, negative = 5;
68const int table_size = 1e8;
69int *table;
70
71long cc = 0; // if > 0, ShowCollocations() prints collocations for words with vocabulary index >= cc
72long tc = 1; // if > 0, ReadVocab() recounts word frequencies on the current training file
73
74// contrastive negative sampling
75char negative_classes_file[MAX_STRING];
76int *word_to_group;
77int *group_to_table; //group_size*table_size
78int class_number;
79
80//nce
81real* noise_distribution;
82int nce = 0;
83
84//param caps
85real CAP_VALUE = 50;
86int cap = 0;
87
88COLLOCATORDB *cdb = NULL;
89
90void capParam(real* array, int index) {
91 if (array[index] > CAP_VALUE)
92 array[index] = CAP_VALUE;
93 else if (array[index] < -CAP_VALUE)
94 array[index] = -CAP_VALUE;
95}
96
97real hardTanh(real x) {
98 if (x >= 1) {
99 return 1;
100 } else if (x <= -1) {
101 return -1;
102 } else {
103 return x;
104 }
105}
106
107real dHardTanh(real x, real g) {
108 if (x > 1 && g > 0) {
109 return 0;
110 }
111 if (x < -1 && g < 0) {
112 return 0;
113 }
114 return 1;
115}
116
117void InitUnigramTable() {
118 int a, i;
119 long long train_words_pow = 0;
120 real d1, power = 0.75;
121 table = (int *) malloc(table_size * sizeof(int));
122 for (a = 0; a < vocab_size; a++)
123 train_words_pow += pow(vocab[a].cn, power);
124 i = 0;
125 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
126 for (a = 0; a < table_size; a++) {
127 table[a] = i;
128 if (a / (real) table_size > d1) {
129 i++;
130 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
131 }
132 if (i >= vocab_size)
133 i = vocab_size - 1;
134 }
135
136 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
137 for (a = 0; a < vocab_size; a++)
138 noise_distribution[a] = pow(vocab[a].cn, power)
139 / (real) train_words_pow;
140}
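// Note on the table above: it realizes the usual word2vec negative-sampling
// distribution P(w) ~ cn(w)^0.75. Each word occupies a contiguous stretch of
// the table_size (1e8) int slots proportional to its smoothed count, so the
// draw
//   target = table[(next_random >> 16) % table_size];
// used in the training branches below picks frequent words more often, but
// less aggressively than the raw unigram distribution would.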
141
142// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
143void ReadWord(char *word, FILE *fin) {
144 int a = 0, ch;
145 while (!feof(fin)) {
146 ch = fgetc(fin);
147 if (ch == 13)
148 continue;
149 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
150 if (a > 0) {
151 if (ch == '\n')
152 ungetc(ch, fin);
153 break;
154 }
155 if (ch == '\n') {
156 strcpy(word, (char *) "</s>");
157 return;
158 } else
159 continue;
160 }
161 word[a] = ch;
162 a++;
163 if (a >= MAX_STRING - 1)
164 a--; // Truncate too long words
165 }
166 word[a] = 0;
167}
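// Note: a newline is returned as the special token "</s>", so sentence/line
// boundaries become vocabulary index 0 (LearnVocabFromTrainFile() adds it
// first and SortVocab() keeps it in first position; the training loop treats
// word index 0 as end of sentence). Words longer than MAX_STRING-1
// characters are silently truncated.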
168
169// Returns hash value of a word
170int GetWordHash(char *word) {
171 unsigned long long a, hash = 0;
172 for (a = 0; a < strlen(word); a++)
173 hash = hash * 257 + word[a];
174 hash = hash % vocab_hash_size;
175 return hash;
176}
177
178// Returns position of a word in the vocabulary; if the word is not found, returns -1
179int SearchVocab(char *word) {
180 unsigned int hash = GetWordHash(word);
181 while (1) {
182 if (vocab_hash[hash] == -1)
183 return -1;
184 if (!strcmp(word, vocab[vocab_hash[hash]].word))
185 return vocab_hash[hash];
186 hash = (hash + 1) % vocab_hash_size;
187 }
188 return -1;
189}
190
191// Reads a word and returns its index in the vocabulary
192int ReadWordIndex(FILE *fin) {
193 char word[MAX_STRING];
194 ReadWord(word, fin);
195 if (feof(fin))
196 return -1;
197 return SearchVocab(word);
198}
199
200// Adds a word to the vocabulary
201int AddWordToVocab(char *word) {
202 unsigned int hash, length = strlen(word) + 1;
203 if (length > MAX_STRING)
204 length = MAX_STRING;
205 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
206 strcpy(vocab[vocab_size].word, word);
207 vocab[vocab_size].cn = 0;
208 vocab_size++;
209 // Reallocate memory if needed
210 if (vocab_size + 2 >= vocab_max_size) {
211 vocab_max_size += 1000;
212 vocab = (struct vocab_word *) realloc(vocab,
213 vocab_max_size * sizeof(struct vocab_word));
214 }
215 hash = GetWordHash(word);
216 while (vocab_hash[hash] != -1)
217 hash = (hash + 1) % vocab_hash_size;
218 vocab_hash[hash] = vocab_size - 1;
219 return vocab_size - 1;
220}
221
222// Used later for sorting by word counts
223int VocabCompare(const void *a, const void *b) {
224 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
225}
226
227// Sorts the vocabulary by frequency using word counts
228void SortVocab() {
229 int a, size;
230 unsigned int hash;
231 // Sort the vocabulary and keep </s> at the first position
232 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
233 for (a = 0; a < vocab_hash_size; a++)
234 vocab_hash[a] = -1;
235 size = vocab_size;
236 train_words = 0;
237 for (a = 0; a < size; a++) {
238 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
239 // Words occurring less than min_count times will be discarded from the vocab
240 if ((vocab[a].cn < min_count) && (a != 0)) {
241 vocab_size--;
242 free(vocab[a].word);
243 } else {
244 // Hash will be re-computed, as it is no longer valid after the sorting
245 hash = GetWordHash(vocab[a].word);
246 while (vocab_hash[hash] != -1)
247 hash = (hash + 1) % vocab_hash_size;
248 vocab_hash[hash] = a;
249 train_words += vocab[a].cn;
250 }
251 }
252 avgWordLength /= train_words;
253 vocab = (struct vocab_word *) realloc(vocab,
254 (vocab_size + 1) * sizeof(struct vocab_word));
255 // Allocate memory for the binary tree construction
256 for (a = 0; a < vocab_size; a++) {
257 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
258 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
259 }
260}
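// Note: avgWordLength is the average number of characters per token
// (including the separating whitespace), summed over the counts of all words
// still in the vocabulary but divided by the number of kept tokens
// (train_words). ReadVocab() later divides file_size by it to estimate how
// many trainable tokens the training file contains.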
261
262// Reduces the vocabulary by removing infrequent tokens
263void ReduceVocab() {
264 int a, b = 0;
265 unsigned int hash;
266 for (a = 0; a < vocab_size; a++)
267 if (vocab[a].cn > min_reduce) {
268 vocab[b].cn = vocab[a].cn;
269 vocab[b].word = vocab[a].word;
270 b++;
271 } else
272 free(vocab[a].word);
273 vocab_size = b;
274 for (a = 0; a < vocab_hash_size; a++)
275 vocab_hash[a] = -1;
276 for (a = 0; a < vocab_size; a++) {
277 // Hash will be re-computed, as it is no longer valid
278 hash = GetWordHash(vocab[a].word);
279 while (vocab_hash[hash] != -1)
280 hash = (hash + 1) % vocab_hash_size;
281 vocab_hash[hash] = a;
282 }
283 fflush(stdout);
284 min_reduce++;
285}
286
287// Create binary Huffman tree using the word counts
288// Frequent words will have short unique binary codes
289void CreateBinaryTree() {
290 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
291 char code[MAX_CODE_LENGTH];
292 long long *count = (long long *) calloc(vocab_size * 2 + 1,
293 sizeof(long long));
294 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
295 sizeof(long long));
296 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
297 sizeof(long long));
298 // todo: this needs to operate on a sorted copy of vocab[a].cn if we use local counts
299 for (a = 0; a < vocab_size; a++)
300 count[a] = vocab[a].cn;
301 for (a = vocab_size; a < vocab_size * 2; a++)
302 count[a] = 1e15;
303 pos1 = vocab_size - 1;
304 pos2 = vocab_size;
305 // Following algorithm constructs the Huffman tree by adding one node at a time
306 for (a = 0; a < vocab_size - 1; a++) {
307 // First, find two smallest nodes 'min1, min2'
308 if (pos1 >= 0) {
309 if (count[pos1] < count[pos2]) {
310 min1i = pos1;
311 pos1--;
312 } else {
313 min1i = pos2;
314 pos2++;
315 }
316 } else {
317 min1i = pos2;
318 pos2++;
319 }
320 if (pos1 >= 0) {
321 if (count[pos1] < count[pos2]) {
322 min2i = pos1;
323 pos1--;
324 } else {
325 min2i = pos2;
326 pos2++;
327 }
328 } else {
329 min2i = pos2;
330 pos2++;
331 }
332 count[vocab_size + a] = count[min1i] + count[min2i];
333 parent_node[min1i] = vocab_size + a;
334 parent_node[min2i] = vocab_size + a;
335 binary[min2i] = 1;
336 }
337 // Now assign binary code to each vocabulary word
338 for (a = 0; a < vocab_size; a++) {
339 b = a;
340 i = 0;
341 while (1) {
342 code[i] = binary[b];
343 point[i] = b;
344 i++;
345 b = parent_node[b];
346 if (b == vocab_size * 2 - 2)
347 break;
348 }
349 vocab[a].codelen = i;
350 vocab[a].point[0] = vocab_size - 2;
351 for (b = 0; b < i; b++) {
352 vocab[a].code[i - b - 1] = code[b];
353 vocab[a].point[i - b] = point[b] - vocab_size;
354 }
355 }
356 free(count);
357 free(binary);
358 free(parent_node);
359}
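// Note: count[0..vocab_size-1] holds the sorted word counts and
// count[vocab_size..] the internal Huffman nodes; the two-pointer scan over
// pos1/pos2 works because both halves stay in non-decreasing order. The
// per-word code is collected leaf-to-root and then reversed, and point[]
// stores the internal-node indices (shifted by vocab_size) along the path,
// which the hierarchical-softmax branches below use as row offsets into
// syn1 / syn1_window.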
360
361void LearnVocabFromTrainFile() {
362 char word[MAX_STRING];
363 FILE *fin;
364 long long a, i;
365 for (a = 0; a < vocab_hash_size; a++)
366 vocab_hash[a] = -1;
367 fin = fopen(train_file, "rb");
368 if (fin == NULL) {
369 printf("ERROR: training data file not found!\n");
370 exit(1);
371 }
372 vocab_size = 0;
373 AddWordToVocab((char *) "</s>");
374 while (1) {
375 ReadWord(word, fin);
376 if (feof(fin))
377 break;
378 train_words++;
379 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
380 printf("%lldK%c", train_words / 1000, 13);
381 fflush(stdout);
382 }
383 i = SearchVocab(word);
384 if (i == -1) {
385 a = AddWordToVocab(word);
386 vocab[a].cn = 1;
387 } else
388 vocab[i].cn++;
389 if (vocab_size > vocab_hash_size * 0.7)
390 ReduceVocab();
391 }
392 SortVocab();
393 if (debug_mode > 0) {
394 printf("Vocab size: %lld\n", vocab_size);
395 printf("Words in train file: %lld\n", train_words);
396 }
397 file_size = ftell(fin);
398 fclose(fin);
399}
400
401void SaveVocab() {
402 long long i;
403 FILE *fo = fopen(save_vocab_file, "wb");
404 for (i = 0; i < vocab_size; i++)
405 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
406 fclose(fo);
407}
408
409void ReadVocab() {
410 long long a, i = 0;
411 char c;
412 char word[MAX_STRING];
413 FILE *fin = fopen(read_vocab_file, "rb");
414 if (fin == NULL) {
415 printf("Vocabulary file not found\n");
416 exit(1);
417 }
418 for (a = 0; a < vocab_hash_size; a++)
419 vocab_hash[a] = -1;
420 vocab_size = 0;
421 while (1) {
422 ReadWord(word, fin);
423 if (feof(fin))
424 break;
425 a = AddWordToVocab(word);
426 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
427 i++;
428 }
429 fclose(fin);
430 // this is just for determining train_words by avgWordLength
431 fin = fopen(train_file, "rb");
432 if (fin == NULL) {
433 printf("ERROR: training data file not found!\n");
434 exit(1);
435 }
436 fseek(fin, 0, SEEK_END);
437 file_size = ftell(fin);
438 fclose(fin);
439 SortVocab();
440 train_words = file_size / avgWordLength;
441 if(debug_mode > 0)
442 printf("Estimated words in train file: %'lld\n", train_words);
443 if (tc > 0) {
444 // recalculate counts for the current corpus
445 // adapted from LearnVocabFromTrainFile()
446 // note that we don't sort or rehash the vocabulary again, we only adapt vocab[.].cn.
447 fin = fopen(train_file, "rb");
448 if (fin == NULL) {
449 printf("ERROR: training data file not found!\n");
450 exit(1);
451 }
452 // reset vocabulary counts
453 for (a = 0; a < vocab_size; a++)
454 vocab[a].cn = 0;
455 long long train_words1 = 0;
456 while (1) {
457 ReadWord(word, fin);
458 if (feof(fin))
459 break;
460 if ((debug_mode > 1) && (train_words1 % 100000 == 0)) {
461 printf("%lldK%c", train_words1 / 1000, 13);
462 fflush(stdout);
463 }
464 i = SearchVocab(word);
465 // the word must be in the vocabulary but we don't issue a warning,
466 // because it may have been cut off due to min_count.
467 if (i >= 0) {
468 vocab[i].cn++;
469 train_words1++;
470 }
471 }
472 // we cannot have 0 counts.
473 for (a = 0; a < vocab_size; a++) {
474 if(vocab[a].cn == 0) {
475 vocab[a].cn = 1;
476 train_words1++;
477 }
478 }
479 if (debug_mode > 0) {
480 printf("Vocab size: %lld\n", vocab_size);
481 printf("Words in current train file: %'lld\n", train_words1);
482 }
483 fseek(fin, 0, SEEK_END);
484 file_size = ftell(fin);
485 fclose(fin);
486 }
487}
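// Note on the tc > 0 branch above: the vocabulary itself (and its hashing and
// sorting) is taken as-is from the vocab file; only the per-word counts are
// replaced by counts from the current training file, and words that do not
// occur there keep a count of 1 so that the subsampling and unigram-table
// code never sees a zero count.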
488
489void InitClassUnigramTable() {
490 // TODO: this probably needs to be adapted for dealing with subcorpus adjusted vocabulary counts
491 long long a, c;
492 printf("loading class unigrams \n");
493 FILE *fin = fopen(negative_classes_file, "rb");
494 if (fin == NULL) {
495 printf("ERROR: class file not found!\n");
496 exit(1);
497 }
498 word_to_group = (int *) malloc(vocab_size * sizeof(int));
499 for (a = 0; a < vocab_size; a++)
500 word_to_group[a] = -1;
501 char class[MAX_STRING];
502 char prev_class[MAX_STRING];
503 prev_class[0] = 0;
504 char word[MAX_STRING];
505 class_number = -1;
506 while (1) {
507 if (feof(fin))
508 break;
509 ReadWord(class, fin);
510 ReadWord(word, fin);
511 int word_index = SearchVocab(word);
512 if (word_index != -1) {
513 if (strcmp(class, prev_class) != 0) {
514 class_number++;
515 strcpy(prev_class, class);
516 }
517 word_to_group[word_index] = class_number;
518 }
519 ReadWord(word, fin);
520 }
521 class_number++;
522 fclose(fin);
523
524 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
525 long long train_words_pow = 0;
526 real d1, power = 0.75;
527
528 for (c = 0; c < class_number; c++) {
529 long long offset = c * table_size;
530 train_words_pow = 0;
531 for (a = 0; a < vocab_size; a++)
532 if (word_to_group[a] == c)
533 train_words_pow += pow(vocab[a].cn, power);
534 int i = 0;
535 while (word_to_group[i] != c && i < vocab_size)
536 i++;
537 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
538 for (a = 0; a < table_size; a++) {
539 //printf("index %lld , word %d\n", a, i);
540 group_to_table[offset + a] = i;
541 if (a / (real) table_size > d1) {
542 i++;
543 while (word_to_group[i] != c && i < vocab_size)
544 i++;
545 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
546 }
547 if (i >= vocab_size)
548 while (word_to_group[i] != c && i >= 0)
549 i--;
550 }
551 }
552}
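// Note: group_to_table holds one table_size-long sampling table per class read
// from -negative-classes, built the same way as the global unigram table but
// restricted to the words of that class; during training, negatives for a word
// that belongs to a class are drawn from its own class table.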
553
554void SaveArgs(int argc, char **argv) {
555 unsigned int i;
556 char args_file[MAX_STRING];
557 strcpy(args_file, output_file);
558 strcat(args_file, ".args");
559 FILE *fargs = fopen(args_file, "w");
560 if (fargs == NULL) {
561 printf("Cannot save args to %s.\n", args_file);
562 return;
563 }
564
565 for(i=1; i<argc; i++)
566 fprintf(fargs, "%s ", argv[i]);
567
568 fprintf(fargs, "\n");
569 fclose(fargs);
570
571 return;
572}
573
574void SaveNet() {
575 if (type == 4 || negative <= 0) {
576 fprintf(stderr,
577 "save-net only supported for type 0,1,2,3 with negative sampling\n");
578 return;
579 }
580
581 FILE *fnet = fopen(save_net_file, "wb");
582 if (fnet == NULL) {
583 printf("Cannot open net parameter file for writing\n");
584 exit(1);
585 }
586 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
587 if (type == 0 || type == 1) {
588 fwrite(syn1neg, sizeof(real), vocab_size * layer1_size, fnet);
589 }
590 if (type == 2 || type == 3) {
591 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
592 }
593 fclose(fnet);
594}
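// Note on the net file format written above: a raw dump with no header, just
// syn0 (vocab_size * layer1_size reals) followed by the output weights
// (syn1neg for types 0/1, syn1neg_window for types 2/3). A minimal sketch of
// reading it back outside this program, assuming the same vocab/layer sizes
// and float precision (this mirrors what InitNet() does with -read-net):
//
//   FILE *f = fopen("model.net", "rb");   /* hypothetical file name */
//   float *in  = malloc((size_t) vocab_size * layer1_size * sizeof(float));
//   float *out = malloc((size_t) vocab_size * layer1_size * sizeof(float));
//   fread(in,  sizeof(float), (size_t) vocab_size * layer1_size, f);
//   fread(out, sizeof(float), (size_t) vocab_size * layer1_size, f);  /* use window_layer_size for types 2/3 */
//   fclose(f);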
595
596void InitNet() {
597 long long a, b;
598 unsigned long long next_random = 1;
599 long long read;
600
601 window_layer_size = layer1_size * window * 2;
602 a = posix_memalign((void **) &syn0, 128,
603 (long long) vocab_size * layer1_size * sizeof(real));
604 if (syn0 == NULL) {
605 printf("Memory allocation failed\n");
606 exit(1);
607 }
608
609 if (hs) {
610 a = posix_memalign((void **) &syn1, 128,
611 (long long) vocab_size * layer1_size * sizeof(real));
612 if (syn1 == NULL) {
613 printf("Memory allocation failed\n");
614 exit(1);
615 }
616 a = posix_memalign((void **) &syn1_window, 128,
617 (long long) vocab_size * window_layer_size * sizeof(real));
618 if (syn1_window == NULL) {
619 printf("Memory allocation failed\n");
620 exit(1);
621 }
622 a = posix_memalign((void **) &syn_hidden_word, 128,
623 (long long) vocab_size * window_hidden_size * sizeof(real));
624 if (syn_hidden_word == NULL) {
625 printf("Memory allocation failed\n");
626 exit(1);
627 }
628
629 for (a = 0; a < vocab_size; a++)
630 for (b = 0; b < layer1_size; b++)
631 syn1[a * layer1_size + b] = 0;
632 for (a = 0; a < vocab_size; a++)
633 for (b = 0; b < window_layer_size; b++)
634 syn1_window[a * window_layer_size + b] = 0;
635 for (a = 0; a < vocab_size; a++)
636 for (b = 0; b < window_hidden_size; b++)
637 syn_hidden_word[a * window_hidden_size + b] = 0;
638 }
639 if (negative > 0) {
640 if (type == 0 || type == 1) {
641 a = posix_memalign((void **) &syn1neg, 128,
642 (long long) vocab_size * layer1_size * sizeof(real));
643 if (syn1neg == NULL) {
644 printf("Memory allocation failed\n");
645 exit(1);
646 }
647 for (a = 0; a < vocab_size; a++)
648 for (b = 0; b < layer1_size; b++)
649 syn1neg[a * layer1_size + b] = 0;
650 } else if (type == 2 || type == 3) {
651 a = posix_memalign((void **) &syn1neg_window, 128,
652 (long long) vocab_size * window_layer_size * sizeof(real));
653 if (syn1neg_window == NULL) {
654 printf("Memory allocation failed\n");
655 exit(1);
656 }
657 for (a = 0; a < vocab_size; a++)
658 for (b = 0; b < window_layer_size; b++)
659 syn1neg_window[a * window_layer_size + b] = 0;
660 } else if (type == 4) {
661 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
662 (long long) vocab_size * window_hidden_size * sizeof(real));
663 if (syn_hidden_word_neg == NULL) {
664 printf("Memory allocation failed\n");
665 exit(1);
666 }
667 for (a = 0; a < vocab_size; a++)
668 for (b = 0; b < window_hidden_size; b++)
669 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
670 }
671 }
672 if (nce > 0) {
673 a = posix_memalign((void **) &syn1nce, 128,
674 (long long) vocab_size * layer1_size * sizeof(real));
675 if (syn1nce == NULL) {
676 printf("Memory allocation failed\n");
677 exit(1);
678 }
679 a = posix_memalign((void **) &syn1nce_window, 128,
680 (long long) vocab_size * window_layer_size * sizeof(real));
681 if (syn1nce_window == NULL) {
682 printf("Memory allocation failed\n");
683 exit(1);
684 }
685 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
686 (long long) vocab_size * window_hidden_size * sizeof(real));
687 if (syn_hidden_word_nce == NULL) {
688 printf("Memory allocation failed\n");
689 exit(1);
690 }
691
692 for (a = 0; a < vocab_size; a++)
693 for (b = 0; b < layer1_size; b++)
694 syn1nce[a * layer1_size + b] = 0;
695 for (a = 0; a < vocab_size; a++)
696 for (b = 0; b < window_layer_size; b++)
697 syn1nce_window[a * window_layer_size + b] = 0;
698 for (a = 0; a < vocab_size; a++)
699 for (b = 0; b < window_hidden_size; b++)
700 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
701 }
702
703 if (type == 4) {
704 a = posix_memalign((void **) &syn_window_hidden, 128,
705 window_hidden_size * window_layer_size * sizeof(real));
706 if (syn_window_hidden == NULL) {
707 printf("Memory allocation failed\n");
708 exit(1);
709 }
710 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
711 next_random = next_random * (unsigned long long) 25214903917 + 11;
712 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
713 - 0.5) / (window_hidden_size * window_layer_size);
714 }
715 }
716
717 if (read_net_file[0] == 0) {
718 for (a = 0; a < vocab_size; a++)
719 for (b = 0; b < layer1_size; b++) {
720 next_random = next_random * (unsigned long long) 25214903917
721 + 11;
722 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
723 / (real) 65536) - 0.5) / layer1_size;
724 }
725 } else if ((type == 0 || type == 1) && negative > 0) {
726 FILE *fnet = fopen(read_net_file, "rb");
727 if (fnet == NULL) {
728 printf("Net parameter file not found\n");
729 exit(1);
730 }
731 printf("vocab-size: %lld, layer1_size: %lld\n",
732 vocab_size, layer1_size);
733 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
734 if (read != vocab_size * layer1_size) {
735 fprintf(stderr, "read-net failed %lld\n", read);
736 exit(-1);
737 }
738 read = fread(syn1neg, sizeof(real),
739 vocab_size * layer1_size, fnet);
740 if (read != (long long) vocab_size * layer1_size) {
741 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n",
742 read,
743 (long long) vocab_size * layer1_size);
744 exit(-1);
745 }
746 fgetc(fnet);
747 if (!feof(fnet)) {
748 fprintf(stderr,
749 "Remaining bytes in net-file after read-net. File position: %ld\n",
750 ftell(fnet));
751 exit(-1);
752 }
753 fclose(fnet);
754 } else if ((type == 2 || type == 3) && negative > 0) {
755 FILE *fnet = fopen(read_net_file, "rb");
756 if (fnet == NULL) {
757 printf("Net parameter file not found\n");
758 exit(1);
759 }
760 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n",
761 vocab_size, layer1_size, window_layer_size);
762 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
763 if (read != vocab_size * layer1_size) {
764 fprintf(stderr, "read-net failed %lld\n", read);
765 exit(-1);
766 }
767 read = fread(syn1neg_window, sizeof(real),
768 vocab_size * window_layer_size, fnet);
769 if (read != (long long) vocab_size * window_layer_size) {
770 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n",
771 read,
772 (long long) vocab_size * window_layer_size);
773 exit(-1);
774 }
775 fgetc(fnet);
776 if (!feof(fnet)) {
777 fprintf(stderr,
778 "Remaining bytes in net-file after read-net. File position: %ld\n",
779 ftell(fnet));
780 exit(-1);
781 }
782 fclose(fnet);
783 } else {
784 fprintf(stderr,
785 "read-net only supported for types 0, 1, 2, 3 with negative sampling\n");
786 exit(-1);
787 }
788
789 CreateBinaryTree();
790}
791
792char *currentDateTime(char *buf, real offset) {
793 time_t t;
794 time(&t);
795 t += (long) offset;
796 struct tm tstruct;
797 tstruct = *localtime(&t);
798 strftime(buf, 80, "%c", &tstruct);
799 return buf;
800}
801
802void *MonitorThread(void *id) {
803 char *timebuf = malloc(80);
804 int i, n=num_threads;
805 long long sum;
806 sleep(1);
807 while(n > 0) {
808 sleep(1);
809 sum = n = 0;
810 for(i=0; i < num_threads; i++) {
811 if(threadPos[i] >= 0) {
812 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
813 n++;
814 } else {
815 sum += iter * file_size / num_threads;
816 }
817 }
818 if(n == 0)
819 break;
820 real finished_portion = (real) sum / (float) (file_size * iter);
821 long long now = time(NULL);
822 long long elapsed = (now - start);
823 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
824
825 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
826 alpha,
827 finished_portion * 100,
828 (float) sum / elapsed / 1000,
829 elapsed,
830 ttg,
831 currentDateTime(timebuf, ttg)
832 );
833 fflush(stdout);
834 }
835 pthread_exit(NULL);
836}
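// Note: the monitor thread estimates progress from the byte positions the
// worker threads publish in threadPos[] and the iterations they still have to
// run (threadIters[]); finished_portion = processed_bytes / (file_size * iter),
// and TTG/ETA are a simple linear extrapolation of the elapsed wall-clock time.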
837
838void *TrainModelThread(void *id) {
839 long long a, b, d, cw, word, last_word, sentence_length = 0,
840 sentence_position = 0;
841 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
842 long long l1, l2, c, target, label, local_iter = iter;
843 unsigned long long next_random = (long long) id;
844 real f, g;
845 int input_len_1 = layer1_size;
846 int window_offset = -1;
847 if (type == 2 || type == 4) {
848 input_len_1 = window_layer_size;
849 }
850 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
851 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
852 threadIters[(long) id] = iter;
853
854 int input_len_2 = 0;
855 if (type == 4) {
856 input_len_2 = window_hidden_size;
857 }
858 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
859 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
860
861 FILE *fi = fopen(train_file, "rb");
862 long long start_pos = file_size / (long long) num_threads * (long long) id;
863 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
864 long long current_pos = start_pos;
865 long long last_pos = start_pos;
866 fseek(fi, start_pos, SEEK_SET);
867 while (1) {
868 if (word_count - last_word_count > 10000) {
869 // if ((current_pos - last_pos > 100000)) {
870 // PF: changed back, because it seems that alpha is not correctly adjusted otherwise.
871 word_count_actual += word_count - last_word_count;
872 last_pos = current_pos;
873 last_word_count = word_count;
874 alpha = starting_alpha
875 * (1 - word_count_actual / (real) (iter * train_words + 1));
876 if (alpha < starting_alpha * 0.0001)
877 alpha = starting_alpha * 0.0001;
878 }
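// Note: the learning rate decays linearly with the global progress
// word_count_actual / (iter * train_words), which is shared across all
// threads, and is floored at 0.0001 * starting_alpha, so updates late in
// training are proportionally smaller.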
879 if (sentence_length == 0) {
880 while (1) {
881 word = ReadWordIndex(fi);
882 if (feof(fi))
883 break;
884 if (word == -1)
885 continue;
886 word_count++;
887 if (word == 0)
888 break;
889 // The subsampling randomly discards frequent words while keeping the ranking same
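// Equivalently: with t = sample and f = vocab[word].cn / train_words, the word
// is kept with probability min(1, sqrt(t/f) + t/f) against a uniform draw from
// (next_random & 0xFFFF) / 65536, so words rarer than roughly 2.6 * sample are
// never discarded.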
890 if (sample > 0) {
891 real ran = (sqrt(vocab[word].cn / (sample * train_words))
892 + 1) * (sample * train_words) / vocab[word].cn;
893 next_random = next_random * (unsigned long long) 25214903917
894 + 11;
895 if (ran < (next_random & 0xFFFF) / (real) 65536) {
896 if (type == 3) // in structured skipgrams
897 word = -2; // keep the window position correct
898 else
899 continue;
900 }
901 }
902 sen[sentence_length] = word;
903 sentence_length++;
904 if (sentence_length >= MAX_SENTENCE_LENGTH)
905 break;
906 }
907 sentence_position = 0;
908 }
909 current_pos = threadPos[(long) id] = ftell(fi);
910 if (feof(fi) || current_pos >= end_pos ) {
911 word_count_actual += word_count - last_word_count;
912 threadIters[(long) id]--;
913 local_iter--;
914 if (local_iter == 0)
915 break;
916 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
917 printf("Magic stop file %s found. Stopping training ...\n", magic_stop_file);
918 break;
919 }
920 word_count = 0;
921 last_word_count = 0;
922 sentence_length = 0;
923 fseek(fi, file_size / (long long) num_threads * (long long) id,
924 SEEK_SET);
925 continue;
926 }
927 word = sen[sentence_position];
928 while (word == -2 && sentence_position<sentence_length)
929 word = sen[++sentence_position];
930 if (sentence_position>=sentence_length) {
931 sentence_length=0;
932 continue;
933 }
934 if (word < 0)
935 continue;
936 for (c = 0; c < input_len_1; c++)
937 neu1[c] = 0;
938 for (c = 0; c < input_len_1; c++)
939 neu1e[c] = 0;
940 for (c = 0; c < input_len_2; c++)
941 neu2[c] = 0;
942 for (c = 0; c < input_len_2; c++)
943 neu2e[c] = 0;
944 next_random = next_random * (unsigned long long) 25214903917 + 11;
945 b = next_random % window;
946 if (type == 0) { //train the cbow architecture
947 // in -> hidden
948 cw = 0;
949 for (a = b; a < window * 2 + 1 - b; a++)
950 if (a != window) {
951 c = sentence_position - window + a;
952 if (c < 0)
953 continue;
954 if (c >= sentence_length)
955 continue;
956 last_word = sen[c];
957 if (last_word == -1)
958 continue;
959 for (c = 0; c < layer1_size; c++)
960 neu1[c] += syn0[c + last_word * layer1_size];
961 cw++;
962 }
963 if (cw) {
964 for (c = 0; c < layer1_size; c++)
965 neu1[c] /= cw;
966 if (hs)
967 for (d = 0; d < vocab[word].codelen; d++) {
968 f = 0;
969 l2 = vocab[word].point[d] * layer1_size;
970 // Propagate hidden -> output
971 for (c = 0; c < layer1_size; c++)
972 f += neu1[c] * syn1[c + l2];
973 if (f <= -MAX_EXP)
974 continue;
975 else if (f >= MAX_EXP)
976 continue;
977 else
978 f = expTable[(int) ((f + MAX_EXP)
979 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
980 // 'g' is the gradient multiplied by the learning rate
981 g = (1 - vocab[word].code[d] - f) * alpha;
982 // Propagate errors output -> hidden
983 for (c = 0; c < layer1_size; c++)
984 neu1e[c] += g * syn1[c + l2];
985 // Learn weights hidden -> output
986 for (c = 0; c < layer1_size; c++)
987 syn1[c + l2] += g * neu1[c];
988 if (cap == 1)
989 for (c = 0; c < layer1_size; c++)
990 capParam(syn1, c + l2);
991 }
992 // NEGATIVE SAMPLING
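// d == 0 is the positive (observed) example; the remaining `negative` draws
// come from the unigram table (or the word's class table). The update below is
// g = (label - sigmoid(neu1 . syn1neg[target])) * alpha, with the sigmoid read
// from expTable and clipped outside [-MAX_EXP, MAX_EXP].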
993 if (negative > 0)
994 for (d = 0; d < negative + 1; d++) {
995 if (d == 0) {
996 target = word;
997 label = 1;
998 } else {
999 next_random = next_random
1000 * (unsigned long long) 25214903917 + 11;
1001 if (word_to_group != NULL
1002 && word_to_group[word] != -1) {
1003 target = word;
1004 while (target == word) {
1005 target = group_to_table[word_to_group[word]
1006 * table_size
1007 + (next_random >> 16) % table_size];
1008 next_random = next_random
1009 * (unsigned long long) 25214903917
1010 + 11;
1011 }
1012 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1013 } else {
1014 target =
1015 table[(next_random >> 16) % table_size];
1016 }
1017 if (target == 0)
1018 target = next_random % (vocab_size - 1) + 1;
1019 if (target == word)
1020 continue;
1021 label = 0;
1022 }
1023 l2 = target * layer1_size;
1024 f = 0;
1025 for (c = 0; c < layer1_size; c++)
1026 f += neu1[c] * syn1neg[c + l2];
1027 if (f > MAX_EXP)
1028 g = (label - 1) * alpha;
1029 else if (f < -MAX_EXP)
1030 g = (label - 0) * alpha;
1031 else
1032 g = (label
1033 - expTable[(int) ((f + MAX_EXP)
1034 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1035 * alpha;
1036 for (c = 0; c < layer1_size; c++)
1037 neu1e[c] += g * syn1neg[c + l2];
1038 for (c = 0; c < layer1_size; c++)
1039 syn1neg[c + l2] += g * neu1[c];
1040 if (cap == 1)
1041 for (c = 0; c < layer1_size; c++)
1042 capParam(syn1neg, c + l2);
1043 }
1044 // Noise Contrastive Estimation
1045 if (nce > 0)
1046 for (d = 0; d < nce + 1; d++) {
1047 if (d == 0) {
1048 target = word;
1049 label = 1;
1050 } else {
1051 next_random = next_random
1052 * (unsigned long long) 25214903917 + 11;
1053 if (word_to_group != NULL
1054 && word_to_group[word] != -1) {
1055 target = word;
1056 while (target == word) {
1057 target = group_to_table[word_to_group[word]
1058 * table_size
1059 + (next_random >> 16) % table_size];
1060 next_random = next_random
1061 * (unsigned long long) 25214903917
1062 + 11;
1063 }
1064 } else {
1065 target =
1066 table[(next_random >> 16) % table_size];
1067 }
1068 if (target == 0)
1069 target = next_random % (vocab_size - 1) + 1;
1070 if (target == word)
1071 continue;
1072 label = 0;
1073 }
1074 l2 = target * layer1_size;
1075 f = 0;
1076
1077 for (c = 0; c < layer1_size; c++)
1078 f += neu1[c] * syn1nce[c + l2];
1079 if (f > MAX_EXP)
1080 g = (label - 1) * alpha;
1081 else if (f < -MAX_EXP)
1082 g = (label - 0) * alpha;
1083 else {
1084 f = exp(f);
1085 g =
1086 (label
1087 - f
1088 / (noise_distribution[target]
1089 * nce + f)) * alpha;
1090 }
1091 for (c = 0; c < layer1_size; c++)
1092 neu1e[c] += g * syn1nce[c + l2];
1093 for (c = 0; c < layer1_size; c++)
1094 syn1nce[c + l2] += g * neu1[c];
1095 if (cap == 1)
1096 for (c = 0; c < layer1_size; c++)
1097 capParam(syn1nce, c + l2);
1098 }
1099 // hidden -> in
1100 for (a = b; a < window * 2 + 1 - b; a++)
1101 if (a != window) {
1102 c = sentence_position - window + a;
1103 if (c < 0)
1104 continue;
1105 if (c >= sentence_length)
1106 continue;
1107 last_word = sen[c];
1108 if (last_word == -1)
1109 continue;
1110 for (c = 0; c < layer1_size; c++)
1111 syn0[c + last_word * layer1_size] += neu1e[c];
1112 }
1113 }
1114 } else if (type == 1) { //train skip-gram
1115 for (a = b; a < window * 2 + 1 - b; a++)
1116 if (a != window) {
1117 c = sentence_position - window + a;
1118 if (c < 0)
1119 continue;
1120 if (c >= sentence_length)
1121 continue;
1122 last_word = sen[c];
1123 if (last_word == -1)
1124 continue;
1125 l1 = last_word * layer1_size;
1126 for (c = 0; c < layer1_size; c++)
1127 neu1e[c] = 0;
1128 // HIERARCHICAL SOFTMAX
1129 if (hs)
1130 for (d = 0; d < vocab[word].codelen; d++) {
1131 f = 0;
1132 l2 = vocab[word].point[d] * layer1_size;
1133 // Propagate hidden -> output
1134 for (c = 0; c < layer1_size; c++)
1135 f += syn0[c + l1] * syn1[c + l2];
1136 if (f <= -MAX_EXP)
1137 continue;
1138 else if (f >= MAX_EXP)
1139 continue;
1140 else
1141 f = expTable[(int) ((f + MAX_EXP)
1142 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1143 // 'g' is the gradient multiplied by the learning rate
1144 g = (1 - vocab[word].code[d] - f) * alpha;
1145 // Propagate errors output -> hidden
1146 for (c = 0; c < layer1_size; c++)
1147 neu1e[c] += g * syn1[c + l2];
1148 // Learn weights hidden -> output
1149 for (c = 0; c < layer1_size; c++)
1150 syn1[c + l2] += g * syn0[c + l1];
1151 if (cap == 1)
1152 for (c = 0; c < layer1_size; c++)
1153 capParam(syn1, c + l2);
1154 }
1155 // NEGATIVE SAMPLING
1156 if (negative > 0)
1157 for (d = 0; d < negative + 1; d++) {
1158 if (d == 0) {
1159 target = word;
1160 label = 1;
1161 } else {
1162 next_random = next_random
1163 * (unsigned long long) 25214903917 + 11;
1164 if (word_to_group != NULL
1165 && word_to_group[word] != -1) {
1166 target = word;
1167 while (target == word) {
1168 target =
1169 group_to_table[word_to_group[word]
1170 * table_size
1171 + (next_random >> 16)
1172 % table_size];
1173 next_random =
1174 next_random
1175 * (unsigned long long) 25214903917
1176 + 11;
1177 }
1178 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1179 } else {
1180 target = table[(next_random >> 16)
1181 % table_size];
1182 }
1183 if (target == 0)
1184 target = next_random % (vocab_size - 1) + 1;
1185 if (target == word)
1186 continue;
1187 label = 0;
1188 }
1189 l2 = target * layer1_size;
1190 f = 0;
1191 for (c = 0; c < layer1_size; c++)
1192 f += syn0[c + l1] * syn1neg[c + l2];
1193 if (f > MAX_EXP)
1194 g = (label - 1) * alpha;
1195 else if (f < -MAX_EXP)
1196 g = (label - 0) * alpha;
1197 else
1198 g =
1199 (label
1200 - expTable[(int) ((f + MAX_EXP)
1201 * (EXP_TABLE_SIZE
1202 / MAX_EXP / 2))])
1203 * alpha;
1204 for (c = 0; c < layer1_size; c++)
1205 neu1e[c] += g * syn1neg[c + l2];
1206 for (c = 0; c < layer1_size; c++)
1207 syn1neg[c + l2] += g * syn0[c + l1];
1208 if (cap == 1)
1209 for (c = 0; c < layer1_size; c++)
1210 capParam(syn1neg, c + l2);
1211 }
1212 //Noise Contrastive Estimation
1213 if (nce > 0)
1214 for (d = 0; d < nce + 1; d++) {
1215 if (d == 0) {
1216 target = word;
1217 label = 1;
1218 } else {
1219 next_random = next_random
1220 * (unsigned long long) 25214903917 + 11;
1221 if (word_to_group != NULL
1222 && word_to_group[word] != -1) {
1223 target = word;
1224 while (target == word) {
1225 target =
1226 group_to_table[word_to_group[word]
1227 * table_size
1228 + (next_random >> 16)
1229 % table_size];
1230 next_random =
1231 next_random
1232 * (unsigned long long) 25214903917
1233 + 11;
1234 }
1235 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1236 } else {
1237 target = table[(next_random >> 16)
1238 % table_size];
1239 }
1240 if (target == 0)
1241 target = next_random % (vocab_size - 1) + 1;
1242 if (target == word)
1243 continue;
1244 label = 0;
1245 }
1246 l2 = target * layer1_size;
1247 f = 0;
1248 for (c = 0; c < layer1_size; c++)
1249 f += syn0[c + l1] * syn1nce[c + l2];
1250 if (f > MAX_EXP)
1251 g = (label - 1) * alpha;
1252 else if (f < -MAX_EXP)
1253 g = (label - 0) * alpha;
1254 else {
1255 f = exp(f);
1256 g = (label
1257 - f
1258 / (noise_distribution[target]
1259 * nce + f)) * alpha;
1260 }
1261 for (c = 0; c < layer1_size; c++)
1262 neu1e[c] += g * syn1nce[c + l2];
1263 for (c = 0; c < layer1_size; c++)
1264 syn1nce[c + l2] += g * syn0[c + l1];
1265 if (cap == 1)
1266 for (c = 0; c < layer1_size; c++)
1267 capParam(syn1nce, c + l2);
1268 }
1269 // Learn weights input -> hidden
1270 for (c = 0; c < layer1_size; c++)
1271 syn0[c + l1] += neu1e[c];
1272 }
1273 } else if (type == 2) { //train the cwindow architecture
1274 // in -> hidden
1275 cw = 0;
1276 for (a = 0; a < window * 2 + 1; a++)
1277 if (a != window) {
1278 c = sentence_position - window + a;
1279 if (c < 0)
1280 continue;
1281 if (c >= sentence_length)
1282 continue;
1283 last_word = sen[c];
1284 if (last_word == -1)
1285 continue;
1286 window_offset = a * layer1_size;
1287 if (a > window)
1288 window_offset -= layer1_size;
1289 for (c = 0; c < layer1_size; c++)
1290 neu1[c + window_offset] += syn0[c
1291 + last_word * layer1_size];
1292 cw++;
1293 }
1294 if (cw) {
1295 if (hs)
1296 for (d = 0; d < vocab[word].codelen; d++) {
1297 f = 0;
1298 l2 = vocab[word].point[d] * window_layer_size;
1299 // Propagate hidden -> output
1300 for (c = 0; c < window_layer_size; c++)
1301 f += neu1[c] * syn1_window[c + l2];
1302 if (f <= -MAX_EXP)
1303 continue;
1304 else if (f >= MAX_EXP)
1305 continue;
1306 else
1307 f = expTable[(int) ((f + MAX_EXP)
1308 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1309 // 'g' is the gradient multiplied by the learning rate
1310 g = (1 - vocab[word].code[d] - f) * alpha;
1311 // Propagate errors output -> hidden
1312 for (c = 0; c < window_layer_size; c++)
1313 neu1e[c] += g * syn1_window[c + l2];
1314 // Learn weights hidden -> output
1315 for (c = 0; c < window_layer_size; c++)
1316 syn1_window[c + l2] += g * neu1[c];
1317 if (cap == 1)
1318 for (c = 0; c < window_layer_size; c++)
1319 capParam(syn1_window, c + l2);
1320 }
1321 // NEGATIVE SAMPLING
1322 if (negative > 0)
1323 for (d = 0; d < negative + 1; d++) {
1324 if (d == 0) {
1325 target = word;
1326 label = 1;
1327 } else {
1328 next_random = next_random
1329 * (unsigned long long) 25214903917 + 11;
1330 if (word_to_group != NULL
1331 && word_to_group[word] != -1) {
1332 target = word;
1333 while (target == word) {
1334 target = group_to_table[word_to_group[word]
1335 * table_size
1336 + (next_random >> 16) % table_size];
1337 next_random = next_random
1338 * (unsigned long long) 25214903917
1339 + 11;
1340 }
1341 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1342 } else {
1343 target =
1344 table[(next_random >> 16) % table_size];
1345 }
1346 if (target == 0)
1347 target = next_random % (vocab_size - 1) + 1;
1348 if (target == word)
1349 continue;
1350 label = 0;
1351 }
1352 l2 = target * window_layer_size;
1353 f = 0;
1354 for (c = 0; c < window_layer_size; c++)
1355 f += neu1[c] * syn1neg_window[c + l2];
1356 if (f > MAX_EXP)
1357 g = (label - 1) * alpha;
1358 else if (f < -MAX_EXP)
1359 g = (label - 0) * alpha;
1360 else
1361 g = (label
1362 - expTable[(int) ((f + MAX_EXP)
1363 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1364 * alpha;
1365 for (c = 0; c < window_layer_size; c++)
1366 neu1e[c] += g * syn1neg_window[c + l2];
1367 for (c = 0; c < window_layer_size; c++)
1368 syn1neg_window[c + l2] += g * neu1[c];
1369 if (cap == 1)
1370 for (c = 0; c < window_layer_size; c++)
1371 capParam(syn1neg_window, c + l2);
1372 }
1373 // Noise Contrastive Estimation
1374 if (nce > 0)
1375 for (d = 0; d < nce + 1; d++) {
1376 if (d == 0) {
1377 target = word;
1378 label = 1;
1379 } else {
1380 next_random = next_random
1381 * (unsigned long long) 25214903917 + 11;
1382 if (word_to_group != NULL
1383 && word_to_group[word] != -1) {
1384 target = word;
1385 while (target == word) {
1386 target = group_to_table[word_to_group[word]
1387 * table_size
1388 + (next_random >> 16) % table_size];
1389 next_random = next_random
1390 * (unsigned long long) 25214903917
1391 + 11;
1392 }
1393 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1394 } else {
1395 target =
1396 table[(next_random >> 16) % table_size];
1397 }
1398 if (target == 0)
1399 target = next_random % (vocab_size - 1) + 1;
1400 if (target == word)
1401 continue;
1402 label = 0;
1403 }
1404 l2 = target * window_layer_size;
1405 f = 0;
1406 for (c = 0; c < window_layer_size; c++)
1407 f += neu1[c] * syn1nce_window[c + l2];
1408 if (f > MAX_EXP)
1409 g = (label - 1) * alpha;
1410 else if (f < -MAX_EXP)
1411 g = (label - 0) * alpha;
1412 else {
1413 f = exp(f);
1414 g =
1415 (label
1416 - f
1417 / (noise_distribution[target]
1418 * nce + f)) * alpha;
1419 }
1420 for (c = 0; c < window_layer_size; c++)
1421 neu1e[c] += g * syn1nce_window[c + l2];
1422 for (c = 0; c < window_layer_size; c++)
1423 syn1nce_window[c + l2] += g * neu1[c];
1424 if (cap == 1)
1425 for (c = 0; c < window_layer_size; c++)
1426 capParam(syn1nce_window, c + l2);
1427 }
1428 // hidden -> in
1429 for (a = 0; a < window * 2 + 1; a++)
1430 if (a != window) {
1431 c = sentence_position - window + a;
1432 if (c < 0)
1433 continue;
1434 if (c >= sentence_length)
1435 continue;
1436 last_word = sen[c];
1437 if (last_word == -1)
1438 continue;
1439 window_offset = a * layer1_size;
1440 if (a > window)
1441 window_offset -= layer1_size;
1442 for (c = 0; c < layer1_size; c++)
1443 syn0[c + last_word * layer1_size] += neu1e[c
1444 + window_offset];
1445 }
1446 }
1447 } else if (type == 3) { //train structured skip-gram
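// In the structured variant each relative window position keeps its own output
// vectors: syn1neg_window has window_layer_size = layer1_size * window * 2
// columns per word, and window_offset (a * layer1_size, shifted once past the
// center) selects the block belonging to the current position, so word order
// is distinguished.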
1448 for (a = 0; a < window * 2 + 1; a++)
1449 if (a != window) {
1450 c = sentence_position - window + a;
1451 if (c < 0)
1452 continue;
1453 if (c >= sentence_length)
1454 continue;
1455 last_word = sen[c];
1456 if (last_word < 0)
1457 continue;
1458 l1 = last_word * layer1_size;
1459 window_offset = a * layer1_size;
1460 if (a > window)
1461 window_offset -= layer1_size;
1462 for (c = 0; c < layer1_size; c++)
1463 neu1e[c] = 0;
1464 // HIERARCHICAL SOFTMAX
1465 if (hs)
1466 for (d = 0; d < vocab[word].codelen; d++) {
1467 f = 0;
1468 l2 = vocab[word].point[d] * window_layer_size;
1469 // Propagate hidden -> output
1470 for (c = 0; c < layer1_size; c++)
1471 f += syn0[c + l1]
1472 * syn1_window[c + l2 + window_offset];
1473 if (f <= -MAX_EXP)
1474 continue;
1475 else if (f >= MAX_EXP)
1476 continue;
1477 else
1478 f = expTable[(int) ((f + MAX_EXP)
1479 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1480 // 'g' is the gradient multiplied by the learning rate
1481 g = (1 - vocab[word].code[d] - f) * alpha;
1482 // Propagate errors output -> hidden
1483 for (c = 0; c < layer1_size; c++)
1484 neu1e[c] += g
1485 * syn1_window[c + l2 + window_offset];
1486 // Learn weights hidden -> output
1487 for (c = 0; c < layer1_size; c++)
1488 syn1_window[c + l2 + window_offset] += g
1489 * syn0[c + l1];
1490 if (cap == 1)
1491 for (c = 0; c < layer1_size; c++)
1492 capParam(syn1_window, c + l2 + window_offset);
1493 }
1494 // NEGATIVE SAMPLING
1495 if (negative > 0)
1496 for (d = 0; d < negative + 1; d++) {
1497 if (d == 0) {
1498 target = word;
1499 label = 1;
1500 } else {
1501 next_random = next_random
1502 * (unsigned long long) 25214903917 + 11;
1503 if (word_to_group != NULL
1504 && word_to_group[word] != -1) {
1505 target = word;
1506 while (target == word) {
1507 target =
1508 group_to_table[word_to_group[word]
1509 * table_size
1510 + (next_random >> 16)
1511 % table_size];
1512 next_random =
1513 next_random
1514 * (unsigned long long) 25214903917
1515 + 11;
1516 }
1517 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1518 } else {
1519 target = table[(next_random >> 16)
1520 % table_size];
1521 }
1522 if (target == 0)
1523 target = next_random % (vocab_size - 1) + 1;
1524 if (target == word)
1525 continue;
1526 label = 0;
1527 }
1528 l2 = target * window_layer_size;
1529 f = 0;
1530 for (c = 0; c < layer1_size; c++)
1531 f +=
1532 syn0[c + l1]
1533 * syn1neg_window[c + l2
1534 + window_offset];
1535 if (f > MAX_EXP)
1536 g = (label - 1) * alpha;
1537 else if (f < -MAX_EXP)
1538 g = (label - 0) * alpha;
1539 else
1540 g =
1541 (label
1542 - expTable[(int) ((f + MAX_EXP)
1543 * (EXP_TABLE_SIZE
1544 / MAX_EXP / 2))])
1545 * alpha;
1546 if(debug_mode > 2 && ((long long) id) == 0) {
1547 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1548 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1549 }
1550 for (c = 0; c < layer1_size; c++)
1551 neu1e[c] +=
1552 g
1553 * syn1neg_window[c + l2
1554 + window_offset];
1555 for (c = 0; c < layer1_size; c++)
1556 syn1neg_window[c + l2 + window_offset] += g
1557 * syn0[c + l1];
1558 if (cap == 1)
1559 for (c = 0; c < layer1_size; c++)
1560 capParam(syn1neg_window,
1561 c + l2 + window_offset);
1562 }
1563 // Noise Contrastive Estimation
1564 if (nce > 0)
1565 for (d = 0; d < nce + 1; d++) {
1566 if (d == 0) {
1567 target = word;
1568 label = 1;
1569 } else {
1570 next_random = next_random
1571 * (unsigned long long) 25214903917 + 11;
1572 if (word_to_group != NULL
1573 && word_to_group[word] != -1) {
1574 target = word;
1575 while (target == word) {
1576 target =
1577 group_to_table[word_to_group[word]
1578 * table_size
1579 + (next_random >> 16)
1580 % table_size];
1581 next_random =
1582 next_random
1583 * (unsigned long long) 25214903917
1584 + 11;
1585 }
1586 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1587 } else {
1588 target = table[(next_random >> 16)
1589 % table_size];
1590 }
1591 if (target == 0)
1592 target = next_random % (vocab_size - 1) + 1;
1593 if (target == word)
1594 continue;
1595 label = 0;
1596 }
1597 l2 = target * window_layer_size;
1598 f = 0;
1599 for (c = 0; c < layer1_size; c++)
1600 f +=
1601 syn0[c + l1]
1602 * syn1nce_window[c + l2
1603 + window_offset];
1604 if (f > MAX_EXP)
1605 g = (label - 1) * alpha;
1606 else if (f < -MAX_EXP)
1607 g = (label - 0) * alpha;
1608 else {
1609 f = exp(f);
1610 g = (label
1611 - f
1612 / (noise_distribution[target]
1613 * nce + f)) * alpha;
1614 }
1615 for (c = 0; c < layer1_size; c++)
1616 neu1e[c] +=
1617 g
1618 * syn1nce_window[c + l2
1619 + window_offset];
1620 for (c = 0; c < layer1_size; c++)
1621 syn1nce_window[c + l2 + window_offset] += g
1622 * syn0[c + l1];
1623 if (cap == 1)
1624 for (c = 0; c < layer1_size; c++)
1625 capParam(syn1nce_window,
1626 c + l2 + window_offset);
1627 }
1628 // Learn weights input -> hidden
1629 for (c = 0; c < layer1_size; c++) {
1630 syn0[c + l1] += neu1e[c];
1631 if (syn0[c + l1] > 50)
1632 syn0[c + l1] = 50;
1633 if (syn0[c + l1] < -50)
1634 syn0[c + l1] = -50;
1635 }
1636 }
1637 } else if (type == 4) { //training senna
1638 // in -> hidden
1639 cw = 0;
1640 for (a = 0; a < window * 2 + 1; a++)
1641 if (a != window) {
1642 c = sentence_position - window + a;
1643 if (c < 0)
1644 continue;
1645 if (c >= sentence_length)
1646 continue;
1647 last_word = sen[c];
1648 if (last_word == -1)
1649 continue;
1650 window_offset = a * layer1_size;
1651 if (a > window)
1652 window_offset -= layer1_size;
1653 for (c = 0; c < layer1_size; c++)
1654 neu1[c + window_offset] += syn0[c
1655 + last_word * layer1_size];
1656 cw++;
1657 }
1658 if (cw) {
1659 for (a = 0; a < window_hidden_size; a++) {
1660 c = a * window_layer_size;
1661 for (b = 0; b < window_layer_size; b++) {
1662 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1663 }
1664 }
1665 if (hs)
1666 for (d = 0; d < vocab[word].codelen; d++) {
1667 f = 0;
1668 l2 = vocab[word].point[d] * window_hidden_size;
1669 // Propagate hidden -> output
1670 for (c = 0; c < window_hidden_size; c++)
1671 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1672 if (f <= -MAX_EXP)
1673 continue;
1674 else if (f >= MAX_EXP)
1675 continue;
1676 else
1677 f = expTable[(int) ((f + MAX_EXP)
1678 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1679 // 'g' is the gradient multiplied by the learning rate
1680 g = (1 - vocab[word].code[d] - f) * alpha;
1681 // Propagate errors output -> hidden
1682 for (c = 0; c < window_hidden_size; c++)
1683 neu2e[c] += dHardTanh(neu2[c], g) * g
1684 * syn_hidden_word[c + l2];
1685 // Learn weights hidden -> output
1686 for (c = 0; c < window_hidden_size; c++)
1687 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1688 * neu2[c];
1689 }
1690 // NEGATIVE SAMPLING
1691 if (negative > 0)
1692 for (d = 0; d < negative + 1; d++) {
1693 if (d == 0) {
1694 target = word;
1695 label = 1;
1696 } else {
1697 next_random = next_random
1698 * (unsigned long long) 25214903917 + 11;
1699 if (word_to_group != NULL
1700 && word_to_group[word] != -1) {
1701 target = word;
1702 while (target == word) {
1703 target = group_to_table[word_to_group[word]
1704 * table_size
1705 + (next_random >> 16) % table_size];
1706 next_random = next_random
1707 * (unsigned long long) 25214903917
1708 + 11;
1709 }
1710 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1711 } else {
1712 target =
1713 table[(next_random >> 16) % table_size];
1714 }
1715 if (target == 0)
1716 target = next_random % (vocab_size - 1) + 1;
1717 if (target == word)
1718 continue;
1719 label = 0;
1720 }
1721 l2 = target * window_hidden_size;
1722 f = 0;
1723 for (c = 0; c < window_hidden_size; c++)
1724 f += hardTanh(neu2[c])
1725 * syn_hidden_word_neg[c + l2];
1726 if (f > MAX_EXP)
1727 g = (label - 1) * alpha / negative;
1728 else if (f < -MAX_EXP)
1729 g = (label - 0) * alpha / negative;
1730 else
1731 g = (label
1732 - expTable[(int) ((f + MAX_EXP)
1733 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1734 * alpha / negative;
1735 for (c = 0; c < window_hidden_size; c++)
1736 neu2e[c] += dHardTanh(neu2[c], g) * g
1737 * syn_hidden_word_neg[c + l2];
1738 for (c = 0; c < window_hidden_size; c++)
1739 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1740 * g * neu2[c];
1741 }
1742 for (a = 0; a < window_hidden_size; a++)
1743 for (b = 0; b < window_layer_size; b++)
1744 neu1e[b] += neu2e[a]
1745 * syn_window_hidden[a * window_layer_size + b];
1746 for (a = 0; a < window_hidden_size; a++)
1747 for (b = 0; b < window_layer_size; b++)
1748 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1749 * neu1[b];
1750 // hidden -> in
1751 for (a = 0; a < window * 2 + 1; a++)
1752 if (a != window) {
1753 c = sentence_position - window + a;
1754 if (c < 0)
1755 continue;
1756 if (c >= sentence_length)
1757 continue;
1758 last_word = sen[c];
1759 if (last_word == -1)
1760 continue;
1761 window_offset = a * layer1_size;
1762 if (a > window)
1763 window_offset -= layer1_size;
1764 for (c = 0; c < layer1_size; c++)
1765 syn0[c + last_word * layer1_size] += neu1e[c
1766 + window_offset];
1767 }
1768 }
1769 } else if(type == 5) {
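// type 5 does no gradient training at all: for every co-occurrence it only
// calls inc_collocator(cdb, word, last_word, a - window), i.e. it records
// position-aware co-occurrence counts in the external collocator database
// (see collocatordb.h); presumably cdb is opened elsewhere before training.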
1770 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1771 c = sentence_position - window + a;
1772 if (c < 0) continue;
1773 if (c >= sentence_length) continue;
1774 last_word = sen[c];
1775 if (last_word == -1) continue;
1776 inc_collocator(cdb, word, last_word, a - window);
1777 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1778 // cw++;
1779 }
1780 } else {
1781 printf("unknown type %i\n", type);
1782 exit(0);
1783 }
1784 sentence_position++;
1785 if (sentence_position >= sentence_length) {
1786 sentence_length = 0;
1787 continue;
1788 }
1789 }
1790 fclose(fi);
1791 free(neu1);
1792 free(neu1e);
1793 threadPos[(long) id] = -1;
1794 pthread_exit(NULL);
1795}
1796
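// ShowCollocations(): for every word with index >= cc and for every window
// position, print the context word whose position-specific output vector in
// syn1neg_window responds most strongly to the word's input vector (sigmoid of
// the dot product), plus a noisy-or style sum over positions
// (target_sums[t] += (1 - target_sums[t]) * f) and the N = 10 best
// (word, activation, position) triples overall.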
1797void ShowCollocations() {
1798 long a, b, c, d, e, window_offset, target, max_target = 0, maxmax_target;
1799 real f, max_f, maxmax_f;
1800 real *target_sums, bestf[MAX_CC], worstbest;
1801 long besti[MAX_CC];
1802 int N = 10, bestp[MAX_CC];
1803 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1804
1805 for (d = cc; d < vocab_size; d++) {
1806 for (b = 0; b < vocab_size; b++)
1807 target_sums[b] = 0;
1808 for (b = 0; b < N; b++)
1809 bestf[b] = -1;
1810 worstbest = -1;
1811
1812 maxmax_f = -1;
1813 maxmax_target = 0;
1814 for (a = window * 2 + 1; a >=0; a--) {
1815 if (a != window) {
1816 max_f = -1;
1817 window_offset = a * layer1_size;
1818 if (a > window)
1819 window_offset -= layer1_size;
1820 for(target = 0; target < vocab_size; target ++) {
1821 if(target == d)
1822 continue;
1823 f = 0;
1824 for (c = 0; c < layer1_size; c++)
1825 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1826 if (f < -MAX_EXP)
1827 continue;
1828 else if (f > MAX_EXP)
1829 continue;
1830 else
1831 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1832 if(f > max_f) {
1833 max_f = f;
1834 max_target = target;
1835 }
1836 target_sums[target] += (1-target_sums[target]) * f;
1837 if(f > worstbest) {
1838 for (b = 0; b < N; b++) {
1839 if (f > bestf[b]) {
1840 for (e = N - 1; e > b; e--) {
1841 bestf[e] = bestf[e - 1];
1842 besti[e] = besti[e - 1];
1843 bestp[e] = bestp[e - 1];
1844 }
1845 bestf[b] = f;
1846 besti[b] = target;
1847 bestp[b] = window-a;
1848 break;
1849 }
1850 }
1851 worstbest = bestf[N - 1];
1852 }
1853 }
1854 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1855 if (max_f > maxmax_f) {
1856 maxmax_f = max_f;
1857 maxmax_target = max_target;
1858 }
1859 } else {
1860 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1861 }
1862 }
1863 max_f = -1;
1864 for (b = 0; b < vocab_size; b++) {
1865 if (target_sums[b] > max_f) {
1866 max_f = target_sums[b];
1867 max_target = b;
1868 }
1869 }
1870 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1871 vocab[max_target].word, max_f, vocab[maxmax_target].word,
1872 maxmax_f);
1873 for (b = 0; b < N && bestf[b] > -1; b++)
1874 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1875 printf("\n");
1876 }
1877}
1878
1879void TrainModel() {
1880 long a, b, c, d;
1881 FILE *fo;
1882 pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // +1 slot for the monitor thread
1883 threadPos = malloc(num_threads * sizeof(long long));
1884 threadIters = malloc(num_threads * sizeof(int));
1885 char *timebuf = malloc(80);
1886 printf("Starting training using file %s\n", train_file);
1887 starting_alpha = alpha;
1888 if (read_vocab_file[0] != 0)
1889 ReadVocab();
1890 else
1891 LearnVocabFromTrainFile();
1892 if (save_vocab_file[0] != 0)
1893 SaveVocab();
1894 if (output_file[0] == 0)
1895 return;
1896 InitNet();
1897 if (cc > 0)
1898 ShowCollocations();
1899 if (negative > 0 || nce > 0)
1900 InitUnigramTable();
1901 if (negative_classes_file[0] != 0)
1902 InitClassUnigramTable();
1903 start = time(NULL);
1904 start_clock = clock();
1905 for (a = 0; a < num_threads; a++)
1906 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1907 if(debug_mode > 1)
1908 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
1909 for (a = 0; a < num_threads; a++)
1910 pthread_join(pt[a], NULL);
1911 if(debug_mode > 1) {
1912 pthread_join(pt[num_threads], NULL);
1913 clock_t now = time(NULL);
1914 clock_t now_clock = clock();
1915 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
1916 if(type == 5) // don't save vectors for classic collocators
1917 return;
1918 printf("Saving vectors to %s ...", output_file);
1919 fflush(stdout);
1920 }
1921 fo = fopen(output_file, "wb");
1922 if (classes == 0) {
1923 // Save the word vectors
1924 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1925 for (a = 0; a < vocab_size; a++) {
1926 fprintf(fo, "%s ", vocab[a].word);
1927 if (binary)
1928 for (b = 0; b < layer1_size; b++)
1929 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1930 else
1931 for (b = 0; b < layer1_size; b++)
1932 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1933 fprintf(fo, "\n");
1934 }
1935 if(debug_mode > 1)
1936 fprintf(stderr, "\n");
1937 } else {
1938 // Run K-means on the word vectors
1939 int clcn = classes, iter = 10, closeid;
1940 int *centcn = (int *) malloc(classes * sizeof(int));
1941 int *cl = (int *) calloc(vocab_size, sizeof(int));
1942 real closev, x;
1943 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1944 for (a = 0; a < vocab_size; a++)
1945 cl[a] = a % clcn;
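		// K-means iterations: recompute centroids from the current assignments, then reassign words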
1946 for (a = 0; a < iter; a++) {
1947 for (b = 0; b < clcn * layer1_size; b++)
1948 cent[b] = 0;
1949 for (b = 0; b < clcn; b++)
1950 centcn[b] = 1;
1951 for (c = 0; c < vocab_size; c++) {
1952 for (d = 0; d < layer1_size; d++)
1953 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1954 centcn[cl[c]]++;
1955 }
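			// average each centroid and normalize it to unit length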
1956 for (b = 0; b < clcn; b++) {
1957 closev = 0;
1958 for (c = 0; c < layer1_size; c++) {
1959 cent[layer1_size * b + c] /= centcn[b];
1960 closev += cent[layer1_size * b + c]
1961 * cent[layer1_size * b + c];
1962 }
1963 closev = sqrt(closev);
1964 for (c = 0; c < layer1_size; c++)
1965 cent[layer1_size * b + c] /= closev;
1966 }
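			// reassign every word to the centroid with the largest dot product (cosine similarity)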
1967 for (c = 0; c < vocab_size; c++) {
1968 closev = -10;
1969 closeid = 0;
1970 for (d = 0; d < clcn; d++) {
1971 x = 0;
1972 for (b = 0; b < layer1_size; b++)
1973 x += cent[layer1_size * d + b]
1974 * syn0[c * layer1_size + b];
1975 if (x > closev) {
1976 closev = x;
1977 closeid = d;
1978 }
1979 }
1980 cl[c] = closeid;
1981 }
1982 }
1983 // Save the K-means classes
1984 for (a = 0; a < vocab_size; a++)
1985 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1986 free(centcn);
1987 free(cent);
1988 free(cl);
1989 }
1990 fclose(fo);
1991 if (save_net_file[0] != 0)
1992 SaveNet();
1993}
1994
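// Return the position of option str in argv, or -1 if it is absent;
// exit with an error if the option is the last argument and thus has no value.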
1995int ArgPos(char *str, int argc, char **argv) {
1996 int a;
1997 for (a = 1; a < argc; a++)
1998 if (!strcmp(str, argv[a])) {
1999 if (a == argc - 1) {
2000 printf("Argument missing for %s\n", str);
2001 exit(1);
2002 }
2003 return a;
2004 }
2005 return -1;
2006}
2007
2008void print_help() {
2009 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
2010 printf("Options:\n");
2011 printf("Parameters for training:\n");
2012 printf("\t-train <file>\n");
2013 printf("\t\tUse text data from <file> to train the model\n");
2014 printf("\t-output <file>\n");
2015 printf(
2016 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
2017 printf("\t-size <int>\n");
2018 printf("\t\tSet size of word vectors; default is 100\n");
2019 printf("\t-window <int>\n");
2020 printf("\t\tSet max skip length between words; default is 5\n");
2021 printf("\t-sample <float>\n");
2022 printf(
2023 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
2024 printf(
2025 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
2026 printf("\t-hs <int>\n");
2027 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
2028 printf("\t-negative <int>\n");
2029 printf(
2030 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
2031 printf("\t-negative-classes <file>\n");
2032 printf("\t\tNegative classes to sample from\n");
2033 printf("\t-nce <int>\n");
2034 printf(
2035 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
2036 printf("\t-threads <int>\n");
2037 printf("\t\tUse <int> threads (default 12)\n");
2038 printf("\t-iter <int>\n");
2039 printf("\t\tRun more training iterations (default 5)\n");
2040 printf("\t-min-count <int>\n");
2041 printf(
2042 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
2043 printf("\t-alpha <float>\n");
2044 printf(
2045 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
2046 printf("\t-classes <int>\n");
2047 printf(
2048 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
2049 printf("\t-debug <int>\n");
2050 printf(
2051 "\t\tSet the debug mode (default = 2 = more info during training)\n");
2052 printf("\t-binary <int>\n");
2053 printf(
2054			"\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
2055 printf("\t-save-vocab <file>\n");
2056 printf("\t\tThe vocabulary will be saved to <file>\n");
2057 printf("\t-read-vocab <file>\n");
2058 printf(
2059 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
2060 printf("\t-train-counts <int>\n");
2061 printf(
2062 "\t\tUse word counts of actual corpus rather than vocabulary counts; default is 1 (on)\n");
2063 printf("\t-read-net <file>\n");
2064 printf(
2065 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
2066 printf("\t-save-net <file>\n");
2067 printf("\t\tThe net parameters will be saved to <file>\n");
2068 printf("\t-magic-stop-file <file>\n");
2069	printf("\t\tIf the magic file <file> exists, training will stop after the current cycle.\n");
2070 printf("\t-show-cc <int>\n");
2071 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
2072 printf("\t-type <int>\n");
2073 printf(
2074			"\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type, 5 for storing positional bigrams)\n");
2075 printf("\t-cap <int>\n");
2076 printf(
2077			"\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
2078 printf("\nExamples:\n");
2079 printf(
2080 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
2081}
2082
2083int main(int argc, char **argv) {
2084 int i;
2085 setlocale(LC_ALL, "");
2086 if (argc == 1) {
2087 print_help();
2088 return 0;
2089 }
2090 output_file[0] = 0;
2091 save_vocab_file[0] = 0;
2092 read_vocab_file[0] = 0;
2093 save_net_file[0] = 0;
2094 read_net_file[0] = 0;
2095 negative_classes_file[0] = 0;
2096 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2097 print_help();
2098 return(0);
2099 }
2100 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2101 print_help();
2102 return(0);
2103 }
2104 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2105 layer1_size = atoi(argv[i + 1]);
2106 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2107 strcpy(train_file, argv[i + 1]);
2108 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2109 strcpy(save_vocab_file, argv[i + 1]);
2110 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2111 strcpy(read_vocab_file, argv[i + 1]);
2112 if ((i = ArgPos((char *) "-train-counts", argc, argv)) > 0)
2113 tc = atoi(argv[i + 1]);
2114 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2115 strcpy(save_net_file, argv[i + 1]);
2116 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2117 strcpy(read_net_file, argv[i + 1]);
2118 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2119 strcpy(magic_stop_file, argv[i + 1]);
2120 if (access(magic_stop_file, F_OK ) != -1) {
2121 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2122 exit(1);
2123 }
2124 }
2125 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2126 debug_mode = atoi(argv[i + 1]);
2127 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2128 binary = atoi(argv[i + 1]);
2129 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2130 cc = atoi(argv[i + 1]);
2131 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2132 type = atoi(argv[i + 1]);
2133 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2134 strcpy(output_file, argv[i + 1]);
2135 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2136 window = atoi(argv[i + 1]);
2137 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2138 sample = atof(argv[i + 1]);
2139 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2140 hs = atoi(argv[i + 1]);
2141 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2142 negative = atoi(argv[i + 1]);
2143 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2144 strcpy(negative_classes_file, argv[i + 1]);
2145 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2146 nce = atoi(argv[i + 1]);
2147 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2148 num_threads = atoi(argv[i + 1]);
2149 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2150 iter = atoi(argv[i + 1]);
2151 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2152 min_count = atoi(argv[i + 1]);
2153 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2154 classes = atoi(argv[i + 1]);
2155 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2156 cap = atoi(argv[i + 1]);
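	// cbow, cwindow and senna-type models use a larger default learning rate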
2157 if (type == 0 || type == 2 || type == 4)
2158 alpha = 0.05;
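	// type 5 (positional bigrams): disable subsampling and open the collocator database for writing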
2159 if (type==5) {
2160 sample = 0;
2161 cdb = open_collocatordb_for_write(output_file);
2162 }
2163 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2164 alpha = atof(argv[i + 1]);
2165 vocab = (struct vocab_word *) calloc(vocab_max_size,
2166 sizeof(struct vocab_word));
2167 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2168 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2169 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2170 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2171		expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute sigmoid: f(x) = e^x / (e^x + 1)
2172 }
2173 SaveArgs(argc, argv);
2174 TrainModel();
2175 return 0;
2176}
2177