1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <locale.h>
16#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
19#include <unistd.h>
20#include <math.h>
#include <time.h>   // time(), localtime(), strftime(), clock() are used below
21#include <pthread.h>
22#include <collocatordb.h>
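// collocatordb.h (external): presumably the source of COLLOCATORDB and inc_collocator(),
// which type 5 uses to record positional co-occurrence counts instead of training vectors.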
23
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
28#define MAX_CC 100
29#define MAX_CODE_LENGTH 40
30
31const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
32
33typedef float real; // Precision of float numbers
34
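// Vocabulary entry: cn is the corpus frequency, word the token string, code/codelen the word's
// Huffman code (used by hierarchical softmax), and point the indices of the inner tree nodes on
// the path from the root to the word.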
35struct vocab_word {
36 long long cn;
37 int *point;
38 char *word, *code, codelen;
39};
40
41char train_file[MAX_STRING], output_file[MAX_STRING];
42char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
43char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
44char magic_stop_file[MAX_STRING];
45
46struct vocab_word *vocab;
47int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
48 num_threads = 12, min_reduce = 1;
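// type selects the architecture: 0 = CBOW, 1 = skip-gram, 2 = cwindow, 3 = structured skip-gram,
// 4 = senna-style feed-forward net, 5 = collocation counting only (no vectors are trained or saved).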
49int *vocab_hash;
50long long *threadPos;
51int *threadIters;
52long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
53long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
54 classes = 0;
55real alpha = 0.025, starting_alpha, sample = 1e-3;
56real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
57real avgWordLength=0;
58clock_t start, start_clock;
59
60real *syn1_window, *syn1neg_window, *syn1nce_window;
61int w_offset, window_layer_size;
62
63int window_hidden_size = 500;
64real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
65 *syn_hidden_word_nce;
66
67int hs = 0, negative = 5;
68const int table_size = 1e8;
69int *table;
70
71long cc = 0;
72long tc = 1;
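// cc > 0: after initialization, ShowCollocations() dumps collocation responses for words from index cc on.
// tc > 0: when a vocabulary is read from file, word counts are recomputed on the current training file (-train-counts).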
73
74// contrastive negative sampling
75char negative_classes_file[MAX_STRING];
76int *word_to_group;
77int *group_to_table; //group_size*table_size
78int class_number;
79
80//nce
81real* noise_distribution;
82int nce = 0;
83
84//param caps
85real CAP_VALUE = 50;
86int cap = 0;
87
88COLLOCATORDB *cdb = NULL;
89
90void capParam(real* array, int index) {
91 if (array[index] > CAP_VALUE)
92 array[index] = CAP_VALUE;
93 else if (array[index] < -CAP_VALUE)
94 array[index] = -CAP_VALUE;
95}
96
97real hardTanh(real x) {
98 if (x >= 1) {
99 return 1;
100 } else if (x <= -1) {
101 return -1;
102 } else {
103 return x;
104 }
105}
106
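// Gradient gate for hardTanh: returns 0 only when x is already saturated and the update g would
// push it further outside [-1, 1]; otherwise the gradient is passed through unchanged.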
107real dHardTanh(real x, real g) {
108 if (x > 1 && g > 0) {
109 return 0;
110 }
111 if (x < -1 && g < 0) {
112 return 0;
113 }
114 return 1;
115}
116
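// Builds the table used for negative sampling: each word occupies a share of the table proportional
// to cn^0.75, so drawing a uniformly random entry samples words from the smoothed unigram
// distribution. The same probabilities are stored in noise_distribution for NCE.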
117void InitUnigramTable() {
118 int a, i;
119 long long train_words_pow = 0;
120 real d1, power = 0.75;
121 table = (int *) malloc(table_size * sizeof(int));
122 for (a = 0; a < vocab_size; a++)
123 train_words_pow += pow(vocab[a].cn, power);
124 i = 0;
125 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
126 for (a = 0; a < table_size; a++) {
127 table[a] = i;
128 if (a / (real) table_size > d1) {
129 i++;
130 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
131 }
132 if (i >= vocab_size)
133 i = vocab_size - 1;
134 }
135
136 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
137 for (a = 0; a < vocab_size; a++)
138 noise_distribution[a] = pow(vocab[a].cn, power)
139 / (real) train_words_pow;
140}
141
142// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
143void ReadWord(char *word, FILE *fin) {
144 int a = 0, ch;
145 while (!feof(fin)) {
146 ch = fgetc(fin);
147 if (ch == 13)
148 continue;
149 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
150 if (a > 0) {
151 if (ch == '\n')
152 ungetc(ch, fin);
153 break;
154 }
155 if (ch == '\n') {
156 strcpy(word, (char *) "</s>");
157 return;
158 } else
159 continue;
160 }
161 word[a] = ch;
162 a++;
163 if (a >= MAX_STRING - 1)
164 a--; // Truncate too long words
165 }
166 word[a] = 0;
167}
168
169// Returns hash value of a word
170int GetWordHash(char *word) {
171 unsigned long long a, hash = 0;
172 for (a = 0; a < strlen(word); a++)
173 hash = hash * 257 + word[a];
174 hash = hash % vocab_hash_size;
175 return hash;
176}
177
178// Returns position of a word in the vocabulary; if the word is not found, returns -1
179int SearchVocab(char *word) {
180 unsigned int hash = GetWordHash(word);
181 while (1) {
182 if (vocab_hash[hash] == -1)
183 return -1;
184 if (!strcmp(word, vocab[vocab_hash[hash]].word))
185 return vocab_hash[hash];
186 hash = (hash + 1) % vocab_hash_size;
187 }
188 return -1;
189}
190
191// Reads a word and returns its index in the vocabulary
192int ReadWordIndex(FILE *fin) {
193 char word[MAX_STRING];
194 ReadWord(word, fin);
195 if (feof(fin))
196 return -1;
197 return SearchVocab(word);
198}
199
200// Adds a word to the vocabulary
201int AddWordToVocab(char *word) {
202 unsigned int hash, length = strlen(word) + 1;
203 if (length > MAX_STRING)
204 length = MAX_STRING;
205 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
206 strcpy(vocab[vocab_size].word, word);
207 vocab[vocab_size].cn = 0;
208 vocab_size++;
209 // Reallocate memory if needed
210 if (vocab_size + 2 >= vocab_max_size) {
211 vocab_max_size += 1000;
212 vocab = (struct vocab_word *) realloc(vocab,
213 vocab_max_size * sizeof(struct vocab_word));
214 }
215 hash = GetWordHash(word);
216 while (vocab_hash[hash] != -1)
217 hash = (hash + 1) % vocab_hash_size;
218 vocab_hash[hash] = vocab_size - 1;
219 return vocab_size - 1;
220}
221
222// Used later for sorting by word counts
223int VocabCompare(const void *a, const void *b) {
224 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
225}
226
227// Sorts the vocabulary by frequency using word counts
228void SortVocab() {
229 int a, size;
230 unsigned int hash;
231 // Sort the vocabulary and keep </s> at the first position
232 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
233 for (a = 0; a < vocab_hash_size; a++)
234 vocab_hash[a] = -1;
235 size = vocab_size;
236 train_words = 0;
237 for (a = 0; a < size; a++) {
238 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
239 // Words occurring less than min_count times will be discarded from the vocab
240 if ((vocab[a].cn < min_count) && (a != 0)) {
241 vocab_size--;
242 free(vocab[a].word);
243 } else {
244 // Hash has to be re-computed, since it is no longer valid after sorting
245 hash = GetWordHash(vocab[a].word);
246 while (vocab_hash[hash] != -1)
247 hash = (hash + 1) % vocab_hash_size;
248 vocab_hash[hash] = a;
249 train_words += vocab[a].cn;
250 }
251 }
252 avgWordLength /= train_words;
253 vocab = (struct vocab_word *) realloc(vocab,
254 (vocab_size + 1) * sizeof(struct vocab_word));
255 // Allocate memory for the binary tree construction
256 for (a = 0; a < vocab_size; a++) {
257 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
258 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
259 }
260}
261
262// Reduces the vocabulary by removing infrequent tokens
263void ReduceVocab() {
264 int a, b = 0;
265 unsigned int hash;
266 for (a = 0; a < vocab_size; a++)
267 if (vocab[a].cn > min_reduce) {
268 vocab[b].cn = vocab[a].cn;
269 vocab[b].word = vocab[a].word;
270 b++;
271 } else
272 free(vocab[a].word);
273 vocab_size = b;
274 for (a = 0; a < vocab_hash_size; a++)
275 vocab_hash[a] = -1;
276 for (a = 0; a < vocab_size; a++) {
277 // Hash has to be re-computed, since it is no longer valid
278 hash = GetWordHash(vocab[a].word);
279 while (vocab_hash[hash] != -1)
280 hash = (hash + 1) % vocab_hash_size;
281 vocab_hash[hash] = a;
282 }
283 fflush(stdout);
284 min_reduce++;
285}
286
287// Create binary Huffman tree using the word counts
288// Frequent words will have short unique binary codes
289void CreateBinaryTree() {
290 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
291 char code[MAX_CODE_LENGTH];
292 long long *count = (long long *) calloc(vocab_size * 2 + 1,
293 sizeof(long long));
294 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
295 sizeof(long long));
296 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
297 sizeof(long long));
298 // todo: this needs to operate on a sorted copy of vocab[a].cn if we use local counts
299 for (a = 0; a < vocab_size; a++)
300 count[a] = vocab[a].cn;
301 for (a = vocab_size; a < vocab_size * 2; a++)
302 count[a] = 1e15;
303 pos1 = vocab_size - 1;
304 pos2 = vocab_size;
305 // Following algorithm constructs the Huffman tree by adding one node at a time
306 for (a = 0; a < vocab_size - 1; a++) {
307 // First, find two smallest nodes 'min1, min2'
308 if (pos1 >= 0) {
309 if (count[pos1] < count[pos2]) {
310 min1i = pos1;
311 pos1--;
312 } else {
313 min1i = pos2;
314 pos2++;
315 }
316 } else {
317 min1i = pos2;
318 pos2++;
319 }
320 if (pos1 >= 0) {
321 if (count[pos1] < count[pos2]) {
322 min2i = pos1;
323 pos1--;
324 } else {
325 min2i = pos2;
326 pos2++;
327 }
328 } else {
329 min2i = pos2;
330 pos2++;
331 }
332 count[vocab_size + a] = count[min1i] + count[min2i];
333 parent_node[min1i] = vocab_size + a;
334 parent_node[min2i] = vocab_size + a;
335 binary[min2i] = 1;
336 }
337 // Now assign binary code to each vocabulary word
338 for (a = 0; a < vocab_size; a++) {
339 b = a;
340 i = 0;
341 while (1) {
342 code[i] = binary[b];
343 point[i] = b;
344 i++;
345 b = parent_node[b];
346 if (b == vocab_size * 2 - 2)
347 break;
348 }
349 vocab[a].codelen = i;
350 vocab[a].point[0] = vocab_size - 2;
351 for (b = 0; b < i; b++) {
352 vocab[a].code[i - b - 1] = code[b];
353 vocab[a].point[i - b] = point[b] - vocab_size;
354 }
355 }
356 free(count);
357 free(binary);
358 free(parent_node);
359}
360
361void LearnVocabFromTrainFile() {
362 char word[MAX_STRING];
363 FILE *fin;
364 long long a, i;
365 for (a = 0; a < vocab_hash_size; a++)
366 vocab_hash[a] = -1;
367 fin = fopen(train_file, "rb");
368 if (fin == NULL) {
369 printf("ERROR: training data file not found!\n");
370 exit(1);
371 }
372 vocab_size = 0;
373 AddWordToVocab((char *) "</s>");
374 while (1) {
375 ReadWord(word, fin);
376 if (feof(fin))
377 break;
378 train_words++;
379 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
380 printf("%lldK%c", train_words / 1000, 13);
381 fflush(stdout);
382 }
383 i = SearchVocab(word);
384 if (i == -1) {
385 a = AddWordToVocab(word);
386 vocab[a].cn = 1;
387 } else
388 vocab[i].cn++;
389 if (vocab_size > vocab_hash_size * 0.7)
390 ReduceVocab();
391 }
392 SortVocab();
393 if (debug_mode > 0) {
394 printf("Vocab size: %lld\n", vocab_size);
395 printf("Words in train file: %lld\n", train_words);
396 }
397 file_size = ftell(fin);
398 fclose(fin);
399}
400
401void SaveVocab() {
402 long long i;
403 FILE *fo = fopen(save_vocab_file, "wb");
404 for (i = 0; i < vocab_size; i++)
405 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
406 fclose(fo);
407}
408
409void ReadVocab() {
410 long long a, i = 0;
411 char c;
412 char word[MAX_STRING];
413 FILE *fin = fopen(read_vocab_file, "rb");
414 if (fin == NULL) {
415 printf("Vocabulary file not found\n");
416 exit(1);
417 }
418 for (a = 0; a < vocab_hash_size; a++)
419 vocab_hash[a] = -1;
420 vocab_size = 0;
421 while (1) {
422 ReadWord(word, fin);
423 if (feof(fin))
424 break;
425 a = AddWordToVocab(word);
426 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
427 i++;
428 }
429 fclose(fin);
430 SortVocab();
431
432 if (tc > 0) {
433 // recalculate counts for the current corpus
434 // adapted from LearnVocabFromTrainFile()
435 // note that we don't sort or rehash the vocabulary again, we only adapt vocab[.].cn.
436 fin = fopen(train_file, "rb");
437 if (fin == NULL) {
438 printf("ERROR: training data file not found!\n");
439 exit(1);
440 }
441 // reset vocabulary counts
442 for (a = 0; a < vocab_size; a++)
443 vocab[a].cn = 0;
444 train_words = 0;
445 while (1) {
446 ReadWord(word, fin);
447 if (feof(fin))
448 break;
449 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
450 printf("%lldK%c", train_words / 1000, 13);
451 fflush(stdout);
452 }
453 i = SearchVocab(word);
454 // the word must be in the vocabulary but we don't issue a warning,
455 // because it may have been cut off due to minfreq.
456 if (i >= 0) {
457 vocab[i].cn++;
458 train_words++;
459 }
460 }
461 // we cannot have 0 counts.
462 for (a = 0; a < vocab_size; a++) {
463 if(vocab[a].cn == 0) {
464 vocab[a].cn = 1;
465 train_words++;
466 }
467 }
468 if (debug_mode > 0) {
469 printf("Vocab size: %lld\n", vocab_size);
470 printf("Words in current train file: %'lld\n", train_words);
471 }
472 fseek(fin, 0, SEEK_END);
473 file_size = ftell(fin);
474 fclose(fin);
475 }
476 train_words = file_size / avgWordLength;
477 if(debug_mode > 0)
478 printf("Estimated words in train file: %'lld\n", train_words);
479}
480
481void InitClassUnigramTable() {
482 // TODO: this probably needs to be adapted for dealing with subcorpus adjusted vocabulary counts
483 long long a, c;
484 printf("loading class unigrams \n");
485 FILE *fin = fopen(negative_classes_file, "rb");
486 if (fin == NULL) {
487 printf("ERROR: class file not found!\n");
488 exit(1);
489 }
490 word_to_group = (int *) malloc(vocab_size * sizeof(int));
491 for (a = 0; a < vocab_size; a++)
492 word_to_group[a] = -1;
493 char class[MAX_STRING];
494 char prev_class[MAX_STRING];
495 prev_class[0] = 0;
496 char word[MAX_STRING];
497 class_number = -1;
498 while (1) {
499 if (feof(fin))
500 break;
501 ReadWord(class, fin);
502 ReadWord(word, fin);
503 int word_index = SearchVocab(word);
504 if (word_index != -1) {
505 if (strcmp(class, prev_class) != 0) {
506 class_number++;
507 strcpy(prev_class, class);
508 }
509 word_to_group[word_index] = class_number;
510 }
511 ReadWord(word, fin);
512 }
513 class_number++;
514 fclose(fin);
515
516 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
517 long long train_words_pow = 0;
518 real d1, power = 0.75;
519
520 for (c = 0; c < class_number; c++) {
521 long long offset = c * table_size;
522 train_words_pow = 0;
523 for (a = 0; a < vocab_size; a++)
524 if (word_to_group[a] == c)
525 train_words_pow += pow(vocab[a].cn, power);
526 int i = 0;
527 while (word_to_group[i] != c && i < vocab_size)
528 i++;
529 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
530 for (a = 0; a < table_size; a++) {
531 //printf("index %lld , word %d\n", a, i);
532 group_to_table[offset + a] = i;
533 if (a / (real) table_size > d1) {
534 i++;
535 while (word_to_group[i] != c && i < vocab_size)
536 i++;
537 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
538 }
539 if (i >= vocab_size)
540 while (word_to_group[i] != c && i >= 0)
541 i--;
542 }
543 }
544}
545
546void SaveArgs(int argc, char **argv) {
547 unsigned int i;
548 char args_file[MAX_STRING];
549 strcpy(args_file, output_file);
550 strcat(args_file, ".args");
551 FILE *fargs = fopen(args_file, "w");
552 if (fargs == NULL) {
553 printf("Cannot save args to %s.\n", args_file);
554 return;
555 }
556
557 for(i=1; i<argc; i++)
558 fprintf(fargs, "%s ", argv[i]);
559
560 fprintf(fargs, "\n");
561 fclose(fargs);
562
563 return;
564}
565
566void SaveNet() {
567 if (type == 4 || negative <= 0) {
568 fprintf(stderr,
569 "save-net only supported for type 0,1,2,3 with negative sampling\n");
570 return;
571 }
572
573 FILE *fnet = fopen(save_net_file, "wb");
574 if (fnet == NULL) {
575 printf("Net parameter file not found\n");
576 exit(1);
577 }
578 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
579 if (type == 0 || type == 1) {
580 fwrite(syn1neg, sizeof(real), vocab_size * layer1_size, fnet);
581 }
582 if (type == 2 || type == 3) {
583 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
584 }
585 fclose(fnet);
586}
587
588void InitNet() {
589 long long a, b;
590 unsigned long long next_random = 1;
591 long long read;
592
593 window_layer_size = layer1_size * window * 2;
594 a = posix_memalign((void **) &syn0, 128,
595 (long long) vocab_size * layer1_size * sizeof(real));
596 if (syn0 == NULL) {
597 printf("Memory allocation failed\n");
598 exit(1);
599 }
600
601 if (hs) {
602 a = posix_memalign((void **) &syn1, 128,
603 (long long) vocab_size * layer1_size * sizeof(real));
604 if (syn1 == NULL) {
605 printf("Memory allocation failed\n");
606 exit(1);
607 }
608 a = posix_memalign((void **) &syn1_window, 128,
609 (long long) vocab_size * window_layer_size * sizeof(real));
610 if (syn1_window == NULL) {
611 printf("Memory allocation failed\n");
612 exit(1);
613 }
614 a = posix_memalign((void **) &syn_hidden_word, 128,
615 (long long) vocab_size * window_hidden_size * sizeof(real));
616 if (syn_hidden_word == NULL) {
617 printf("Memory allocation failed\n");
618 exit(1);
619 }
620
621 for (a = 0; a < vocab_size; a++)
622 for (b = 0; b < layer1_size; b++)
623 syn1[a * layer1_size + b] = 0;
624 for (a = 0; a < vocab_size; a++)
625 for (b = 0; b < window_layer_size; b++)
626 syn1_window[a * window_layer_size + b] = 0;
627 for (a = 0; a < vocab_size; a++)
628 for (b = 0; b < window_hidden_size; b++)
629 syn_hidden_word[a * window_hidden_size + b] = 0;
630 }
631 if (negative > 0) {
632 if (type == 0 || type == 1) {
633 a = posix_memalign((void **) &syn1neg, 128,
634 (long long) vocab_size * layer1_size * sizeof(real));
635 if (syn1neg == NULL) {
636 printf("Memory allocation failed\n");
637 exit(1);
638 }
639 for (a = 0; a < vocab_size; a++)
640 for (b = 0; b < layer1_size; b++)
641 syn1neg[a * layer1_size + b] = 0;
642 } else if (type == 2 || type == 3) {
643 a = posix_memalign((void **) &syn1neg_window, 128,
644 (long long) vocab_size * window_layer_size * sizeof(real));
645 if (syn1neg_window == NULL) {
646 printf("Memory allocation failed\n");
647 exit(1);
648 }
649 for (a = 0; a < vocab_size; a++)
650 for (b = 0; b < window_layer_size; b++)
651 syn1neg_window[a * window_layer_size + b] = 0;
652 } else if (type == 4) {
653 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
654 (long long) vocab_size * window_hidden_size * sizeof(real));
655 if (syn_hidden_word_neg == NULL) {
656 printf("Memory allocation failed\n");
657 exit(1);
658 }
659 for (a = 0; a < vocab_size; a++)
660 for (b = 0; b < window_hidden_size; b++)
661 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
662 }
663 }
664 if (nce > 0) {
665 a = posix_memalign((void **) &syn1nce, 128,
666 (long long) vocab_size * layer1_size * sizeof(real));
667 if (syn1nce == NULL) {
668 printf("Memory allocation failed\n");
669 exit(1);
670 }
671 a = posix_memalign((void **) &syn1nce_window, 128,
672 (long long) vocab_size * window_layer_size * sizeof(real));
673 if (syn1nce_window == NULL) {
674 printf("Memory allocation failed\n");
675 exit(1);
676 }
677 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
678 (long long) vocab_size * window_hidden_size * sizeof(real));
679 if (syn_hidden_word_nce == NULL) {
680 printf("Memory allocation failed\n");
681 exit(1);
682 }
683
684 for (a = 0; a < vocab_size; a++)
685 for (b = 0; b < layer1_size; b++)
686 syn1nce[a * layer1_size + b] = 0;
687 for (a = 0; a < vocab_size; a++)
688 for (b = 0; b < window_layer_size; b++)
689 syn1nce_window[a * window_layer_size + b] = 0;
690 for (a = 0; a < vocab_size; a++)
691 for (b = 0; b < window_hidden_size; b++)
692 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
693 }
694
695 if (type == 4) {
696 a = posix_memalign((void **) &syn_window_hidden, 128,
697 window_hidden_size * window_layer_size * sizeof(real));
698 if (syn_window_hidden == NULL) {
699 printf("Memory allocation failed\n");
700 exit(1);
701 }
702 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
703 next_random = next_random * (unsigned long long) 25214903917 + 11;
704 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
705 - 0.5) / (window_hidden_size * window_layer_size);
706 }
707 }
708
709 if (read_net_file[0] == 0) {
710 for (a = 0; a < vocab_size; a++)
711 for (b = 0; b < layer1_size; b++) {
712 next_random = next_random * (unsigned long long) 25214903917
713 + 11;
714 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
715 / (real) 65536) - 0.5) / layer1_size;
716 }
717 } else if ((type == 0 || type == 1) && negative > 0) {
718 FILE *fnet = fopen(read_net_file, "rb");
719 if (fnet == NULL) {
720 printf("Net parameter file not found\n");
721 exit(1);
722 }
723 printf("vocab-size: %lld, layer1_size: %lld\n",
724 vocab_size, layer1_size);
725 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
726 if (read != vocab_size * layer1_size) {
727 fprintf(stderr, "read-net failed %lld\n", read);
728 exit(-1);
729 }
730 read = fread(syn1neg, sizeof(real),
731 vocab_size * layer1_size, fnet);
732 if (read != (long long) vocab_size * layer1_size) {
733 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n",
734 read,
735    (long long) vocab_size * layer1_size);
736 exit(-1);
737 }
738 fgetc(fnet);
739 if (!feof(fnet)) {
740 fprintf(stderr,
741 "Remaining bytes in net-file after read-net. File position: %ld\n",
742 ftell(fnet));
743 exit(-1);
744 }
745 fclose(fnet);
746 } else if ((type == 2 || type == 3) && negative > 0) {
747 FILE *fnet = fopen(read_net_file, "rb");
748 if (fnet == NULL) {
749 printf("Net parameter file not found\n");
750 exit(1);
751 }
752 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n",
753 vocab_size, layer1_size, window_layer_size);
754 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
755 if (read != vocab_size * layer1_size) {
756 fprintf(stderr, "read-net failed %lld\n", read);
757 exit(-1);
758 }
759 read = fread(syn1neg_window, sizeof(real),
760 vocab_size * window_layer_size, fnet);
761 if (read != (long long) vocab_size * window_layer_size) {
762 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n",
763 read,
764    (long long) vocab_size * window_layer_size);
765 exit(-1);
766 }
767 fgetc(fnet);
768 if (!feof(fnet)) {
769 fprintf(stderr,
770 "Remaining bytes in net-file after read-net. File position: %ld\n",
771 ftell(fnet));
772 exit(-1);
773 }
774 fclose(fnet);
775 } else {
776 fprintf(stderr,
777 "read-net only supported for type 3 with negative sampling\n");
778 exit(-1);
779 }
780
781 CreateBinaryTree();
782}
783
784char *currentDateTime(char *buf, real offset) {
785 time_t t;
786 time(&t);
787 t += (long) offset;
788 struct tm tstruct;
789 tstruct = *localtime(&t);
790 strftime(buf, 80, "%c", &tstruct);
791 return buf;
792}
793
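// Progress monitor: once per second, combines each worker's file position and remaining iterations
// into an overall completion estimate and prints learning rate, throughput, elapsed time, time to go
// and ETA. Terminates once every worker has set its threadPos entry to -1.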
794void *MonitorThread(void *id) {
795  char *timebuf = malloc(80);
796 int i, n=num_threads;
797 long long sum;
798 sleep(1);
799 while(n > 0) {
800 sleep(1);
801 sum = n = 0;
802 for(i=0; i < num_threads; i++) {
803 if(threadPos[i] >= 0) {
804 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
805 n++;
806 } else {
807 sum += iter * file_size / num_threads;
808 }
809 }
810 if(n == 0)
811 break;
812 real finished_portion = (real) sum / (float) (file_size * iter);
813 long long now = time(NULL);
814 long long elapsed = (now - start);
815 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
816
817 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
818 alpha,
819 finished_portion * 100,
820 (float) sum / elapsed / 1000,
821 elapsed,
822 ttg,
823 currentDateTime(timebuf, ttg)
824 );
825 fflush(stdout);
826 }
827 pthread_exit(NULL);
828}
829
830void *TrainModelThread(void *id) {
831 long long a, b, d, cw, word, last_word, sentence_length = 0,
832 sentence_position = 0;
833 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
834 long long l1, l2, c, target, label, local_iter = iter;
835 unsigned long long next_random = (long long) id;
836 real f, g;
837 int input_len_1 = layer1_size;
838 int window_offset = -1;
839 if (type == 2 || type == 4) {
840 input_len_1 = window_layer_size;
841 }
842 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
843 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
844 threadIters[(long) id] = iter;
845
846 int input_len_2 = 0;
847 if (type == 4) {
848 input_len_2 = window_hidden_size;
849 }
850 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
851 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
852
853 FILE *fi = fopen(train_file, "rb");
854 long long start_pos = file_size / (long long) num_threads * (long long) id;
855 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
856 long long current_pos = start_pos;
857 long long last_pos = start_pos;
858 fseek(fi, start_pos, SEEK_SET);
859 while (1) {
860 if (word_count - last_word_count > 10000) {
861 // if ((current_pos - last_pos > 100000)) {
862 // PF: changed back, because it seems that alpha is not correctly adjusted otherwise.
863 word_count_actual += word_count - last_word_count;
864 last_pos = current_pos;
865 last_word_count = word_count;
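			// Decay the learning rate linearly with overall progress
			// (word_count_actual / (iter * train_words)), but never below 0.01% of the starting rate.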
866 alpha = starting_alpha
867 * (1 - word_count_actual / (real) (iter * train_words + 1));
868 if (alpha < starting_alpha * 0.0001)
869 alpha = starting_alpha * 0.0001;
870 }
871 if (sentence_length == 0) {
872 while (1) {
873 word = ReadWordIndex(fi);
874 if (feof(fi))
875 break;
876 if (word == -1)
877 continue;
878 word_count++;
879 if (word == 0)
880 break;
881    // The subsampling randomly discards frequent words while keeping the ranking the same
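				// With f = vocab[word].cn / train_words, a word is kept with probability
				// min(1, sqrt(sample / f) + sample / f); 'ran' below is exactly that value.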
882 if (sample > 0) {
883 real ran = (sqrt(vocab[word].cn / (sample * train_words))
884 + 1) * (sample * train_words) / vocab[word].cn;
885 next_random = next_random * (unsigned long long) 25214903917
886 + 11;
887 if (ran < (next_random & 0xFFFF) / (real) 65536) {
888 if (type == 3) // in structured skipgrams
889 word = -2; // keep the window position correct
890 else
891 continue;
892 }
893 }
894 sen[sentence_length] = word;
895 sentence_length++;
896 if (sentence_length >= MAX_SENTENCE_LENGTH)
897 break;
898 }
899 sentence_position = 0;
900 }
901 current_pos = threadPos[(long) id] = ftell(fi);
902 if (feof(fi) || current_pos >= end_pos ) {
903 word_count_actual += word_count - last_word_count;
904 threadIters[(long) id]--;
905 local_iter--;
906 if (local_iter == 0)
907 break;
908 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
909 printf("Magic stop file %s found. Stopping traing ...\n", magic_stop_file);
910 break;
911 }
912 word_count = 0;
913 last_word_count = 0;
914 sentence_length = 0;
915 fseek(fi, file_size / (long long) num_threads * (long long) id,
916 SEEK_SET);
917 continue;
918 }
919 word = sen[sentence_position];
920 while (word == -2 && sentence_position<sentence_length)
921 word = sen[++sentence_position];
922 if (sentence_position>=sentence_length) {
923 sentence_length=0;
924 continue;
925 }
926 if (word < 0)
927 continue;
928 for (c = 0; c < input_len_1; c++)
929 neu1[c] = 0;
930 for (c = 0; c < input_len_1; c++)
931 neu1e[c] = 0;
932 for (c = 0; c < input_len_2; c++)
933 neu2[c] = 0;
934 for (c = 0; c < input_len_2; c++)
935 neu2e[c] = 0;
936 next_random = next_random * (unsigned long long) 25214903917 + 11;
937 b = next_random % window;
938 if (type == 0) { //train the cbow architecture
939 // in -> hidden
940 cw = 0;
941 for (a = b; a < window * 2 + 1 - b; a++)
942 if (a != window) {
943 c = sentence_position - window + a;
944 if (c < 0)
945 continue;
946 if (c >= sentence_length)
947 continue;
948 last_word = sen[c];
949 if (last_word == -1)
950 continue;
951 for (c = 0; c < layer1_size; c++)
952 neu1[c] += syn0[c + last_word * layer1_size];
953 cw++;
954 }
955 if (cw) {
956 for (c = 0; c < layer1_size; c++)
957 neu1[c] /= cw;
958 if (hs)
959 for (d = 0; d < vocab[word].codelen; d++) {
960 f = 0;
961 l2 = vocab[word].point[d] * layer1_size;
962 // Propagate hidden -> output
963 for (c = 0; c < layer1_size; c++)
964 f += neu1[c] * syn1[c + l2];
965 if (f <= -MAX_EXP)
966 continue;
967 else if (f >= MAX_EXP)
968 continue;
969 else
970 f = expTable[(int) ((f + MAX_EXP)
971 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
972 // 'g' is the gradient multiplied by the learning rate
973 g = (1 - vocab[word].code[d] - f) * alpha;
974 // Propagate errors output -> hidden
975 for (c = 0; c < layer1_size; c++)
976 neu1e[c] += g * syn1[c + l2];
977 // Learn weights hidden -> output
978 for (c = 0; c < layer1_size; c++)
979 syn1[c + l2] += g * neu1[c];
980 if (cap == 1)
981 for (c = 0; c < layer1_size; c++)
982 capParam(syn1, c + l2);
983 }
984 // NEGATIVE SAMPLING
985 if (negative > 0)
986 for (d = 0; d < negative + 1; d++) {
987 if (d == 0) {
988 target = word;
989 label = 1;
990 } else {
991 next_random = next_random
992 * (unsigned long long) 25214903917 + 11;
993 if (word_to_group != NULL
994 && word_to_group[word] != -1) {
995 target = word;
996 while (target == word) {
997 target = group_to_table[word_to_group[word]
998 * table_size
999 + (next_random >> 16) % table_size];
1000 next_random = next_random
1001 * (unsigned long long) 25214903917
1002 + 11;
1003 }
1004 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1005 } else {
1006 target =
1007 table[(next_random >> 16) % table_size];
1008 }
1009 if (target == 0)
1010 target = next_random % (vocab_size - 1) + 1;
1011 if (target == word)
1012 continue;
1013 label = 0;
1014 }
1015 l2 = target * layer1_size;
1016 f = 0;
1017 for (c = 0; c < layer1_size; c++)
1018 f += neu1[c] * syn1neg[c + l2];
1019 if (f > MAX_EXP)
1020 g = (label - 1) * alpha;
1021 else if (f < -MAX_EXP)
1022 g = (label - 0) * alpha;
1023 else
1024 g = (label
1025 - expTable[(int) ((f + MAX_EXP)
1026 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1027 * alpha;
1028 for (c = 0; c < layer1_size; c++)
1029 neu1e[c] += g * syn1neg[c + l2];
1030 for (c = 0; c < layer1_size; c++)
1031 syn1neg[c + l2] += g * neu1[c];
1032 if (cap == 1)
1033 for (c = 0; c < layer1_size; c++)
1034 capParam(syn1neg, c + l2);
1035 }
1036 // Noise Contrastive Estimation
1037 if (nce > 0)
1038 for (d = 0; d < nce + 1; d++) {
1039 if (d == 0) {
1040 target = word;
1041 label = 1;
1042 } else {
1043 next_random = next_random
1044 * (unsigned long long) 25214903917 + 11;
1045 if (word_to_group != NULL
1046 && word_to_group[word] != -1) {
1047 target = word;
1048 while (target == word) {
1049 target = group_to_table[word_to_group[word]
1050 * table_size
1051 + (next_random >> 16) % table_size];
1052 next_random = next_random
1053 * (unsigned long long) 25214903917
1054 + 11;
1055 }
1056 } else {
1057 target =
1058 table[(next_random >> 16) % table_size];
1059 }
1060 if (target == 0)
1061 target = next_random % (vocab_size - 1) + 1;
1062 if (target == word)
1063 continue;
1064 label = 0;
1065 }
1066 l2 = target * layer1_size;
1067 f = 0;
1068
1069 for (c = 0; c < layer1_size; c++)
1070 f += neu1[c] * syn1nce[c + l2];
1071 if (f > MAX_EXP)
1072 g = (label - 1) * alpha;
1073 else if (f < -MAX_EXP)
1074 g = (label - 0) * alpha;
1075 else {
1076 f = exp(f);
1077 g =
1078 (label
1079 - f
1080 / (noise_distribution[target]
1081 * nce + f)) * alpha;
1082 }
1083 for (c = 0; c < layer1_size; c++)
1084 neu1e[c] += g * syn1nce[c + l2];
1085 for (c = 0; c < layer1_size; c++)
1086 syn1nce[c + l2] += g * neu1[c];
1087 if (cap == 1)
1088 for (c = 0; c < layer1_size; c++)
1089 capParam(syn1nce, c + l2);
1090 }
1091 // hidden -> in
1092 for (a = b; a < window * 2 + 1 - b; a++)
1093 if (a != window) {
1094 c = sentence_position - window + a;
1095 if (c < 0)
1096 continue;
1097 if (c >= sentence_length)
1098 continue;
1099 last_word = sen[c];
1100 if (last_word == -1)
1101 continue;
1102 for (c = 0; c < layer1_size; c++)
1103 syn0[c + last_word * layer1_size] += neu1e[c];
1104 }
1105 }
1106 } else if (type == 1) { //train skip-gram
1107 for (a = b; a < window * 2 + 1 - b; a++)
1108 if (a != window) {
1109 c = sentence_position - window + a;
1110 if (c < 0)
1111 continue;
1112 if (c >= sentence_length)
1113 continue;
1114 last_word = sen[c];
1115 if (last_word == -1)
1116 continue;
1117 l1 = last_word * layer1_size;
1118 for (c = 0; c < layer1_size; c++)
1119 neu1e[c] = 0;
1120 // HIERARCHICAL SOFTMAX
1121 if (hs)
1122 for (d = 0; d < vocab[word].codelen; d++) {
1123 f = 0;
1124 l2 = vocab[word].point[d] * layer1_size;
1125 // Propagate hidden -> output
1126 for (c = 0; c < layer1_size; c++)
1127 f += syn0[c + l1] * syn1[c + l2];
1128 if (f <= -MAX_EXP)
1129 continue;
1130 else if (f >= MAX_EXP)
1131 continue;
1132 else
1133 f = expTable[(int) ((f + MAX_EXP)
1134 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1135 // 'g' is the gradient multiplied by the learning rate
1136 g = (1 - vocab[word].code[d] - f) * alpha;
1137 // Propagate errors output -> hidden
1138 for (c = 0; c < layer1_size; c++)
1139 neu1e[c] += g * syn1[c + l2];
1140 // Learn weights hidden -> output
1141 for (c = 0; c < layer1_size; c++)
1142 syn1[c + l2] += g * syn0[c + l1];
1143 if (cap == 1)
1144 for (c = 0; c < layer1_size; c++)
1145 capParam(syn1, c + l2);
1146 }
1147 // NEGATIVE SAMPLING
1148 if (negative > 0)
1149 for (d = 0; d < negative + 1; d++) {
1150 if (d == 0) {
1151 target = word;
1152 label = 1;
1153 } else {
1154 next_random = next_random
1155 * (unsigned long long) 25214903917 + 11;
1156 if (word_to_group != NULL
1157 && word_to_group[word] != -1) {
1158 target = word;
1159 while (target == word) {
1160 target =
1161 group_to_table[word_to_group[word]
1162 * table_size
1163 + (next_random >> 16)
1164 % table_size];
1165 next_random =
1166 next_random
1167 * (unsigned long long) 25214903917
1168 + 11;
1169 }
1170 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1171 } else {
1172 target = table[(next_random >> 16)
1173 % table_size];
1174 }
1175 if (target == 0)
1176 target = next_random % (vocab_size - 1) + 1;
1177 if (target == word)
1178 continue;
1179 label = 0;
1180 }
1181 l2 = target * layer1_size;
1182 f = 0;
1183 for (c = 0; c < layer1_size; c++)
1184 f += syn0[c + l1] * syn1neg[c + l2];
1185 if (f > MAX_EXP)
1186 g = (label - 1) * alpha;
1187 else if (f < -MAX_EXP)
1188 g = (label - 0) * alpha;
1189 else
1190 g =
1191 (label
1192 - expTable[(int) ((f + MAX_EXP)
1193 * (EXP_TABLE_SIZE
1194 / MAX_EXP / 2))])
1195 * alpha;
1196 for (c = 0; c < layer1_size; c++)
1197 neu1e[c] += g * syn1neg[c + l2];
1198 for (c = 0; c < layer1_size; c++)
1199 syn1neg[c + l2] += g * syn0[c + l1];
1200 if (cap == 1)
1201 for (c = 0; c < layer1_size; c++)
1202 capParam(syn1neg, c + l2);
1203 }
1204 //Noise Contrastive Estimation
1205 if (nce > 0)
1206 for (d = 0; d < nce + 1; d++) {
1207 if (d == 0) {
1208 target = word;
1209 label = 1;
1210 } else {
1211 next_random = next_random
1212 * (unsigned long long) 25214903917 + 11;
1213 if (word_to_group != NULL
1214 && word_to_group[word] != -1) {
1215 target = word;
1216 while (target == word) {
1217 target =
1218 group_to_table[word_to_group[word]
1219 * table_size
1220 + (next_random >> 16)
1221 % table_size];
1222 next_random =
1223 next_random
1224 * (unsigned long long) 25214903917
1225 + 11;
1226 }
1227 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1228 } else {
1229 target = table[(next_random >> 16)
1230 % table_size];
1231 }
1232 if (target == 0)
1233 target = next_random % (vocab_size - 1) + 1;
1234 if (target == word)
1235 continue;
1236 label = 0;
1237 }
1238 l2 = target * layer1_size;
1239 f = 0;
1240 for (c = 0; c < layer1_size; c++)
1241 f += syn0[c + l1] * syn1nce[c + l2];
1242 if (f > MAX_EXP)
1243 g = (label - 1) * alpha;
1244 else if (f < -MAX_EXP)
1245 g = (label - 0) * alpha;
1246 else {
1247 f = exp(f);
1248 g = (label
1249 - f
1250 / (noise_distribution[target]
1251 * nce + f)) * alpha;
1252 }
1253 for (c = 0; c < layer1_size; c++)
1254 neu1e[c] += g * syn1nce[c + l2];
1255 for (c = 0; c < layer1_size; c++)
1256 syn1nce[c + l2] += g * syn0[c + l1];
1257 if (cap == 1)
1258 for (c = 0; c < layer1_size; c++)
1259 capParam(syn1nce, c + l2);
1260 }
1261 // Learn weights input -> hidden
1262 for (c = 0; c < layer1_size; c++)
1263 syn0[c + l1] += neu1e[c];
1264 }
1265 } else if (type == 2) { //train the cwindow architecture
1266 // in -> hidden
1267 cw = 0;
1268 for (a = 0; a < window * 2 + 1; a++)
1269 if (a != window) {
1270 c = sentence_position - window + a;
1271 if (c < 0)
1272 continue;
1273 if (c >= sentence_length)
1274 continue;
1275 last_word = sen[c];
1276 if (last_word == -1)
1277 continue;
1278 window_offset = a * layer1_size;
1279 if (a > window)
1280 window_offset -= layer1_size;
1281 for (c = 0; c < layer1_size; c++)
1282 neu1[c + window_offset] += syn0[c
1283 + last_word * layer1_size];
1284 cw++;
1285 }
1286 if (cw) {
1287 if (hs)
1288 for (d = 0; d < vocab[word].codelen; d++) {
1289 f = 0;
1290 l2 = vocab[word].point[d] * window_layer_size;
1291 // Propagate hidden -> output
1292 for (c = 0; c < window_layer_size; c++)
1293 f += neu1[c] * syn1_window[c + l2];
1294 if (f <= -MAX_EXP)
1295 continue;
1296 else if (f >= MAX_EXP)
1297 continue;
1298 else
1299 f = expTable[(int) ((f + MAX_EXP)
1300 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1301 // 'g' is the gradient multiplied by the learning rate
1302 g = (1 - vocab[word].code[d] - f) * alpha;
1303 // Propagate errors output -> hidden
1304 for (c = 0; c < window_layer_size; c++)
1305 neu1e[c] += g * syn1_window[c + l2];
1306 // Learn weights hidden -> output
1307 for (c = 0; c < window_layer_size; c++)
1308 syn1_window[c + l2] += g * neu1[c];
1309 if (cap == 1)
1310 for (c = 0; c < window_layer_size; c++)
1311 capParam(syn1_window, c + l2);
1312 }
1313 // NEGATIVE SAMPLING
1314 if (negative > 0)
1315 for (d = 0; d < negative + 1; d++) {
1316 if (d == 0) {
1317 target = word;
1318 label = 1;
1319 } else {
1320 next_random = next_random
1321 * (unsigned long long) 25214903917 + 11;
1322 if (word_to_group != NULL
1323 && word_to_group[word] != -1) {
1324 target = word;
1325 while (target == word) {
1326 target = group_to_table[word_to_group[word]
1327 * table_size
1328 + (next_random >> 16) % table_size];
1329 next_random = next_random
1330 * (unsigned long long) 25214903917
1331 + 11;
1332 }
1333 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1334 } else {
1335 target =
1336 table[(next_random >> 16) % table_size];
1337 }
1338 if (target == 0)
1339 target = next_random % (vocab_size - 1) + 1;
1340 if (target == word)
1341 continue;
1342 label = 0;
1343 }
1344 l2 = target * window_layer_size;
1345 f = 0;
1346 for (c = 0; c < window_layer_size; c++)
1347 f += neu1[c] * syn1neg_window[c + l2];
1348 if (f > MAX_EXP)
1349 g = (label - 1) * alpha;
1350 else if (f < -MAX_EXP)
1351 g = (label - 0) * alpha;
1352 else
1353 g = (label
1354 - expTable[(int) ((f + MAX_EXP)
1355 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1356 * alpha;
1357 for (c = 0; c < window_layer_size; c++)
1358 neu1e[c] += g * syn1neg_window[c + l2];
1359 for (c = 0; c < window_layer_size; c++)
1360 syn1neg_window[c + l2] += g * neu1[c];
1361 if (cap == 1)
1362 for (c = 0; c < window_layer_size; c++)
1363 capParam(syn1neg_window, c + l2);
1364 }
1365 // Noise Contrastive Estimation
1366 if (nce > 0)
1367 for (d = 0; d < nce + 1; d++) {
1368 if (d == 0) {
1369 target = word;
1370 label = 1;
1371 } else {
1372 next_random = next_random
1373 * (unsigned long long) 25214903917 + 11;
1374 if (word_to_group != NULL
1375 && word_to_group[word] != -1) {
1376 target = word;
1377 while (target == word) {
1378 target = group_to_table[word_to_group[word]
1379 * table_size
1380 + (next_random >> 16) % table_size];
1381 next_random = next_random
1382 * (unsigned long long) 25214903917
1383 + 11;
1384 }
1385 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1386 } else {
1387 target =
1388 table[(next_random >> 16) % table_size];
1389 }
1390 if (target == 0)
1391 target = next_random % (vocab_size - 1) + 1;
1392 if (target == word)
1393 continue;
1394 label = 0;
1395 }
1396 l2 = target * window_layer_size;
1397 f = 0;
1398 for (c = 0; c < window_layer_size; c++)
1399 f += neu1[c] * syn1nce_window[c + l2];
1400 if (f > MAX_EXP)
1401 g = (label - 1) * alpha;
1402 else if (f < -MAX_EXP)
1403 g = (label - 0) * alpha;
1404 else {
1405 f = exp(f);
1406 g =
1407 (label
1408 - f
1409 / (noise_distribution[target]
1410 * nce + f)) * alpha;
1411 }
1412 for (c = 0; c < window_layer_size; c++)
1413 neu1e[c] += g * syn1nce_window[c + l2];
1414 for (c = 0; c < window_layer_size; c++)
1415 syn1nce_window[c + l2] += g * neu1[c];
1416 if (cap == 1)
1417 for (c = 0; c < window_layer_size; c++)
1418 capParam(syn1nce_window, c + l2);
1419 }
1420 // hidden -> in
1421 for (a = 0; a < window * 2 + 1; a++)
1422 if (a != window) {
1423 c = sentence_position - window + a;
1424 if (c < 0)
1425 continue;
1426 if (c >= sentence_length)
1427 continue;
1428 last_word = sen[c];
1429 if (last_word == -1)
1430 continue;
1431 window_offset = a * layer1_size;
1432 if (a > window)
1433 window_offset -= layer1_size;
1434 for (c = 0; c < layer1_size; c++)
1435 syn0[c + last_word * layer1_size] += neu1e[c
1436 + window_offset];
1437 }
1438 }
1439 } else if (type == 3) { //train structured skip-gram
1440 for (a = 0; a < window * 2 + 1; a++)
1441 if (a != window) {
1442 c = sentence_position - window + a;
1443 if (c < 0)
1444 continue;
1445 if (c >= sentence_length)
1446 continue;
1447 last_word = sen[c];
1448 if (last_word < 0)
1449 continue;
1450 l1 = last_word * layer1_size;
1451 window_offset = a * layer1_size;
1452 if (a > window)
1453 window_offset -= layer1_size;
1454 for (c = 0; c < layer1_size; c++)
1455 neu1e[c] = 0;
1456 // HIERARCHICAL SOFTMAX
1457 if (hs)
1458 for (d = 0; d < vocab[word].codelen; d++) {
1459 f = 0;
1460 l2 = vocab[word].point[d] * window_layer_size;
1461 // Propagate hidden -> output
1462 for (c = 0; c < layer1_size; c++)
1463 f += syn0[c + l1]
1464 * syn1_window[c + l2 + window_offset];
1465 if (f <= -MAX_EXP)
1466 continue;
1467 else if (f >= MAX_EXP)
1468 continue;
1469 else
1470 f = expTable[(int) ((f + MAX_EXP)
1471 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1472 // 'g' is the gradient multiplied by the learning rate
1473 g = (1 - vocab[word].code[d] - f) * alpha;
1474 // Propagate errors output -> hidden
1475 for (c = 0; c < layer1_size; c++)
1476 neu1e[c] += g
1477 * syn1_window[c + l2 + window_offset];
1478 // Learn weights hidden -> output
1479 for (c = 0; c < layer1_size; c++)
1480 syn1[c + l2 + window_offset] += g
1481 * syn0[c + l1];
1482 if (cap == 1)
1483 for (c = 0; c < layer1_size; c++)
1484 capParam(syn1, c + l2 + window_offset);
1485 }
1486 // NEGATIVE SAMPLING
1487 if (negative > 0)
1488 for (d = 0; d < negative + 1; d++) {
1489 if (d == 0) {
1490 target = word;
1491 label = 1;
1492 } else {
1493 next_random = next_random
1494 * (unsigned long long) 25214903917 + 11;
1495 if (word_to_group != NULL
1496 && word_to_group[word] != -1) {
1497 target = word;
1498 while (target == word) {
1499 target =
1500 group_to_table[word_to_group[word]
1501 * table_size
1502 + (next_random >> 16)
1503 % table_size];
1504 next_random =
1505 next_random
1506 * (unsigned long long) 25214903917
1507 + 11;
1508 }
1509 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1510 } else {
1511 target = table[(next_random >> 16)
1512 % table_size];
1513 }
1514 if (target == 0)
1515 target = next_random % (vocab_size - 1) + 1;
1516 if (target == word)
1517 continue;
1518 label = 0;
1519 }
1520 l2 = target * window_layer_size;
1521 f = 0;
1522 for (c = 0; c < layer1_size; c++)
1523 f +=
1524 syn0[c + l1]
1525 * syn1neg_window[c + l2
1526 + window_offset];
1527 if (f > MAX_EXP)
1528 g = (label - 1) * alpha;
1529 else if (f < -MAX_EXP)
1530 g = (label - 0) * alpha;
1531 else
1532 g =
1533 (label
1534 - expTable[(int) ((f + MAX_EXP)
1535 * (EXP_TABLE_SIZE
1536 / MAX_EXP / 2))])
1537 * alpha;
1538 if(debug_mode > 2 && ((long long) id) == 0) {
1539 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1540 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1541 }
1542 for (c = 0; c < layer1_size; c++)
1543 neu1e[c] +=
1544 g
1545 * syn1neg_window[c + l2
1546 + window_offset];
1547 for (c = 0; c < layer1_size; c++)
1548 syn1neg_window[c + l2 + window_offset] += g
1549 * syn0[c + l1];
1550 if (cap == 1)
1551 for (c = 0; c < layer1_size; c++)
1552 capParam(syn1neg_window,
1553 c + l2 + window_offset);
1554 }
1555     // Noise Contrastive Estimation
1556 if (nce > 0)
1557 for (d = 0; d < nce + 1; d++) {
1558 if (d == 0) {
1559 target = word;
1560 label = 1;
1561 } else {
1562 next_random = next_random
1563 * (unsigned long long) 25214903917 + 11;
1564 if (word_to_group != NULL
1565 && word_to_group[word] != -1) {
1566 target = word;
1567 while (target == word) {
1568 target =
1569 group_to_table[word_to_group[word]
1570 * table_size
1571 + (next_random >> 16)
1572 % table_size];
1573 next_random =
1574 next_random
1575 * (unsigned long long) 25214903917
1576 + 11;
1577 }
1578 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1579 } else {
1580 target = table[(next_random >> 16)
1581 % table_size];
1582 }
1583 if (target == 0)
1584 target = next_random % (vocab_size - 1) + 1;
1585 if (target == word)
1586 continue;
1587 label = 0;
1588 }
1589 l2 = target * window_layer_size;
1590 f = 0;
1591 for (c = 0; c < layer1_size; c++)
1592 f +=
1593 syn0[c + l1]
1594 * syn1nce_window[c + l2
1595 + window_offset];
1596 if (f > MAX_EXP)
1597 g = (label - 1) * alpha;
1598 else if (f < -MAX_EXP)
1599 g = (label - 0) * alpha;
1600 else {
1601 f = exp(f);
1602 g = (label
1603 - f
1604 / (noise_distribution[target]
1605 * nce + f)) * alpha;
1606 }
1607 for (c = 0; c < layer1_size; c++)
1608 neu1e[c] +=
1609 g
1610 * syn1nce_window[c + l2
1611 + window_offset];
1612 for (c = 0; c < layer1_size; c++)
1613 syn1nce_window[c + l2 + window_offset] += g
1614 * syn0[c + l1];
1615 if (cap == 1)
1616 for (c = 0; c < layer1_size; c++)
1617 capParam(syn1nce_window,
1618 c + l2 + window_offset);
1619 }
1620 // Learn weights input -> hidden
1621 for (c = 0; c < layer1_size; c++) {
1622 syn0[c + l1] += neu1e[c];
1623 if (syn0[c + l1] > 50)
1624 syn0[c + l1] = 50;
1625 if (syn0[c + l1] < -50)
1626 syn0[c + l1] = -50;
1627 }
1628 }
1629 } else if (type == 4) { //training senna
1630 // in -> hidden
1631 cw = 0;
1632 for (a = 0; a < window * 2 + 1; a++)
1633 if (a != window) {
1634 c = sentence_position - window + a;
1635 if (c < 0)
1636 continue;
1637 if (c >= sentence_length)
1638 continue;
1639 last_word = sen[c];
1640 if (last_word == -1)
1641 continue;
1642 window_offset = a * layer1_size;
1643 if (a > window)
1644 window_offset -= layer1_size;
1645 for (c = 0; c < layer1_size; c++)
1646 neu1[c + window_offset] += syn0[c
1647 + last_word * layer1_size];
1648 cw++;
1649 }
1650 if (cw) {
1651 for (a = 0; a < window_hidden_size; a++) {
1652 c = a * window_layer_size;
1653 for (b = 0; b < window_layer_size; b++) {
1654 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1655 }
1656 }
1657 if (hs)
1658 for (d = 0; d < vocab[word].codelen; d++) {
1659 f = 0;
1660 l2 = vocab[word].point[d] * window_hidden_size;
1661 // Propagate hidden -> output
1662 for (c = 0; c < window_hidden_size; c++)
1663 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1664 if (f <= -MAX_EXP)
1665 continue;
1666 else if (f >= MAX_EXP)
1667 continue;
1668 else
1669 f = expTable[(int) ((f + MAX_EXP)
1670 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1671 // 'g' is the gradient multiplied by the learning rate
1672 g = (1 - vocab[word].code[d] - f) * alpha;
1673 // Propagate errors output -> hidden
1674 for (c = 0; c < window_hidden_size; c++)
1675 neu2e[c] += dHardTanh(neu2[c], g) * g
1676 * syn_hidden_word[c + l2];
1677 // Learn weights hidden -> output
1678 for (c = 0; c < window_hidden_size; c++)
1679 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1680 * neu2[c];
1681 }
1682 // NEGATIVE SAMPLING
1683 if (negative > 0)
1684 for (d = 0; d < negative + 1; d++) {
1685 if (d == 0) {
1686 target = word;
1687 label = 1;
1688 } else {
1689 next_random = next_random
1690 * (unsigned long long) 25214903917 + 11;
1691 if (word_to_group != NULL
1692 && word_to_group[word] != -1) {
1693 target = word;
1694 while (target == word) {
1695 target = group_to_table[word_to_group[word]
1696 * table_size
1697 + (next_random >> 16) % table_size];
1698 next_random = next_random
1699 * (unsigned long long) 25214903917
1700 + 11;
1701 }
1702 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1703 } else {
1704 target =
1705 table[(next_random >> 16) % table_size];
1706 }
1707 if (target == 0)
1708 target = next_random % (vocab_size - 1) + 1;
1709 if (target == word)
1710 continue;
1711 label = 0;
1712 }
1713 l2 = target * window_hidden_size;
1714 f = 0;
1715 for (c = 0; c < window_hidden_size; c++)
1716 f += hardTanh(neu2[c])
1717 * syn_hidden_word_neg[c + l2];
1718 if (f > MAX_EXP)
1719 g = (label - 1) * alpha / negative;
1720 else if (f < -MAX_EXP)
1721 g = (label - 0) * alpha / negative;
1722 else
1723 g = (label
1724 - expTable[(int) ((f + MAX_EXP)
1725 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1726 * alpha / negative;
1727 for (c = 0; c < window_hidden_size; c++)
1728 neu2e[c] += dHardTanh(neu2[c], g) * g
1729 * syn_hidden_word_neg[c + l2];
1730 for (c = 0; c < window_hidden_size; c++)
1731 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1732 * g * neu2[c];
1733 }
1734 for (a = 0; a < window_hidden_size; a++)
1735 for (b = 0; b < window_layer_size; b++)
1736 neu1e[b] += neu2e[a]
1737 * syn_window_hidden[a * window_layer_size + b];
1738 for (a = 0; a < window_hidden_size; a++)
1739 for (b = 0; b < window_layer_size; b++)
1740 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1741 * neu1[b];
1742 // hidden -> in
1743 for (a = 0; a < window * 2 + 1; a++)
1744 if (a != window) {
1745 c = sentence_position - window + a;
1746 if (c < 0)
1747 continue;
1748 if (c >= sentence_length)
1749 continue;
1750 last_word = sen[c];
1751 if (last_word == -1)
1752 continue;
1753 window_offset = a * layer1_size;
1754 if (a > window)
1755 window_offset -= layer1_size;
1756 for (c = 0; c < layer1_size; c++)
1757 syn0[c + last_word * layer1_size] += neu1e[c
1758 + window_offset];
1759 }
1760 }
1761 } else if(type == 5) {
1762 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1763 c = sentence_position - window + a;
1764 if (c < 0) continue;
1765 if (c >= sentence_length) continue;
1766 last_word = sen[c];
1767 if (last_word == -1) continue;
1768 inc_collocator(cdb, word, last_word, a - window);
1769 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1770 // cw++;
1771 }
1772 } else {
1773 printf("unknown type %i", type);
1774 exit(0);
1775 }
1776 sentence_position++;
1777 if (sentence_position >= sentence_length) {
1778 sentence_length = 0;
1779 continue;
1780 }
1781 }
1782 fclose(fi);
1783 free(neu1);
1784 free(neu1e);
1785 threadPos[(long) id] = -1;
1786 pthread_exit(NULL);
1787}
1788
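// Diagnostic dump of the position-dependent output weights (syn1neg_window, types 2/3): for every
// word from index cc on, print the best-responding collocate at each window position, the collocate
// with the highest accumulated response, the single strongest response, and the top 10 responses
// together with their window positions.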
1789void ShowCollocations() {
1790 long a, b, c, d, e, window_offset, target, max_target = 0, maxmax_target;
1791 real f, max_f, maxmax_f;
1792 real *target_sums, bestf[MAX_CC], worstbest;
1793 long besti[MAX_CC];
1794 int N = 10, bestp[MAX_CC];
1795 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1796
1797 for (d = cc; d < vocab_size; d++) {
1798 for (b = 0; b < vocab_size; b++)
1799 target_sums[b] = 0;
1800 for (b = 0; b < N; b++)
1801 bestf[b] = -1;
1802 worstbest = -1;
1803
1804 maxmax_f = -1;
1805 maxmax_target = 0;
1806  for (a = window * 2; a >= 0; a--) {
1807 if (a != window) {
1808 max_f = -1;
1809 window_offset = a * layer1_size;
1810 if (a > window)
1811 window_offset -= layer1_size;
1812 for(target = 0; target < vocab_size; target ++) {
1813 if(target == d)
1814 continue;
1815 f = 0;
1816 for (c = 0; c < layer1_size; c++)
1817 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1818 if (f < -MAX_EXP)
1819 continue;
1820 else if (f > MAX_EXP)
1821 continue;
1822 else
1823 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1824 if(f > max_f) {
1825 max_f = f;
1826 max_target = target;
1827 }
1828 target_sums[target] += (1-target_sums[target]) * f;
1829 if(f > worstbest) {
1830 for (b = 0; b < N; b++) {
1831 if (f > bestf[b]) {
1832 for (e = N - 1; e > b; e--) {
1833 bestf[e] = bestf[e - 1];
1834 besti[e] = besti[e - 1];
1835 bestp[e] = bestp[e - 1];
1836 }
1837 bestf[b] = f;
1838 besti[b] = target;
1839 bestp[b] = window-a;
1840 break;
1841 }
1842 }
1843 worstbest = bestf[N - 1];
1844 }
1845 }
1846 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1847 if (max_f > maxmax_f) {
1848 maxmax_f = max_f;
1849 maxmax_target = max_target;
1850 }
1851 } else {
1852 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1853 }
1854 }
1855 max_f = -1;
1856 for (b = 0; b < vocab_size; b++) {
1857 if (target_sums[b] > max_f) {
1858 max_f = target_sums[b];
1859 max_target = b;
1860 }
1861 }
1862 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1863 vocab[max_target].word, max_f, vocab[maxmax_target].word,
1864 maxmax_f);
1865 for (b = 0; b < N && bestf[b] > -1; b++)
1866 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1867 printf("\n");
1868 }
1869}
1870
1871void TrainModel() {
1872 long a, b, c, d;
1873 FILE *fo;
1874 pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // +1 slot for the monitor thread
1875 threadPos = malloc(num_threads * sizeof(long long));
1876 threadIters = malloc(num_threads * sizeof(int));
1877 char *timebuf = malloc(80);
1878 printf("Starting training using file %s\n", train_file);
1879 starting_alpha = alpha;
1880 if (read_vocab_file[0] != 0)
1881 ReadVocab();
1882 else
1883 LearnVocabFromTrainFile();
1884 if (save_vocab_file[0] != 0)
1885 SaveVocab();
1886 if (output_file[0] == 0)
1887 return;
1888 InitNet();
1889 if (cc > 0)
1890 ShowCollocations();
1891 if (negative > 0 || nce > 0)
1892 InitUnigramTable();
1893 if (negative_classes_file[0] != 0)
1894 InitClassUnigramTable();
1895 start = time(NULL);
1896 start_clock = clock();
1897 for (a = 0; a < num_threads; a++)
1898 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1899 if(debug_mode > 1)
1900 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
1901 for (a = 0; a < num_threads; a++)
1902 pthread_join(pt[a], NULL);
1903 if(debug_mode > 1) {
1904 pthread_join(pt[num_threads], NULL);
1905 clock_t now = time(NULL);
1906 clock_t now_clock = clock();
1907 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
1908  if(type == 5) // don't save vectors for classic collocators
1909 return;
1910 printf("Saving vectors to %s ...", output_file);
1911 fflush(stdout);
1912 }
1913 fo = fopen(output_file, "wb");
1914 if (classes == 0) {
1915 // Save the word vectors
1916 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1917 for (a = 0; a < vocab_size; a++) {
1918 fprintf(fo, "%s ", vocab[a].word);
1919 if (binary)
1920 for (b = 0; b < layer1_size; b++)
1921 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1922 else
1923 for (b = 0; b < layer1_size; b++)
1924 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1925 fprintf(fo, "\n");
1926 }
1927 if(debug_mode > 1)
1928 fprintf(stderr, "\n");
1929 } else {
1930 // Run K-means on the word vectors
1931 int clcn = classes, iter = 10, closeid;
1932 int *centcn = (int *) malloc(classes * sizeof(int));
1933 int *cl = (int *) calloc(vocab_size, sizeof(int));
1934 real closev, x;
1935 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1936 for (a = 0; a < vocab_size; a++)
1937 cl[a] = a % clcn;
1938 for (a = 0; a < iter; a++) {
1939 for (b = 0; b < clcn * layer1_size; b++)
1940 cent[b] = 0;
1941 for (b = 0; b < clcn; b++)
1942 centcn[b] = 1;
1943 for (c = 0; c < vocab_size; c++) {
1944 for (d = 0; d < layer1_size; d++)
1945 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1946 centcn[cl[c]]++;
1947 }
1948 for (b = 0; b < clcn; b++) {
1949 closev = 0;
1950 for (c = 0; c < layer1_size; c++) {
1951 cent[layer1_size * b + c] /= centcn[b];
1952 closev += cent[layer1_size * b + c]
1953 * cent[layer1_size * b + c];
1954 }
1955 closev = sqrt(closev);
1956 for (c = 0; c < layer1_size; c++)
1957 cent[layer1_size * b + c] /= closev;
1958 }
1959 for (c = 0; c < vocab_size; c++) {
1960 closev = -10;
1961 closeid = 0;
1962 for (d = 0; d < clcn; d++) {
1963 x = 0;
1964 for (b = 0; b < layer1_size; b++)
1965 x += cent[layer1_size * d + b]
1966 * syn0[c * layer1_size + b];
1967 if (x > closev) {
1968 closev = x;
1969 closeid = d;
1970 }
1971 }
1972 cl[c] = closeid;
1973 }
1974 }
1975 // Save the K-means classes
1976 for (a = 0; a < vocab_size; a++)
1977 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1978 free(centcn);
1979 free(cent);
1980 free(cl);
1981 }
1982 fclose(fo);
1983 if (save_net_file[0] != 0)
1984 SaveNet();
1985}
1986
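// ArgPos returns the argv index of option `str`, or -1 if the option is
// absent; it exits with an error if the option is the last token and thus
// has no value. Typical use:
//   if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
//     layer1_size = atoi(argv[i + 1]);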
1987int ArgPos(char *str, int argc, char **argv) {
1988 int a;
1989 for (a = 1; a < argc; a++)
1990 if (!strcmp(str, argv[a])) {
1991 if (a == argc - 1) {
1992 printf("Argument missing for %s\n", str);
1993 exit(1);
1994 }
1995 return a;
1996 }
1997 return -1;
1998}
1999
2000void print_help() {
2001 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
2002 printf("Options:\n");
2003 printf("Parameters for training:\n");
2004 printf("\t-train <file>\n");
2005 printf("\t\tUse text data from <file> to train the model\n");
2006 printf("\t-output <file>\n");
2007 printf(
2008 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
2009 printf("\t-size <int>\n");
2010 printf("\t\tSet size of word vectors; default is 100\n");
2011 printf("\t-window <int>\n");
2012 printf("\t\tSet max skip length between words; default is 5\n");
2013 printf("\t-sample <float>\n");
2014 printf(
2015 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
2016 printf(
2017 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
2018 printf("\t-hs <int>\n");
2019 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
2020 printf("\t-negative <int>\n");
2021 printf(
2022 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
2023 printf("\t-negative-classes <file>\n");
2024 printf("\t\tNegative classes to sample from\n");
2025 printf("\t-nce <int>\n");
2026 printf(
2027 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
2028 printf("\t-threads <int>\n");
2029 printf("\t\tUse <int> threads (default 12)\n");
2030 printf("\t-iter <int>\n");
2031 printf("\t\tRun more training iterations (default 5)\n");
2032 printf("\t-min-count <int>\n");
2033 printf(
2034 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
2035 printf("\t-alpha <float>\n");
2036 printf(
2037 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
2038 printf("\t-classes <int>\n");
2039 printf(
2040 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
2041 printf("\t-debug <int>\n");
2042 printf(
2043 "\t\tSet the debug mode (default = 2 = more info during training)\n");
2044 printf("\t-binary <int>\n");
2045 printf(
2046    "\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
2047 printf("\t-save-vocab <file>\n");
2048 printf("\t\tThe vocabulary will be saved to <file>\n");
2049 printf("\t-read-vocab <file>\n");
2050 printf(
2051 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
2052 printf("\t-train-counts <int>\n");
2053 printf(
2054    "\t\tUse word counts of the actual corpus rather than vocabulary counts; default is 1 (on)\n");
2055 printf("\t-read-net <file>\n");
2056 printf(
2057 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
2058 printf("\t-save-net <file>\n");
2059 printf("\t\tThe net parameters will be saved to <file>\n");
2060 printf("\t-magic-stop-file <file>\n");
2061 printf("\t\tIf the magic file <file> exists, training will stop after the current cycle.\n");
2062 printf("\t-show-cc <int>\n");
2063 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
2064 printf("\t-type <int>\n");
2065 printf(
2066    "\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type, 5 to store positional bigrams)\n");
2067 printf("\t-cap <int>\n");
2068 printf(
2069    "\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
2070 printf("\nExamples:\n");
2071 printf(
2072 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
2073}
2074
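// A hypothetical invocation of the collocation-database mode (-type 5);
// the file names are illustrative only:
//   ./word2vec -train corpus.txt -read-vocab vocab.txt -output colloc.db \
//              -type 5 -window 5 -threads 12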
2075int main(int argc, char **argv) {
2076 int i;
2077 setlocale(LC_ALL, "");
2078 if (argc == 1) {
2079 print_help();
2080 return 0;
2081 }
2082 output_file[0] = 0;
2083 save_vocab_file[0] = 0;
2084 read_vocab_file[0] = 0;
2085 save_net_file[0] = 0;
2086 read_net_file[0] = 0;
2087 negative_classes_file[0] = 0;
2088 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2089 print_help();
2090 return(0);
2091 }
2092 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2093 print_help();
2094 return(0);
2095 }
2096 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2097 layer1_size = atoi(argv[i + 1]);
2098 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2099 strcpy(train_file, argv[i + 1]);
2100 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2101 strcpy(save_vocab_file, argv[i + 1]);
2102 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2103 strcpy(read_vocab_file, argv[i + 1]);
2104 if ((i = ArgPos((char *) "-train-counts", argc, argv)) > 0)
2105 tc = atoi(argv[i + 1]);
2106 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2107 strcpy(save_net_file, argv[i + 1]);
2108 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2109 strcpy(read_net_file, argv[i + 1]);
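 // The magic stop file provides a graceful shutdown path: it must not exist
 // when training starts, and creating it later presumably makes the training
 // threads finish their current cycle and stop (see -magic-stop-file above).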
2110 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2111 strcpy(magic_stop_file, argv[i + 1]);
2112 if (access(magic_stop_file, F_OK ) != -1) {
2113 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2114 exit(1);
2115 }
2116 }
2117 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2118 debug_mode = atoi(argv[i + 1]);
2119 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2120 binary = atoi(argv[i + 1]);
2121 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2122 cc = atoi(argv[i + 1]);
2123 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2124 type = atoi(argv[i + 1]);
2125 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2126 strcpy(output_file, argv[i + 1]);
2127 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2128 window = atoi(argv[i + 1]);
2129 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2130 sample = atof(argv[i + 1]);
2131 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2132 hs = atoi(argv[i + 1]);
2133 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2134 negative = atoi(argv[i + 1]);
2135 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2136 strcpy(negative_classes_file, argv[i + 1]);
2137 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2138 nce = atoi(argv[i + 1]);
2139 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2140 num_threads = atoi(argv[i + 1]);
2141 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2142 iter = atoi(argv[i + 1]);
2143 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2144 min_count = atoi(argv[i + 1]);
2145 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2146 classes = atoi(argv[i + 1]);
2147 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2148 cap = atoi(argv[i + 1]);
2149 if (type == 0 || type == 2 || type == 4)
2150 alpha = 0.05;
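 // For -type 5 the model is not trained into vectors: subsampling is turned
 // off and output_file is opened as a collocator database
 // (open_collocatordb_for_write), which presumably receives positional
 // co-occurrence counts instead of embeddings.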
2151 if (type==5) {
2152 sample = 0;
2153 cdb = open_collocatordb_for_write(output_file);
2154 }
2155 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2156 alpha = atof(argv[i + 1]);
2157 vocab = (struct vocab_word *) calloc(vocab_max_size,
2158 sizeof(struct vocab_word));
2159 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2160 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
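 // Precompute a sigmoid lookup table: entry i holds sigma(x) for x spaced
 // evenly over [-MAX_EXP, MAX_EXP]. As in the reference word2vec code, a
 // lookup is presumably done as
 //   expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]
 // during training.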
2161 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2162 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2163 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
2164 }
2165 SaveArgs(argc, argv);
2166 TrainModel();
2167 return 0;
2168}
2169