1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <locale.h>
16#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
19#include <unistd.h>
20#include <math.h>
21#include <pthread.h>
22#include <collocatordb.h>
23
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
28#define MAX_CC 100
29#define MAX_CODE_LENGTH 40
30
31const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
32
33typedef float real; // Precision of float numbers
34
35struct vocab_word {
36 long long cn;
37 int *point;
38 char *word, *code, codelen;
39};
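// cn is the word's corpus frequency; point, code and codelen describe its
// path through the Huffman tree (inner-node indices, branch bits, path length),
// filled by CreateBinaryTree() and used only for hierarchical softmax.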
40
41char train_file[MAX_STRING], output_file[MAX_STRING];
42char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
43char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
44char magic_stop_file[MAX_STRING];
45
46struct vocab_word *vocab;
47int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
48 num_threads = 12, min_reduce = 1;
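// type selects the architecture trained in TrainModelThread(): 0 = CBOW,
// 1 = skip-gram, 2 = continuous window (cwindow), 3 = structured skip-gram,
// 4 = SENNA-style window network, 5 = collocation counting only (collocatordb).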
49int *vocab_hash;
50long long *threadPos;
51int *threadIters;
52long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
53long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
54 classes = 0;
55real alpha = 0.025, starting_alpha, sample = 1e-3;
56real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
57real avgWordLength=0;
58clock_t start, start_clock;
59
60real *syn1_window, *syn1neg_window, *syn1nce_window;
61int w_offset, window_layer_size;
62
63int window_hidden_size = 500;
64real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
65 *syn_hidden_word_nce;
66
67int hs = 0, negative = 5;
68const int table_size = 1e8;
69int *table;
70
71 long cc = 0; // if > 0, ShowCollocations() prints collocation profiles starting from this word index
72 long tc = 1; // if > 0, ReadVocab() re-counts word frequencies on the current training file
73
74//contrastive negative sampling
75char negative_classes_file[MAX_STRING];
76int *word_to_group;
77int *group_to_table; //group_size*table_size
78int class_number;
79
80//nce
81real* noise_distribution;
82int nce = 0;
83
84//param caps
85real CAP_VALUE = 50;
86int cap = 0;
87
88COLLOCATORDB *cdb = NULL;
89
90void capParam(real* array, int index) {
91 if (array[index] > CAP_VALUE)
92 array[index] = CAP_VALUE;
93 else if (array[index] < -CAP_VALUE)
94 array[index] = -CAP_VALUE;
95}
96
97real hardTanh(real x) {
98 if (x >= 1) {
99 return 1;
100 } else if (x <= -1) {
101 return -1;
102 } else {
103 return x;
104 }
105}
106
107real dHardTanh(real x, real g) {
108 if (x > 1 && g > 0) {
109 return 0;
110 }
111 if (x < -1 && g < 0) {
112 return 0;
113 }
114 return 1;
115}
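// hardTanh() clips an activation to [-1, 1]; dHardTanh() gates the backward
// pass, returning 0 when the unit is saturated and the gradient would push it
// further out of range, and 1 otherwise. Both are used only by the SENNA-style
// architecture (type 4).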
116
117void InitUnigramTable() {
118 int a, i;
119 long long train_words_pow = 0;
120 real d1, power = 0.75;
121 table = (int *) malloc(table_size * sizeof(int));
122 for (a = 0; a < vocab_size; a++)
123 train_words_pow += pow(vocab[a].cn, power);
124 i = 0;
125 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
126 for (a = 0; a < table_size; a++) {
127 table[a] = i;
128 if (a / (real) table_size > d1) {
129 i++;
130 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
131 }
132 if (i >= vocab_size)
133 i = vocab_size - 1;
134 }
135
136 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
137 for (a = 0; a < vocab_size; a++)
138 noise_distribution[a] = pow(vocab[a].cn, power)
139 / (real) train_words_pow;
140}
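// The unigram table maps table_size slots to word indices so that word w
// occupies a share of slots proportional to cn(w)^0.75. Sampling a uniformly
// random slot therefore draws negatives from the smoothed distribution
// P(w) = cn(w)^0.75 / sum_v cn(v)^0.75; the same distribution is kept
// explicitly in noise_distribution for use by NCE.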
141
142// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
143void ReadWord(char *word, FILE *fin) {
144 int a = 0, ch;
145 while (!feof(fin)) {
146 ch = fgetc(fin);
147 if (ch == 13)
148 continue;
149 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
150 if (a > 0) {
151 if (ch == '\n')
152 ungetc(ch, fin);
153 break;
154 }
155 if (ch == '\n') {
156 strcpy(word, (char *) "</s>");
157 return;
158 } else
159 continue;
160 }
161 word[a] = ch;
162 a++;
163 if (a >= MAX_STRING - 1)
164 a--; // Truncate words that are too long
165 }
166 word[a] = 0;
167}
168
169// Returns hash value of a word
170int GetWordHash(char *word) {
171 unsigned long long a, hash = 0;
172 for (a = 0; a < strlen(word); a++)
173 hash = hash * 257 + word[a];
174 hash = hash % vocab_hash_size;
175 return hash;
176}
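// Polynomial rolling hash (base 257) over the word's bytes, reduced modulo
// vocab_hash_size; collisions are resolved by linear probing in SearchVocab()
// and AddWordToVocab().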
177
178// Returns position of a word in the vocabulary; if the word is not found, returns -1
179int SearchVocab(char *word) {
180 unsigned int hash = GetWordHash(word);
181 while (1) {
182 if (vocab_hash[hash] == -1)
183 return -1;
184 if (!strcmp(word, vocab[vocab_hash[hash]].word))
185 return vocab_hash[hash];
186 hash = (hash + 1) % vocab_hash_size;
187 }
188 return -1;
189}
190
191// Reads a word and returns its index in the vocabulary
192int ReadWordIndex(FILE *fin) {
193 char word[MAX_STRING];
194 ReadWord(word, fin);
195 if (feof(fin))
196 return -1;
197 return SearchVocab(word);
198}
199
200// Adds a word to the vocabulary
201int AddWordToVocab(char *word) {
202 unsigned int hash, length = strlen(word) + 1;
203 if (length > MAX_STRING)
204 length = MAX_STRING;
205 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
206 strcpy(vocab[vocab_size].word, word);
207 vocab[vocab_size].cn = 0;
208 vocab_size++;
209 // Reallocate memory if needed
210 if (vocab_size + 2 >= vocab_max_size) {
211 vocab_max_size += 1000;
212 vocab = (struct vocab_word *) realloc(vocab,
213 vocab_max_size * sizeof(struct vocab_word));
214 }
215 hash = GetWordHash(word);
216 while (vocab_hash[hash] != -1)
217 hash = (hash + 1) % vocab_hash_size;
218 vocab_hash[hash] = vocab_size - 1;
219 return vocab_size - 1;
220}
221
222// Used later for sorting by word counts
223int VocabCompare(const void *a, const void *b) {
224 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
225}
226
227// Sorts the vocabulary by frequency using word counts
228void SortVocab() {
229 int a, size;
230 unsigned int hash;
231 // Sort the vocabulary and keep </s> at the first position
232 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
233 for (a = 0; a < vocab_hash_size; a++)
234 vocab_hash[a] = -1;
235 size = vocab_size;
236 train_words = 0;
237 for (a = 0; a < size; a++) {
238 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
239 // Words occurring less than min_count times will be discarded from the vocab
240 if ((vocab[a].cn < min_count) && (a != 0)) {
241 vocab_size--;
242 free(vocab[a].word);
243 } else {
244 // Hash must be recomputed, as it is no longer valid after sorting
245 hash = GetWordHash(vocab[a].word);
246 while (vocab_hash[hash] != -1)
247 hash = (hash + 1) % vocab_hash_size;
248 vocab_hash[hash] = a;
249 train_words += vocab[a].cn;
250 }
251 }
252 avgWordLength /= train_words;
253 vocab = (struct vocab_word *) realloc(vocab,
254 (vocab_size + 1) * sizeof(struct vocab_word));
255 // Allocate memory for the binary tree construction
256 for (a = 0; a < vocab_size; a++) {
257 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
258 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
259 }
260}
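// avgWordLength ends up as the average number of bytes per token (word plus
// one separator byte) in the counted corpus; ReadVocab() later uses it to
// estimate the number of training words from the file size alone.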
261
262// Reduces the vocabulary by removing infrequent tokens
263void ReduceVocab() {
264 int a, b = 0;
265 unsigned int hash;
266 for (a = 0; a < vocab_size; a++)
267 if (vocab[a].cn > min_reduce) {
268 vocab[b].cn = vocab[a].cn;
269 vocab[b].word = vocab[a].word;
270 b++;
271 } else
272 free(vocab[a].word);
273 vocab_size = b;
274 for (a = 0; a < vocab_hash_size; a++)
275 vocab_hash[a] = -1;
276 for (a = 0; a < vocab_size; a++) {
277 // Hash must be recomputed, as it is no longer valid
278 hash = GetWordHash(vocab[a].word);
279 while (vocab_hash[hash] != -1)
280 hash = (hash + 1) % vocab_hash_size;
281 vocab_hash[hash] = a;
282 }
283 fflush(stdout);
284 min_reduce++;
285}
286
287// Create binary Huffman tree using the word counts
288// Frequent words will have short unique binary codes
289void CreateBinaryTree() {
290 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
291 char code[MAX_CODE_LENGTH];
292 long long *count = (long long *) calloc(vocab_size * 2 + 1,
293 sizeof(long long));
294 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
295 sizeof(long long));
296 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
297 sizeof(long long));
298 // todo: this needs to operate on a sorted copy of vocab[a].cn if we use local counts
299 for (a = 0; a < vocab_size; a++)
300 count[a] = vocab[a].cn;
301 for (a = vocab_size; a < vocab_size * 2; a++)
302 count[a] = 1e15;
303 pos1 = vocab_size - 1;
304 pos2 = vocab_size;
305 // The following algorithm constructs the Huffman tree by adding one node at a time
306 for (a = 0; a < vocab_size - 1; a++) {
307 // First, find two smallest nodes 'min1, min2'
308 if (pos1 >= 0) {
309 if (count[pos1] < count[pos2]) {
310 min1i = pos1;
311 pos1--;
312 } else {
313 min1i = pos2;
314 pos2++;
315 }
316 } else {
317 min1i = pos2;
318 pos2++;
319 }
320 if (pos1 >= 0) {
321 if (count[pos1] < count[pos2]) {
322 min2i = pos1;
323 pos1--;
324 } else {
325 min2i = pos2;
326 pos2++;
327 }
328 } else {
329 min2i = pos2;
330 pos2++;
331 }
332 count[vocab_size + a] = count[min1i] + count[min2i];
333 parent_node[min1i] = vocab_size + a;
334 parent_node[min2i] = vocab_size + a;
335 binary[min2i] = 1;
336 }
337 // Now assign binary code to each vocabulary word
338 for (a = 0; a < vocab_size; a++) {
339 b = a;
340 i = 0;
341 while (1) {
342 code[i] = binary[b];
343 point[i] = b;
344 i++;
345 b = parent_node[b];
346 if (b == vocab_size * 2 - 2)
347 break;
348 }
349 vocab[a].codelen = i;
350 vocab[a].point[0] = vocab_size - 2;
351 for (b = 0; b < i; b++) {
352 vocab[a].code[i - b - 1] = code[b];
353 vocab[a].point[i - b] = point[b] - vocab_size;
354 }
355 }
356 free(count);
357 free(binary);
358 free(parent_node);
359}
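// After this pass, vocab[a].code holds the branch decisions from the root down
// to word a (root-first) and vocab[a].point the inner-node indices along that
// path (offset by vocab_size). Frequent words get short codes, which keeps
// hierarchical-softmax updates cheap.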
360
361void LearnVocabFromTrainFile() {
362 char word[MAX_STRING];
363 FILE *fin;
364 long long a, i;
365 for (a = 0; a < vocab_hash_size; a++)
366 vocab_hash[a] = -1;
367 fin = fopen(train_file, "rb");
368 if (fin == NULL) {
369 printf("ERROR: training data file not found!\n");
370 exit(1);
371 }
372 vocab_size = 0;
373 AddWordToVocab((char *) "</s>");
374 while (1) {
375 ReadWord(word, fin);
376 if (feof(fin))
377 break;
378 train_words++;
379 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
380 printf("%lldK%c", train_words / 1000, 13);
381 fflush(stdout);
382 }
383 i = SearchVocab(word);
384 if (i == -1) {
385 a = AddWordToVocab(word);
386 vocab[a].cn = 1;
387 } else
388 vocab[i].cn++;
389 if (vocab_size > vocab_hash_size * 0.7)
390 ReduceVocab();
391 }
392 SortVocab();
393 if (debug_mode > 0) {
394 printf("Vocab size: %'lld\n", vocab_size);
395 printf("Words in train file: %'lld\n", train_words);
396 }
397 file_size = ftell(fin);
398 fclose(fin);
399}
400
401void SaveVocab() {
402 long long i;
403 FILE *fo = fopen(save_vocab_file, "wb");
404 for (i = 0; i < vocab_size; i++)
405 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
406 fclose(fo);
407}
408
409void ReadVocab() {
410 long long a, i = 0;
411 char c;
412 char word[MAX_STRING];
413 FILE *fin = fopen(read_vocab_file, "rb");
414 if (fin == NULL) {
415 printf("Vocabulary file not found\n");
416 exit(1);
417 }
418 for (a = 0; a < vocab_hash_size; a++)
419 vocab_hash[a] = -1;
420 vocab_size = 0;
421 while (1) {
422 ReadWord(word, fin);
423 if (feof(fin))
424 break;
425 a = AddWordToVocab(word);
426 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
427 i++;
428 }
429 fclose(fin);
430 fin = fopen(train_file, "rb");
431 if (fin == NULL) {
432 printf("ERROR: training data file not found!\n");
433 exit(1);
434 }
435 fseek(fin, 0, SEEK_END);
436 file_size = ftell(fin);
437 fclose(fin);
438 SortVocab();
439 if (debug_mode > 0) {
440 printf("Vocab size: %'lld\n", vocab_size);
441 printf("Words in vocab's train file: %'lld\n", train_words);
442 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
443 }
444 train_words = file_size / avgWordLength;
445 // PF: so even with tc=0, alpha will be appropriately adapted?
446 if(debug_mode > 0)
447 printf("Estimated words in train file: %'lld\n", train_words);
448 if (tc > 0) {
449 // recalculate counts for the current corpus
450 // adapted from LearnVocabFromTrainFile()
451 // note that we don't sort or rehash the vocabulary again, we only adapt vocab[.].cn.
452 fin = fopen(train_file, "rb");
453 if (fin == NULL) {
454 printf("ERROR: training data file not found!\n");
455 exit(1);
456 }
457 // reset vocabulary counts
458 for (a = 0; a < vocab_size; a++)
459 vocab[a].cn = 0;
460 train_words = 0;
461 while (1) {
462 ReadWord(word, fin);
463 if (feof(fin))
464 break;
465 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
466 printf("%lldK%c", train_words / 1000, 13);
467 fflush(stdout);
468 }
469 i = SearchVocab(word);
470 // the word must be in the vocabulary but we don't issue a warning,
471 // because it may have been cut off due to minfreq.
472 if (i >= 0) {
473 vocab[i].cn++;
474 train_words++;
475 }
476 }
477 // we cannot have 0 counts.
478 for (a = 0; a < vocab_size; a++) {
479 if(vocab[a].cn == 0) {
480 vocab[a].cn = 1;
481 train_words++;
482 }
483 }
484 if (debug_mode > 0) {
485 printf("Vocab size: %lld\n", vocab_size);
486 printf("Words in current train file: %'lld\n", train_words);
487 }
488 fseek(fin, 0, SEEK_END);
489 file_size = ftell(fin);
490 fclose(fin);
491 }
492}
493
494void InitClassUnigramTable() {
495 // TODO: this probably needs to be adapted for dealing with subcorpus adjusted vocabulary counts
496 long long a, c;
497 printf("loading class unigrams \n");
498 FILE *fin = fopen(negative_classes_file, "rb");
499 if (fin == NULL) {
500 printf("ERROR: class file not found!\n");
501 exit(1);
502 }
503 word_to_group = (int *) malloc(vocab_size * sizeof(int));
504 for (a = 0; a < vocab_size; a++)
505 word_to_group[a] = -1;
506 char class[MAX_STRING];
507 char prev_class[MAX_STRING];
508 prev_class[0] = 0;
509 char word[MAX_STRING];
510 class_number = -1;
511 while (1) {
512 if (feof(fin))
513 break;
514 ReadWord(class, fin);
515 ReadWord(word, fin);
516 int word_index = SearchVocab(word);
517 if (word_index != -1) {
518 if (strcmp(class, prev_class) != 0) {
519 class_number++;
520 strcpy(prev_class, class);
521 }
522 word_to_group[word_index] = class_number;
523 }
524 ReadWord(word, fin);
525 }
526 class_number++;
527 fclose(fin);
528
529 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
530 long long train_words_pow = 0;
531 real d1, power = 0.75;
532
533 for (c = 0; c < class_number; c++) {
534 long long offset = c * table_size;
535 train_words_pow = 0;
536 for (a = 0; a < vocab_size; a++)
537 if (word_to_group[a] == c)
538 train_words_pow += pow(vocab[a].cn, power);
539 int i = 0;
540 while (word_to_group[i] != c && i < vocab_size)
541 i++;
542 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
543 for (a = 0; a < table_size; a++) {
544 //printf("index %lld , word %d\n", a, i);
545 group_to_table[offset + a] = i;
546 if (a / (real) table_size > d1) {
547 i++;
548 while (word_to_group[i] != c && i < vocab_size)
549 i++;
550 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
551 }
552 if (i >= vocab_size)
553 while (word_to_group[i] != c && i >= 0)
554 i--;
555 }
556 }
557}
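// For contrastive negative sampling with word classes, one unigram table of
// table_size slots is built per class (laid out back to back in group_to_table),
// so that negatives for a word are drawn only from that word's own class.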
558
559void SaveArgs(int argc, char **argv) {
560 unsigned int i;
561 char args_file[MAX_STRING];
562 strcpy(args_file, output_file);
563 strcat(args_file, ".args");
564 FILE *fargs = fopen(args_file, "w");
565 if (fargs == NULL) {
566 printf("Cannot save args to %s.\n", args_file);
567 return;
568 }
569
570 for(i=1; i<argc; i++)
571 fprintf(fargs, "%s ", argv[i]);
572
573 fprintf(fargs, "\n");
574 fclose(fargs);
575
576 return;
577}
578
579void SaveNet() {
580 if (type == 4 || negative <= 0) {
581 fprintf(stderr,
582 "save-net only supported for type 0,1,2,3 with negative sampling\n");
583 return;
584 }
585
586 FILE *fnet = fopen(save_net_file, "wb");
587 if (fnet == NULL) {
588 printf("Net parameter file not found\n");
589 exit(1);
590 }
591 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
592 if (type == 0 || type == 1) {
593 fwrite(syn1neg, sizeof(real), vocab_size * layer1_size, fnet);
594 }
595 if (type == 2 || type == 3) {
596 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
597 }
598 fclose(fnet);
599}
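// The net file is a raw dump: syn0 (vocab_size * layer1_size reals) followed by
// the negative-sampling output weights (syn1neg for types 0/1, syn1neg_window
// for types 2/3). The read-net path in InitNet() expects exactly this layout
// with the same vocab_size and layer1_size.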
600
601void InitNet() {
602 long long a, b;
603 unsigned long long next_random = 1;
604 long long read;
605
606 window_layer_size = layer1_size * window * 2;
607 a = posix_memalign((void **) &syn0, 128,
608 (long long) vocab_size * layer1_size * sizeof(real));
609 if (syn0 == NULL) {
610 printf("Memory allocation failed\n");
611 exit(1);
612 }
613
614 if (hs) {
615 a = posix_memalign((void **) &syn1, 128,
616 (long long) vocab_size * layer1_size * sizeof(real));
617 if (syn1 == NULL) {
618 printf("Memory allocation failed\n");
619 exit(1);
620 }
621 a = posix_memalign((void **) &syn1_window, 128,
622 (long long) vocab_size * window_layer_size * sizeof(real));
623 if (syn1_window == NULL) {
624 printf("Memory allocation failed\n");
625 exit(1);
626 }
627 a = posix_memalign((void **) &syn_hidden_word, 128,
628 (long long) vocab_size * window_hidden_size * sizeof(real));
629 if (syn_hidden_word == NULL) {
630 printf("Memory allocation failed\n");
631 exit(1);
632 }
633
634 for (a = 0; a < vocab_size; a++)
635 for (b = 0; b < layer1_size; b++)
636 syn1[a * layer1_size + b] = 0;
637 for (a = 0; a < vocab_size; a++)
638 for (b = 0; b < window_layer_size; b++)
639 syn1_window[a * window_layer_size + b] = 0;
640 for (a = 0; a < vocab_size; a++)
641 for (b = 0; b < window_hidden_size; b++)
642 syn_hidden_word[a * window_hidden_size + b] = 0;
643 }
644 if (negative > 0) {
645 if (type == 0 || type == 1) {
646 a = posix_memalign((void **) &syn1neg, 128,
647 (long long) vocab_size * layer1_size * sizeof(real));
648 if (syn1neg == NULL) {
649 printf("Memory allocation failed\n");
650 exit(1);
651 }
652 for (a = 0; a < vocab_size; a++)
653 for (b = 0; b < layer1_size; b++)
654 syn1neg[a * layer1_size + b] = 0;
655 } else if (type == 2 || type == 3) {
656 a = posix_memalign((void **) &syn1neg_window, 128,
657 (long long) vocab_size * window_layer_size * sizeof(real));
658 if (syn1neg_window == NULL) {
659 printf("Memory allocation failed\n");
660 exit(1);
661 }
662 for (a = 0; a < vocab_size; a++)
663 for (b = 0; b < window_layer_size; b++)
664 syn1neg_window[a * window_layer_size + b] = 0;
665 } else if (type == 4) {
666 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
667 (long long) vocab_size * window_hidden_size * sizeof(real));
668 if (syn_hidden_word_neg == NULL) {
669 printf("Memory allocation failed\n");
670 exit(1);
671 }
672 for (a = 0; a < vocab_size; a++)
673 for (b = 0; b < window_hidden_size; b++)
674 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
675 }
676 }
677 if (nce > 0) {
678 a = posix_memalign((void **) &syn1nce, 128,
679 (long long) vocab_size * layer1_size * sizeof(real));
680 if (syn1nce == NULL) {
681 printf("Memory allocation failed\n");
682 exit(1);
683 }
684 a = posix_memalign((void **) &syn1nce_window, 128,
685 (long long) vocab_size * window_layer_size * sizeof(real));
686 if (syn1nce_window == NULL) {
687 printf("Memory allocation failed\n");
688 exit(1);
689 }
690 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
691 (long long) vocab_size * window_hidden_size * sizeof(real));
692 if (syn_hidden_word_nce == NULL) {
693 printf("Memory allocation failed\n");
694 exit(1);
695 }
696
697 for (a = 0; a < vocab_size; a++)
698 for (b = 0; b < layer1_size; b++)
699 syn1nce[a * layer1_size + b] = 0;
700 for (a = 0; a < vocab_size; a++)
701 for (b = 0; b < window_layer_size; b++)
702 syn1nce_window[a * window_layer_size + b] = 0;
703 for (a = 0; a < vocab_size; a++)
704 for (b = 0; b < window_hidden_size; b++)
705 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
706 }
707
708 if (type == 4) {
709 a = posix_memalign((void **) &syn_window_hidden, 128,
710 window_hidden_size * window_layer_size * sizeof(real));
711 if (syn_window_hidden == NULL) {
712 printf("Memory allocation failed\n");
713 exit(1);
714 }
715 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
716 next_random = next_random * (unsigned long long) 25214903917 + 11;
717 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
718 - 0.5) / (window_hidden_size * window_layer_size);
719 }
720 }
721
722 if (read_net_file[0] == 0) {
723 for (a = 0; a < vocab_size; a++)
724 for (b = 0; b < layer1_size; b++) {
725 next_random = next_random * (unsigned long long) 25214903917
726 + 11;
727 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
728 / (real) 65536) - 0.5) / layer1_size;
729 }
730 } else if ((type == 0 || type == 1) && negative > 0) {
731 FILE *fnet = fopen(read_net_file, "rb");
732 if (fnet == NULL) {
733 printf("Net parameter file not found\n");
734 exit(1);
735 }
736 printf("vocab-size: %lld, layer1_size: %lld\n",
737 vocab_size, layer1_size);
738 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
739 if (read != vocab_size * layer1_size) {
740 fprintf(stderr, "read-net failed %lld\n", read);
741 exit(-1);
742 }
743 read = fread(syn1neg, sizeof(real),
744 vocab_size * layer1_size, fnet);
745 if (read != (long long) vocab_size * layer1_size) {
746 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n",
747 read,
748 (long long) sizeof(real) * vocab_size * layer1_size);
749 exit(-1);
750 }
751 fgetc(fnet);
752 if (!feof(fnet)) {
753 fprintf(stderr,
754 "Remaining bytes in net-file after read-net. File position: %ld\n",
755 ftell(fnet));
756 exit(-1);
757 }
758 fclose(fnet);
759 } else if ((type == 2 || type == 3) && negative > 0) {
760 FILE *fnet = fopen(read_net_file, "rb");
761 if (fnet == NULL) {
762 printf("Net parameter file not found\n");
763 exit(1);
764 }
765 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n",
766 vocab_size, layer1_size, window_layer_size);
767 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
768 if (read != vocab_size * layer1_size) {
769 fprintf(stderr, "read-net failed %lld\n", read);
770 exit(-1);
771 }
772 read = fread(syn1neg_window, sizeof(real),
773 vocab_size * window_layer_size, fnet);
774 if (read != (long long) vocab_size * window_layer_size) {
775 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n",
776 read,
777 (long long) sizeof(real) * vocab_size * window_layer_size);
778 exit(-1);
779 }
780 fgetc(fnet);
781 if (!feof(fnet)) {
782 fprintf(stderr,
783 "Remaining bytes in net-file after read-net. File position: %ld\n",
784 ftell(fnet));
785 exit(-1);
786 }
787 fclose(fnet);
788 } else {
789 fprintf(stderr,
790 "read-net only supported for type 3 with negative sampling\n");
791 exit(-1);
792 }
793
794 CreateBinaryTree();
795}
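// Unless a net file is read, the input vectors syn0 are initialised uniformly
// in [-0.5, 0.5) / layer1_size using the same 25214903917 linear congruential
// generator as the training threads; all output weight matrices start at zero,
// except syn_window_hidden (type 4), which is also randomly initialised.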
796
797char *currentDateTime(char *buf, real offset) {
798 time_t t;
799 time(&t);
800 t += (long) offset;
801 struct tm tstruct;
802 tstruct = *localtime(&t);
803 strftime(buf, 80, "%c", &tstruct);
804 return buf;
805}
806
807void *MonitorThread(void *id) {
808 char *timebuf = malloc(80);
809 int i, n=num_threads;
810 long long sum;
811 sleep(1);
812 while(n > 0) {
813 sleep(1);
814 sum = n = 0;
815 for(i=0; i < num_threads; i++) {
816 if(threadPos[i] >= 0) {
817 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
818 n++;
819 } else {
820 sum += iter * file_size / num_threads;
821 }
822 }
823 if(n == 0)
824 break;
825 real finished_portion = (real) sum / (float) (file_size * iter);
826 long long now = time(NULL);
827 long long elapsed = (now - start);
828 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
829
830 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
831 alpha,
832 finished_portion * 100,
833 (float) sum / elapsed / 1000,
834 elapsed,
835 ttg,
836 currentDateTime(timebuf, ttg)
837 );
838 fflush(stdout);
839 }
840 pthread_exit(NULL);
841}
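// Progress is estimated from each thread's file position: a thread is assumed
// to sweep its file_size / num_threads slice once per iteration, so finished
// work = completed iterations * slice + current offset into the slice. From the
// total, the monitor derives throughput, time elapsed (TE), time to go (TTG)
// and an ETA.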
842
843void *TrainModelThread(void *id) {
844 long long a, b, d, cw, word, last_word, sentence_length = 0,
845 sentence_position = 0;
846 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
847 long long l1, l2, c, target, label, local_iter = iter;
848 unsigned long long next_random = (long long) id;
849 real f, g;
850 int input_len_1 = layer1_size;
851 int window_offset = -1;
852 if (type == 2 || type == 4) {
853 input_len_1 = window_layer_size;
854 }
855 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
856 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
857 threadIters[(long) id] = iter;
858
859 int input_len_2 = 0;
860 if (type == 4) {
861 input_len_2 = window_hidden_size;
862 }
863 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
864 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
865
866 FILE *fi = fopen(train_file, "rb");
867 long long start_pos = file_size / (long long) num_threads * (long long) id;
868 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
869 long long current_pos = start_pos;
870 long long last_pos = start_pos;
871 fseek(fi, start_pos, SEEK_SET);
872 while (1) {
873 if (word_count - last_word_count > 10000) {
874 // if ((current_pos - last_pos > 100000)) {
875 // PF: changed back, because it seems that alpha is not correctly adjusted otherwise.
876 word_count_actual += word_count - last_word_count;
877 last_pos = current_pos;
878 last_word_count = word_count;
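			// Decay the learning rate linearly with the fraction of words processed,
			// but never below 0.01% of its starting value.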
879 alpha = starting_alpha
880 * (1 - word_count_actual / (real) (iter * train_words + 1));
881 if (alpha < starting_alpha * 0.0001)
882 alpha = starting_alpha * 0.0001;
883 }
884 if (sentence_length == 0) {
885 while (1) {
886 word = ReadWordIndex(fi);
887 if (feof(fi))
888 break;
889 if (word == -1)
890 continue;
891 word_count++;
892 if (word == 0)
893 break;
894 // The subsampling randomly discards frequent words while keeping the ranking same
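				// With corpus frequency f(w) = cn(w) / train_words, a word is kept with
				// probability (sqrt(f(w) / sample) + 1) * sample / f(w): words at or below
				// the sample threshold are always kept, very frequent words are mostly dropped.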
895 if (sample > 0) {
896 real ran = (sqrt(vocab[word].cn / (sample * train_words))
897 + 1) * (sample * train_words) / vocab[word].cn;
898 next_random = next_random * (unsigned long long) 25214903917
899 + 11;
900 if (ran < (next_random & 0xFFFF) / (real) 65536) {
901 if (type == 3) // in structured skipgrams
902 word = -2; // keep the window position correct
903 else
904 continue;
905 }
906 }
907 sen[sentence_length] = word;
908 sentence_length++;
909 if (sentence_length >= MAX_SENTENCE_LENGTH)
910 break;
911 }
912 sentence_position = 0;
913 }
914 current_pos = threadPos[(long) id] = ftell(fi);
915 if (feof(fi) || current_pos >= end_pos ) {
916 word_count_actual += word_count - last_word_count;
917 threadIters[(long) id]--;
918 local_iter--;
919 if (local_iter == 0)
920 break;
921 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
922 printf("Magic stop file %s found. Stopping traing ...\n", magic_stop_file);
923 break;
924 }
925 word_count = 0;
926 last_word_count = 0;
927 sentence_length = 0;
928 fseek(fi, file_size / (long long) num_threads * (long long) id,
929 SEEK_SET);
930 continue;
931 }
932 word = sen[sentence_position];
933 while (word == -2 && sentence_position<sentence_length)
934 word = sen[++sentence_position];
935 if (sentence_position>=sentence_length) {
936 sentence_length=0;
937 continue;
938 }
939 if (word < 0)
940 continue;
941 for (c = 0; c < input_len_1; c++)
942 neu1[c] = 0;
943 for (c = 0; c < input_len_1; c++)
944 neu1e[c] = 0;
945 for (c = 0; c < input_len_2; c++)
946 neu2[c] = 0;
947 for (c = 0; c < input_len_2; c++)
948 neu2e[c] = 0;
949 next_random = next_random * (unsigned long long) 25214903917 + 11;
950 b = next_random % window;
951 if (type == 0) { //train the cbow architecture
952 // in -> hidden
953 cw = 0;
954 for (a = b; a < window * 2 + 1 - b; a++)
955 if (a != window) {
956 c = sentence_position - window + a;
957 if (c < 0)
958 continue;
959 if (c >= sentence_length)
960 continue;
961 last_word = sen[c];
962 if (last_word == -1)
963 continue;
964 for (c = 0; c < layer1_size; c++)
965 neu1[c] += syn0[c + last_word * layer1_size];
966 cw++;
967 }
968 if (cw) {
969 for (c = 0; c < layer1_size; c++)
970 neu1[c] /= cw;
971 if (hs)
972 for (d = 0; d < vocab[word].codelen; d++) {
973 f = 0;
974 l2 = vocab[word].point[d] * layer1_size;
975 // Propagate hidden -> output
976 for (c = 0; c < layer1_size; c++)
977 f += neu1[c] * syn1[c + l2];
978 if (f <= -MAX_EXP)
979 continue;
980 else if (f >= MAX_EXP)
981 continue;
982 else
983 f = expTable[(int) ((f + MAX_EXP)
984 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
985 // 'g' is the gradient multiplied by the learning rate
986 g = (1 - vocab[word].code[d] - f) * alpha;
987 // Propagate errors output -> hidden
988 for (c = 0; c < layer1_size; c++)
989 neu1e[c] += g * syn1[c + l2];
990 // Learn weights hidden -> output
991 for (c = 0; c < layer1_size; c++)
992 syn1[c + l2] += g * neu1[c];
993 if (cap == 1)
994 for (c = 0; c < layer1_size; c++)
995 capParam(syn1, c + l2);
996 }
997 // NEGATIVE SAMPLING
998 if (negative > 0)
999 for (d = 0; d < negative + 1; d++) {
1000 if (d == 0) {
1001 target = word;
1002 label = 1;
1003 } else {
1004 next_random = next_random
1005 * (unsigned long long) 25214903917 + 11;
1006 if (word_to_group != NULL
1007 && word_to_group[word] != -1) {
1008 target = word;
1009 while (target == word) {
1010 target = group_to_table[word_to_group[word]
1011 * table_size
1012 + (next_random >> 16) % table_size];
1013 next_random = next_random
1014 * (unsigned long long) 25214903917
1015 + 11;
1016 }
1017 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1018 } else {
1019 target =
1020 table[(next_random >> 16) % table_size];
1021 }
1022 if (target == 0)
1023 target = next_random % (vocab_size - 1) + 1;
1024 if (target == word)
1025 continue;
1026 label = 0;
1027 }
1028 l2 = target * layer1_size;
1029 f = 0;
1030 for (c = 0; c < layer1_size; c++)
1031 f += neu1[c] * syn1neg[c + l2];
1032 if (f > MAX_EXP)
1033 g = (label - 1) * alpha;
1034 else if (f < -MAX_EXP)
1035 g = (label - 0) * alpha;
1036 else
1037 g = (label
1038 - expTable[(int) ((f + MAX_EXP)
1039 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1040 * alpha;
1041 for (c = 0; c < layer1_size; c++)
1042 neu1e[c] += g * syn1neg[c + l2];
1043 for (c = 0; c < layer1_size; c++)
1044 syn1neg[c + l2] += g * neu1[c];
1045 if (cap == 1)
1046 for (c = 0; c < layer1_size; c++)
1047 capParam(syn1neg, c + l2);
1048 }
1049 // Noise Contrastive Estimation
1050 if (nce > 0)
1051 for (d = 0; d < nce + 1; d++) {
1052 if (d == 0) {
1053 target = word;
1054 label = 1;
1055 } else {
1056 next_random = next_random
1057 * (unsigned long long) 25214903917 + 11;
1058 if (word_to_group != NULL
1059 && word_to_group[word] != -1) {
1060 target = word;
1061 while (target == word) {
1062 target = group_to_table[word_to_group[word]
1063 * table_size
1064 + (next_random >> 16) % table_size];
1065 next_random = next_random
1066 * (unsigned long long) 25214903917
1067 + 11;
1068 }
1069 } else {
1070 target =
1071 table[(next_random >> 16) % table_size];
1072 }
1073 if (target == 0)
1074 target = next_random % (vocab_size - 1) + 1;
1075 if (target == word)
1076 continue;
1077 label = 0;
1078 }
1079 l2 = target * layer1_size;
1080 f = 0;
1081
1082 for (c = 0; c < layer1_size; c++)
1083 f += neu1[c] * syn1nce[c + l2];
1084 if (f > MAX_EXP)
1085 g = (label - 1) * alpha;
1086 else if (f < -MAX_EXP)
1087 g = (label - 0) * alpha;
1088 else {
1089 f = exp(f);
1090 g =
1091 (label
1092 - f
1093 / (noise_distribution[target]
1094 * nce + f)) * alpha;
1095 }
1096 for (c = 0; c < layer1_size; c++)
1097 neu1e[c] += g * syn1nce[c + l2];
1098 for (c = 0; c < layer1_size; c++)
1099 syn1nce[c + l2] += g * neu1[c];
1100 if (cap == 1)
1101 for (c = 0; c < layer1_size; c++)
1102 capParam(syn1nce, c + l2);
1103 }
1104 // hidden -> in
1105 for (a = b; a < window * 2 + 1 - b; a++)
1106 if (a != window) {
1107 c = sentence_position - window + a;
1108 if (c < 0)
1109 continue;
1110 if (c >= sentence_length)
1111 continue;
1112 last_word = sen[c];
1113 if (last_word == -1)
1114 continue;
1115 for (c = 0; c < layer1_size; c++)
1116 syn0[c + last_word * layer1_size] += neu1e[c];
1117 }
1118 }
1119 } else if (type == 1) { //train skip-gram
1120 for (a = b; a < window * 2 + 1 - b; a++)
1121 if (a != window) {
1122 c = sentence_position - window + a;
1123 if (c < 0)
1124 continue;
1125 if (c >= sentence_length)
1126 continue;
1127 last_word = sen[c];
1128 if (last_word == -1)
1129 continue;
1130 l1 = last_word * layer1_size;
1131 for (c = 0; c < layer1_size; c++)
1132 neu1e[c] = 0;
1133 // HIERARCHICAL SOFTMAX
1134 if (hs)
1135 for (d = 0; d < vocab[word].codelen; d++) {
1136 f = 0;
1137 l2 = vocab[word].point[d] * layer1_size;
1138 // Propagate hidden -> output
1139 for (c = 0; c < layer1_size; c++)
1140 f += syn0[c + l1] * syn1[c + l2];
1141 if (f <= -MAX_EXP)
1142 continue;
1143 else if (f >= MAX_EXP)
1144 continue;
1145 else
1146 f = expTable[(int) ((f + MAX_EXP)
1147 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1148 // 'g' is the gradient multiplied by the learning rate
1149 g = (1 - vocab[word].code[d] - f) * alpha;
1150 // Propagate errors output -> hidden
1151 for (c = 0; c < layer1_size; c++)
1152 neu1e[c] += g * syn1[c + l2];
1153 // Learn weights hidden -> output
1154 for (c = 0; c < layer1_size; c++)
1155 syn1[c + l2] += g * syn0[c + l1];
1156 if (cap == 1)
1157 for (c = 0; c < layer1_size; c++)
1158 capParam(syn1, c + l2);
1159 }
1160 // NEGATIVE SAMPLING
1161 if (negative > 0)
1162 for (d = 0; d < negative + 1; d++) {
1163 if (d == 0) {
1164 target = word;
1165 label = 1;
1166 } else {
1167 next_random = next_random
1168 * (unsigned long long) 25214903917 + 11;
1169 if (word_to_group != NULL
1170 && word_to_group[word] != -1) {
1171 target = word;
1172 while (target == word) {
1173 target =
1174 group_to_table[word_to_group[word]
1175 * table_size
1176 + (next_random >> 16)
1177 % table_size];
1178 next_random =
1179 next_random
1180 * (unsigned long long) 25214903917
1181 + 11;
1182 }
1183 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1184 } else {
1185 target = table[(next_random >> 16)
1186 % table_size];
1187 }
1188 if (target == 0)
1189 target = next_random % (vocab_size - 1) + 1;
1190 if (target == word)
1191 continue;
1192 label = 0;
1193 }
1194 l2 = target * layer1_size;
1195 f = 0;
1196 for (c = 0; c < layer1_size; c++)
1197 f += syn0[c + l1] * syn1neg[c + l2];
1198 if (f > MAX_EXP)
1199 g = (label - 1) * alpha;
1200 else if (f < -MAX_EXP)
1201 g = (label - 0) * alpha;
1202 else
1203 g =
1204 (label
1205 - expTable[(int) ((f + MAX_EXP)
1206 * (EXP_TABLE_SIZE
1207 / MAX_EXP / 2))])
1208 * alpha;
1209 for (c = 0; c < layer1_size; c++)
1210 neu1e[c] += g * syn1neg[c + l2];
1211 for (c = 0; c < layer1_size; c++)
1212 syn1neg[c + l2] += g * syn0[c + l1];
1213 if (cap == 1)
1214 for (c = 0; c < layer1_size; c++)
1215 capParam(syn1neg, c + l2);
1216 }
1217 //Noise Contrastive Estimation
1218 if (nce > 0)
1219 for (d = 0; d < nce + 1; d++) {
1220 if (d == 0) {
1221 target = word;
1222 label = 1;
1223 } else {
1224 next_random = next_random
1225 * (unsigned long long) 25214903917 + 11;
1226 if (word_to_group != NULL
1227 && word_to_group[word] != -1) {
1228 target = word;
1229 while (target == word) {
1230 target =
1231 group_to_table[word_to_group[word]
1232 * table_size
1233 + (next_random >> 16)
1234 % table_size];
1235 next_random =
1236 next_random
1237 * (unsigned long long) 25214903917
1238 + 11;
1239 }
1240 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1241 } else {
1242 target = table[(next_random >> 16)
1243 % table_size];
1244 }
1245 if (target == 0)
1246 target = next_random % (vocab_size - 1) + 1;
1247 if (target == word)
1248 continue;
1249 label = 0;
1250 }
1251 l2 = target * layer1_size;
1252 f = 0;
1253 for (c = 0; c < layer1_size; c++)
1254 f += syn0[c + l1] * syn1nce[c + l2];
1255 if (f > MAX_EXP)
1256 g = (label - 1) * alpha;
1257 else if (f < -MAX_EXP)
1258 g = (label - 0) * alpha;
1259 else {
1260 f = exp(f);
1261 g = (label
1262 - f
1263 / (noise_distribution[target]
1264 * nce + f)) * alpha;
1265 }
1266 for (c = 0; c < layer1_size; c++)
1267 neu1e[c] += g * syn1nce[c + l2];
1268 for (c = 0; c < layer1_size; c++)
1269 syn1nce[c + l2] += g * syn0[c + l1];
1270 if (cap == 1)
1271 for (c = 0; c < layer1_size; c++)
1272 capParam(syn1nce, c + l2);
1273 }
1274 // Learn weights input -> hidden
1275 for (c = 0; c < layer1_size; c++)
1276 syn0[c + l1] += neu1e[c];
1277 }
1278 } else if (type == 2) { //train the cwindow architecture
1279 // in -> hidden
1280 cw = 0;
1281 for (a = 0; a < window * 2 + 1; a++)
1282 if (a != window) {
1283 c = sentence_position - window + a;
1284 if (c < 0)
1285 continue;
1286 if (c >= sentence_length)
1287 continue;
1288 last_word = sen[c];
1289 if (last_word == -1)
1290 continue;
1291 window_offset = a * layer1_size;
1292 if (a > window)
1293 window_offset -= layer1_size;
1294 for (c = 0; c < layer1_size; c++)
1295 neu1[c + window_offset] += syn0[c
1296 + last_word * layer1_size];
1297 cw++;
1298 }
1299 if (cw) {
1300 if (hs)
1301 for (d = 0; d < vocab[word].codelen; d++) {
1302 f = 0;
1303 l2 = vocab[word].point[d] * window_layer_size;
1304 // Propagate hidden -> output
1305 for (c = 0; c < window_layer_size; c++)
1306 f += neu1[c] * syn1_window[c + l2];
1307 if (f <= -MAX_EXP)
1308 continue;
1309 else if (f >= MAX_EXP)
1310 continue;
1311 else
1312 f = expTable[(int) ((f + MAX_EXP)
1313 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1314 // 'g' is the gradient multiplied by the learning rate
1315 g = (1 - vocab[word].code[d] - f) * alpha;
1316 // Propagate errors output -> hidden
1317 for (c = 0; c < window_layer_size; c++)
1318 neu1e[c] += g * syn1_window[c + l2];
1319 // Learn weights hidden -> output
1320 for (c = 0; c < window_layer_size; c++)
1321 syn1_window[c + l2] += g * neu1[c];
1322 if (cap == 1)
1323 for (c = 0; c < window_layer_size; c++)
1324 capParam(syn1_window, c + l2);
1325 }
1326 // NEGATIVE SAMPLING
1327 if (negative > 0)
1328 for (d = 0; d < negative + 1; d++) {
1329 if (d == 0) {
1330 target = word;
1331 label = 1;
1332 } else {
1333 next_random = next_random
1334 * (unsigned long long) 25214903917 + 11;
1335 if (word_to_group != NULL
1336 && word_to_group[word] != -1) {
1337 target = word;
1338 while (target == word) {
1339 target = group_to_table[word_to_group[word]
1340 * table_size
1341 + (next_random >> 16) % table_size];
1342 next_random = next_random
1343 * (unsigned long long) 25214903917
1344 + 11;
1345 }
1346 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1347 } else {
1348 target =
1349 table[(next_random >> 16) % table_size];
1350 }
1351 if (target == 0)
1352 target = next_random % (vocab_size - 1) + 1;
1353 if (target == word)
1354 continue;
1355 label = 0;
1356 }
1357 l2 = target * window_layer_size;
1358 f = 0;
1359 for (c = 0; c < window_layer_size; c++)
1360 f += neu1[c] * syn1neg_window[c + l2];
1361 if (f > MAX_EXP)
1362 g = (label - 1) * alpha;
1363 else if (f < -MAX_EXP)
1364 g = (label - 0) * alpha;
1365 else
1366 g = (label
1367 - expTable[(int) ((f + MAX_EXP)
1368 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1369 * alpha;
1370 for (c = 0; c < window_layer_size; c++)
1371 neu1e[c] += g * syn1neg_window[c + l2];
1372 for (c = 0; c < window_layer_size; c++)
1373 syn1neg_window[c + l2] += g * neu1[c];
1374 if (cap == 1)
1375 for (c = 0; c < window_layer_size; c++)
1376 capParam(syn1neg_window, c + l2);
1377 }
1378 // Noise Contrastive Estimation
1379 if (nce > 0)
1380 for (d = 0; d < nce + 1; d++) {
1381 if (d == 0) {
1382 target = word;
1383 label = 1;
1384 } else {
1385 next_random = next_random
1386 * (unsigned long long) 25214903917 + 11;
1387 if (word_to_group != NULL
1388 && word_to_group[word] != -1) {
1389 target = word;
1390 while (target == word) {
1391 target = group_to_table[word_to_group[word]
1392 * table_size
1393 + (next_random >> 16) % table_size];
1394 next_random = next_random
1395 * (unsigned long long) 25214903917
1396 + 11;
1397 }
1398 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1399 } else {
1400 target =
1401 table[(next_random >> 16) % table_size];
1402 }
1403 if (target == 0)
1404 target = next_random % (vocab_size - 1) + 1;
1405 if (target == word)
1406 continue;
1407 label = 0;
1408 }
1409 l2 = target * window_layer_size;
1410 f = 0;
1411 for (c = 0; c < window_layer_size; c++)
1412 f += neu1[c] * syn1nce_window[c + l2];
1413 if (f > MAX_EXP)
1414 g = (label - 1) * alpha;
1415 else if (f < -MAX_EXP)
1416 g = (label - 0) * alpha;
1417 else {
1418 f = exp(f);
1419 g =
1420 (label
1421 - f
1422 / (noise_distribution[target]
1423 * nce + f)) * alpha;
1424 }
1425 for (c = 0; c < window_layer_size; c++)
1426 neu1e[c] += g * syn1nce_window[c + l2];
1427 for (c = 0; c < window_layer_size; c++)
1428 syn1nce_window[c + l2] += g * neu1[c];
1429 if (cap == 1)
1430 for (c = 0; c < window_layer_size; c++)
1431 capParam(syn1nce_window, c + l2);
1432 }
1433 // hidden -> in
1434 for (a = 0; a < window * 2 + 1; a++)
1435 if (a != window) {
1436 c = sentence_position - window + a;
1437 if (c < 0)
1438 continue;
1439 if (c >= sentence_length)
1440 continue;
1441 last_word = sen[c];
1442 if (last_word == -1)
1443 continue;
1444 window_offset = a * layer1_size;
1445 if (a > window)
1446 window_offset -= layer1_size;
1447 for (c = 0; c < layer1_size; c++)
1448 syn0[c + last_word * layer1_size] += neu1e[c
1449 + window_offset];
1450 }
1451 }
1452 } else if (type == 3) { //train structured skip-gram
1453 for (a = 0; a < window * 2 + 1; a++)
1454 if (a != window) {
1455 c = sentence_position - window + a;
1456 if (c < 0)
1457 continue;
1458 if (c >= sentence_length)
1459 continue;
1460 last_word = sen[c];
1461 if (last_word < 0)
1462 continue;
1463 l1 = last_word * layer1_size;
1464 window_offset = a * layer1_size;
1465 if (a > window)
1466 window_offset -= layer1_size;
1467 for (c = 0; c < layer1_size; c++)
1468 neu1e[c] = 0;
1469 // HIERARCHICAL SOFTMAX
1470 if (hs)
1471 for (d = 0; d < vocab[word].codelen; d++) {
1472 f = 0;
1473 l2 = vocab[word].point[d] * window_layer_size;
1474 // Propagate hidden -> output
1475 for (c = 0; c < layer1_size; c++)
1476 f += syn0[c + l1]
1477 * syn1_window[c + l2 + window_offset];
1478 if (f <= -MAX_EXP)
1479 continue;
1480 else if (f >= MAX_EXP)
1481 continue;
1482 else
1483 f = expTable[(int) ((f + MAX_EXP)
1484 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1485 // 'g' is the gradient multiplied by the learning rate
1486 g = (1 - vocab[word].code[d] - f) * alpha;
1487 // Propagate errors output -> hidden
1488 for (c = 0; c < layer1_size; c++)
1489 neu1e[c] += g
1490 * syn1_window[c + l2 + window_offset];
1491 // Learn weights hidden -> output
1492 for (c = 0; c < layer1_size; c++)
1493 syn1[c + l2 + window_offset] += g
1494 * syn0[c + l1];
1495 if (cap == 1)
1496 for (c = 0; c < layer1_size; c++)
1497 capParam(syn1, c + l2 + window_offset);
1498 }
1499 // NEGATIVE SAMPLING
1500 if (negative > 0)
1501 for (d = 0; d < negative + 1; d++) {
1502 if (d == 0) {
1503 target = word;
1504 label = 1;
1505 } else {
1506 next_random = next_random
1507 * (unsigned long long) 25214903917 + 11;
1508 if (word_to_group != NULL
1509 && word_to_group[word] != -1) {
1510 target = word;
1511 while (target == word) {
1512 target =
1513 group_to_table[word_to_group[word]
1514 * table_size
1515 + (next_random >> 16)
1516 % table_size];
1517 next_random =
1518 next_random
1519 * (unsigned long long) 25214903917
1520 + 11;
1521 }
1522 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1523 } else {
1524 target = table[(next_random >> 16)
1525 % table_size];
1526 }
1527 if (target == 0)
1528 target = next_random % (vocab_size - 1) + 1;
1529 if (target == word)
1530 continue;
1531 label = 0;
1532 }
1533 l2 = target * window_layer_size;
1534 f = 0;
1535 for (c = 0; c < layer1_size; c++)
1536 f +=
1537 syn0[c + l1]
1538 * syn1neg_window[c + l2
1539 + window_offset];
1540 if (f > MAX_EXP)
1541 g = (label - 1) * alpha;
1542 else if (f < -MAX_EXP)
1543 g = (label - 0) * alpha;
1544 else
1545 g =
1546 (label
1547 - expTable[(int) ((f + MAX_EXP)
1548 * (EXP_TABLE_SIZE
1549 / MAX_EXP / 2))])
1550 * alpha;
1551 if(debug_mode > 2 && ((long long) id) == 0) {
1552 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1553 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1554 }
1555 for (c = 0; c < layer1_size; c++)
1556 neu1e[c] +=
1557 g
1558 * syn1neg_window[c + l2
1559 + window_offset];
1560 for (c = 0; c < layer1_size; c++)
1561 syn1neg_window[c + l2 + window_offset] += g
1562 * syn0[c + l1];
1563 if (cap == 1)
1564 for (c = 0; c < layer1_size; c++)
1565 capParam(syn1neg_window,
1566 c + l2 + window_offset);
1567 }
1568 // Noise Contrastive Estimation
1569 if (nce > 0)
1570 for (d = 0; d < nce + 1; d++) {
1571 if (d == 0) {
1572 target = word;
1573 label = 1;
1574 } else {
1575 next_random = next_random
1576 * (unsigned long long) 25214903917 + 11;
1577 if (word_to_group != NULL
1578 && word_to_group[word] != -1) {
1579 target = word;
1580 while (target == word) {
1581 target =
1582 group_to_table[word_to_group[word]
1583 * table_size
1584 + (next_random >> 16)
1585 % table_size];
1586 next_random =
1587 next_random
1588 * (unsigned long long) 25214903917
1589 + 11;
1590 }
1591 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1592 } else {
1593 target = table[(next_random >> 16)
1594 % table_size];
1595 }
1596 if (target == 0)
1597 target = next_random % (vocab_size - 1) + 1;
1598 if (target == word)
1599 continue;
1600 label = 0;
1601 }
1602 l2 = target * window_layer_size;
1603 f = 0;
1604 for (c = 0; c < layer1_size; c++)
1605 f +=
1606 syn0[c + l1]
1607 * syn1nce_window[c + l2
1608 + window_offset];
1609 if (f > MAX_EXP)
1610 g = (label - 1) * alpha;
1611 else if (f < -MAX_EXP)
1612 g = (label - 0) * alpha;
1613 else {
1614 f = exp(f);
1615 g = (label
1616 - f
1617 / (noise_distribution[target]
1618 * nce + f)) * alpha;
1619 }
1620 for (c = 0; c < layer1_size; c++)
1621 neu1e[c] +=
1622 g
1623 * syn1nce_window[c + l2
1624 + window_offset];
1625 for (c = 0; c < layer1_size; c++)
1626 syn1nce_window[c + l2 + window_offset] += g
1627 * syn0[c + l1];
1628 if (cap == 1)
1629 for (c = 0; c < layer1_size; c++)
1630 capParam(syn1nce_window,
1631 c + l2 + window_offset);
1632 }
1633 // Learn weights input -> hidden
1634 for (c = 0; c < layer1_size; c++) {
1635 syn0[c + l1] += neu1e[c];
1636 if (syn0[c + l1] > 50)
1637 syn0[c + l1] = 50;
1638 if (syn0[c + l1] < -50)
1639 syn0[c + l1] = -50;
1640 }
1641 }
1642 } else if (type == 4) { //train the senna architecture
1643 // in -> hidden
1644 cw = 0;
1645 for (a = 0; a < window * 2 + 1; a++)
1646 if (a != window) {
1647 c = sentence_position - window + a;
1648 if (c < 0)
1649 continue;
1650 if (c >= sentence_length)
1651 continue;
1652 last_word = sen[c];
1653 if (last_word == -1)
1654 continue;
1655 window_offset = a * layer1_size;
1656 if (a > window)
1657 window_offset -= layer1_size;
1658 for (c = 0; c < layer1_size; c++)
1659 neu1[c + window_offset] += syn0[c
1660 + last_word * layer1_size];
1661 cw++;
1662 }
1663 if (cw) {
1664 for (a = 0; a < window_hidden_size; a++) {
1665 c = a * window_layer_size;
1666 for (b = 0; b < window_layer_size; b++) {
1667 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1668 }
1669 }
1670 if (hs)
1671 for (d = 0; d < vocab[word].codelen; d++) {
1672 f = 0;
1673 l2 = vocab[word].point[d] * window_hidden_size;
1674 // Propagate hidden -> output
1675 for (c = 0; c < window_hidden_size; c++)
1676 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1677 if (f <= -MAX_EXP)
1678 continue;
1679 else if (f >= MAX_EXP)
1680 continue;
1681 else
1682 f = expTable[(int) ((f + MAX_EXP)
1683 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1684 // 'g' is the gradient multiplied by the learning rate
1685 g = (1 - vocab[word].code[d] - f) * alpha;
1686 // Propagate errors output -> hidden
1687 for (c = 0; c < window_hidden_size; c++)
1688 neu2e[c] += dHardTanh(neu2[c], g) * g
1689 * syn_hidden_word[c + l2];
1690 // Learn weights hidden -> output
1691 for (c = 0; c < window_hidden_size; c++)
1692 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1693 * neu2[c];
1694 }
1695 // NEGATIVE SAMPLING
1696 if (negative > 0)
1697 for (d = 0; d < negative + 1; d++) {
1698 if (d == 0) {
1699 target = word;
1700 label = 1;
1701 } else {
1702 next_random = next_random
1703 * (unsigned long long) 25214903917 + 11;
1704 if (word_to_group != NULL
1705 && word_to_group[word] != -1) {
1706 target = word;
1707 while (target == word) {
1708 target = group_to_table[word_to_group[word]
1709 * table_size
1710 + (next_random >> 16) % table_size];
1711 next_random = next_random
1712 * (unsigned long long) 25214903917
1713 + 11;
1714 }
1715 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1716 } else {
1717 target =
1718 table[(next_random >> 16) % table_size];
1719 }
1720 if (target == 0)
1721 target = next_random % (vocab_size - 1) + 1;
1722 if (target == word)
1723 continue;
1724 label = 0;
1725 }
1726 l2 = target * window_hidden_size;
1727 f = 0;
1728 for (c = 0; c < window_hidden_size; c++)
1729 f += hardTanh(neu2[c])
1730 * syn_hidden_word_neg[c + l2];
1731 if (f > MAX_EXP)
1732 g = (label - 1) * alpha / negative;
1733 else if (f < -MAX_EXP)
1734 g = (label - 0) * alpha / negative;
1735 else
1736 g = (label
1737 - expTable[(int) ((f + MAX_EXP)
1738 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1739 * alpha / negative;
1740 for (c = 0; c < window_hidden_size; c++)
1741 neu2e[c] += dHardTanh(neu2[c], g) * g
1742 * syn_hidden_word_neg[c + l2];
1743 for (c = 0; c < window_hidden_size; c++)
1744 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1745 * g * neu2[c];
1746 }
1747 for (a = 0; a < window_hidden_size; a++)
1748 for (b = 0; b < window_layer_size; b++)
1749 neu1e[b] += neu2e[a]
1750 * syn_window_hidden[a * window_layer_size + b];
1751 for (a = 0; a < window_hidden_size; a++)
1752 for (b = 0; b < window_layer_size; b++)
1753 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1754 * neu1[b];
1755 // hidden -> in
1756 for (a = 0; a < window * 2 + 1; a++)
1757 if (a != window) {
1758 c = sentence_position - window + a;
1759 if (c < 0)
1760 continue;
1761 if (c >= sentence_length)
1762 continue;
1763 last_word = sen[c];
1764 if (last_word == -1)
1765 continue;
1766 window_offset = a * layer1_size;
1767 if (a > window)
1768 window_offset -= layer1_size;
1769 for (c = 0; c < layer1_size; c++)
1770 syn0[c + last_word * layer1_size] += neu1e[c
1771 + window_offset];
1772 }
1773 }
1774 } else if(type == 5) { // only count collocations into the collocator database
1775 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1776 c = sentence_position - window + a;
1777 if (c < 0) continue;
1778 if (c >= sentence_length) continue;
1779 last_word = sen[c];
1780 if (last_word == -1) continue;
1781 inc_collocator(cdb, word, last_word, a - window);
1782 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1783 // cw++;
1784 }
1785 } else {
1786 printf("unknown type %i", type);
1787 exit(0);
1788 }
1789 sentence_position++;
1790 if (sentence_position >= sentence_length) {
1791 sentence_length = 0;
1792 continue;
1793 }
1794 }
1795 fclose(fi);
1796 free(neu1);
1797 free(neu1e);
1798 threadPos[(long) id] = -1;
1799 pthread_exit(NULL);
1800}
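// Each training thread owns a file_size / num_threads byte slice of the corpus:
// it seeks to the slice start, reads sentences until it runs past the slice end
// (or EOF), and repeats this for iter passes, publishing its file position in
// threadPos and its remaining passes in threadIters for the monitor thread.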
1801
1802void ShowCollocations() {
1803 long a, b, c, d, e, window_offset, target, max_target = 0, maxmax_target;
1804 real f, max_f, maxmax_f;
1805 real *target_sums, bestf[MAX_CC], worstbest;
1806 long besti[MAX_CC];
1807 int N = 10, bestp[MAX_CC];
1808 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1809
1810 for (d = cc; d < vocab_size; d++) {
1811 for (b = 0; b < vocab_size; b++)
1812 target_sums[b] = 0;
1813 for (b = 0; b < N; b++)
1814 bestf[b] = -1;
1815 worstbest = -1;
1816
1817 maxmax_f = -1;
1818 maxmax_target = 0;
1819 for (a = window * 2 + 1; a >=0; a--) {
1820 if (a != window) {
1821 max_f = -1;
1822 window_offset = a * layer1_size;
1823 if (a > window)
1824 window_offset -= layer1_size;
1825 for(target = 0; target < vocab_size; target ++) {
1826 if(target == d)
1827 continue;
1828 f = 0;
1829 for (c = 0; c < layer1_size; c++)
1830 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1831 if (f < -MAX_EXP)
1832 continue;
1833 else if (f > MAX_EXP)
1834 continue;
1835 else
1836 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1837 if(f > max_f) {
1838 max_f = f;
1839 max_target = target;
1840 }
1841 target_sums[target] += (1-target_sums[target]) * f;
1842 if(f > worstbest) {
1843 for (b = 0; b < N; b++) {
1844 if (f > bestf[b]) {
1845 for (e = N - 1; e > b; e--) {
1846 bestf[e] = bestf[e - 1];
1847 besti[e] = besti[e - 1];
1848 bestp[e] = bestp[e - 1];
1849 }
1850 bestf[b] = f;
1851 besti[b] = target;
1852 bestp[b] = window-a;
1853 break;
1854 }
1855 }
1856 worstbest = bestf[N - 1];
1857 }
1858 }
1859 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1860 if (max_f > maxmax_f) {
1861 maxmax_f = max_f;
1862 maxmax_target = max_target;
1863 }
1864 } else {
1865 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1866 }
1867 }
1868 max_f = -1;
1869 for (b = 0; b < vocab_size; b++) {
1870 if (target_sums[b] > max_f) {
1871 max_f = target_sums[b];
1872 max_target = b;
1873 }
1874 }
1875 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1876 vocab[max_target].word, max_f, vocab[maxmax_target].word,
1877 maxmax_f);
1878 for (b = 0; b < N && bestf[b] > -1; b++)
1879 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1880 printf("\n");
1881 }
1882}
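// For every word from index cc upwards, this prints the most strongly predicted
// collocate at each window position (scored with the negative-sampling window
// output weights syn1neg_window), the best collocate aggregated over all
// positions, and the N strongest (collocate, position) pairs overall.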
1883
1884void TrainModel() {
1885 long a, b, c, d;
1886 FILE *fo;
1887 pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // one extra slot for the monitor thread
1888 threadPos = malloc(num_threads * sizeof(long long));
1889 threadIters = malloc(num_threads * sizeof(int));
1890 char *timebuf = malloc(80);
1891 printf("Starting training using file %s\n", train_file);
1892 starting_alpha = alpha;
1893 if (read_vocab_file[0] != 0)
1894 ReadVocab();
1895 else
1896 LearnVocabFromTrainFile();
1897 if (save_vocab_file[0] != 0)
1898 SaveVocab();
1899 if (output_file[0] == 0)
1900 return;
1901 InitNet();
1902 if (cc > 0)
1903 ShowCollocations();
1904 if (negative > 0 || nce > 0)
1905 InitUnigramTable();
1906 if (negative_classes_file[0] != 0)
1907 InitClassUnigramTable();
1908 start = time(NULL);
1909 start_clock = clock();
1910 for (a = 0; a < num_threads; a++)
1911 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1912 if(debug_mode > 1)
1913 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
1914 for (a = 0; a < num_threads; a++)
1915 pthread_join(pt[a], NULL);
1916 if(debug_mode > 1) {
1917 pthread_join(pt[num_threads], NULL);
1918 clock_t now = time(NULL);
1919 clock_t now_clock = clock();
1920 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
1921 if(type == 5) // don't save vectors for classic collocators
1922 return;
1923 printf("Saving vectors to %s ...", output_file);
1924 fflush(stdout);
1925 }
1926 fo = fopen(output_file, "wb");
1927 if (classes == 0) {
1928 // Save the word vectors
1929 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1930 for (a = 0; a < vocab_size; a++) {
1931 fprintf(fo, "%s ", vocab[a].word);
1932 if (binary)
1933 for (b = 0; b < layer1_size; b++)
1934 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1935 else
1936 for (b = 0; b < layer1_size; b++)
1937 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1938 fprintf(fo, "\n");
1939 }
1940 if(debug_mode > 1)
1941 fprintf(stderr, "\n");
1942 } else {
1943 // Run K-means on the word vectors
1944 int clcn = classes, iter = 10, closeid;
1945 int *centcn = (int *) malloc(classes * sizeof(int));
1946 int *cl = (int *) calloc(vocab_size, sizeof(int));
1947 real closev, x;
1948 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1949 for (a = 0; a < vocab_size; a++)
1950 cl[a] = a % clcn;
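 // Spherical k-means: accumulate the vectors of each class, L2-normalize the
 // centroids, then reassign every word to the centroid with the largest dot
 // product; repeat for a fixed number of iterations.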
1951 for (a = 0; a < iter; a++) {
1952 for (b = 0; b < clcn * layer1_size; b++)
1953 cent[b] = 0;
1954 for (b = 0; b < clcn; b++)
1955 centcn[b] = 1;
1956 for (c = 0; c < vocab_size; c++) {
1957 for (d = 0; d < layer1_size; d++)
1958 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1959 centcn[cl[c]]++;
1960 }
1961 for (b = 0; b < clcn; b++) {
1962 closev = 0;
1963 for (c = 0; c < layer1_size; c++) {
1964 cent[layer1_size * b + c] /= centcn[b];
1965 closev += cent[layer1_size * b + c]
1966 * cent[layer1_size * b + c];
1967 }
1968 closev = sqrt(closev);
1969 for (c = 0; c < layer1_size; c++)
1970 cent[layer1_size * b + c] /= closev;
1971 }
1972 for (c = 0; c < vocab_size; c++) {
1973 closev = -10;
1974 closeid = 0;
1975 for (d = 0; d < clcn; d++) {
1976 x = 0;
1977 for (b = 0; b < layer1_size; b++)
1978 x += cent[layer1_size * d + b]
1979 * syn0[c * layer1_size + b];
1980 if (x > closev) {
1981 closev = x;
1982 closeid = d;
1983 }
1984 }
1985 cl[c] = closeid;
1986 }
1987 }
1988 // Save the K-means classes
1989 for (a = 0; a < vocab_size; a++)
1990 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1991 free(centcn);
1992 free(cent);
1993 free(cl);
1994 }
1995 fclose(fo);
1996 if (save_net_file[0] != 0)
1997 SaveNet();
1998}
1999
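// Returns the position of option str in argv, or -1 if it is not present;
// exits if the option is the last argument and therefore has no value.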
2000int ArgPos(char *str, int argc, char **argv) {
2001 int a;
2002 for (a = 1; a < argc; a++)
2003 if (!strcmp(str, argv[a])) {
2004 if (a == argc - 1) {
2005 printf("Argument missing for %s\n", str);
2006 exit(1);
2007 }
2008 return a;
2009 }
2010 return -1;
2011}
2012
2013void print_help() {
2014 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
2015 printf("Options:\n");
2016 printf("Parameters for training:\n");
2017 printf("\t-train <file>\n");
2018 printf("\t\tUse text data from <file> to train the model\n");
2019 printf("\t-output <file>\n");
2020 printf(
2021 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
2022 printf("\t-size <int>\n");
2023 printf("\t\tSet size of word vectors; default is 100\n");
2024 printf("\t-window <int>\n");
2025 printf("\t\tSet max skip length between words; default is 5\n");
2026 printf("\t-sample <float>\n");
2027 printf(
2028 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
2029 printf(
2030 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
2031 printf("\t-hs <int>\n");
2032 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
2033 printf("\t-negative <int>\n");
2034 printf(
2035 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
2036 printf("\t-negative-classes <file>\n");
2037 printf("\t\tNegative classes to sample from\n");
2038 printf("\t-nce <int>\n");
2039 printf(
2040 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
2041 printf("\t-threads <int>\n");
2042 printf("\t\tUse <int> threads (default 12)\n");
2043 printf("\t-iter <int>\n");
2044 printf("\t\tRun more training iterations (default 5)\n");
2045 printf("\t-min-count <int>\n");
2046 printf(
2047 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
2048 printf("\t-alpha <float>\n");
2049 printf(
2050 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
2051 printf("\t-classes <int>\n");
2052 printf(
2053 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
2054 printf("\t-debug <int>\n");
2055 printf(
2056 "\t\tSet the debug mode (default = 2 = more info during training)\n");
2057 printf("\t-binary <int>\n");
2058 printf(
2059 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
2060 printf("\t-save-vocab <file>\n");
2061 printf("\t\tThe vocabulary will be saved to <file>\n");
2062 printf("\t-read-vocab <file>\n");
2063 printf(
2064 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
2065 printf("\t-train-counts <int>\n");
2066 printf(
2067 "\t\tUse word counts of actual corpus rather than vocabulary counts; default is 1 (on)\n");
2068 printf("\t-read-net <file>\n");
2069 printf(
2070 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
2071 printf("\t-save-net <file>\n");
2072 printf("\t\tThe net parameters will be saved to <file>\n");
2073 printf("\t-magic-stop-file <file>\n");
2074 printf("\t\tIf the magic file <file> exists training will stop after the current cycle.\n");
2075 printf("\t-show-cc <int>\n");
2076 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
2077 printf("\t-type <int>\n");
2078 printf(
2079 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type, 5 for store positional bigramms)\n");
2080 printf("\t-cap <int>\n");
2081 printf(
2082 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
2083 printf("\nExamples:\n");
2084 printf(
2085 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
2086}
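// A hypothetical invocation (a sketch inferred from the option descriptions
// above, not from the original documentation): store positional bigrams in a
// collocator database instead of training vectors:
//
//   ./word2vec -train data.txt -output collocators.db -type 5 -window 5 -threads 12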
2087
2088int main(int argc, char **argv) {
2089 int i;
2090 setlocale(LC_ALL, "");
2091 if (argc == 1) {
2092 print_help();
2093 return 0;
2094 }
2095 output_file[0] = 0;
2096 save_vocab_file[0] = 0;
2097 read_vocab_file[0] = 0;
2098 save_net_file[0] = 0;
2099 read_net_file[0] = 0;
2100 negative_classes_file[0] = 0;
2101 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2102 print_help();
2103 return(0);
2104 }
2105 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2106 print_help();
2107 return(0);
2108 }
2109 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2110 layer1_size = atoi(argv[i + 1]);
2111 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2112 strcpy(train_file, argv[i + 1]);
2113 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2114 strcpy(save_vocab_file, argv[i + 1]);
2115 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2116 strcpy(read_vocab_file, argv[i + 1]);
2117 if ((i = ArgPos((char *) "-train-counts", argc, argv)) > 0)
2118 tc = atoi(argv[i + 1]);
2119 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2120 strcpy(save_net_file, argv[i + 1]);
2121 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2122 strcpy(read_net_file, argv[i + 1]);
2123 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2124 strcpy(magic_stop_file, argv[i + 1]);
2125 if (access(magic_stop_file, F_OK ) != -1) {
2126 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2127 exit(1);
2128 }
2129 }
2130 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2131 debug_mode = atoi(argv[i + 1]);
2132 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2133 binary = atoi(argv[i + 1]);
2134 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2135 cc = atoi(argv[i + 1]);
2136 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2137 type = atoi(argv[i + 1]);
2138 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2139 strcpy(output_file, argv[i + 1]);
2140 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2141 window = atoi(argv[i + 1]);
2142 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2143 sample = atof(argv[i + 1]);
2144 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2145 hs = atoi(argv[i + 1]);
2146 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2147 negative = atoi(argv[i + 1]);
2148 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2149 strcpy(negative_classes_file, argv[i + 1]);
2150 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2151 nce = atoi(argv[i + 1]);
2152 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2153 num_threads = atoi(argv[i + 1]);
2154 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2155 iter = atoi(argv[i + 1]);
2156 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2157 min_count = atoi(argv[i + 1]);
2158 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2159 classes = atoi(argv[i + 1]);
2160 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2161 cap = atoi(argv[i + 1]);
2162 if (type == 0 || type == 2 || type == 4) // cbow-style types (0, 2, 4) use the higher 0.05 default learning rate
2163 alpha = 0.05;
2164 if (type == 5) { // positional-bigram mode: disable subsampling and write directly to a collocator database
2165 sample = 0;
2166 cdb = open_collocatordb_for_write(output_file);
2167 }
2168 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2169 alpha = atof(argv[i + 1]);
2170 vocab = (struct vocab_word *) calloc(vocab_max_size,
2171 sizeof(struct vocab_word));
2172 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2173 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2174 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2175 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2176 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
2177 }
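 // expTable[i] thus holds sigmoid((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP),
 // so a dot product f in (-MAX_EXP, MAX_EXP) is looked up via the index
 // (int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)).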
2178 SaveArgs(argc, argv);
2179 TrainModel();
2180 return 0;
2181}
2182