1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <unistd.h>
19#include <math.h>
20#include <pthread.h>
21
22#define MAX_STRING 100
23#define EXP_TABLE_SIZE 1000
24#define MAX_EXP 6
25#define MAX_SENTENCE_LENGTH 1000
26#define MAX_CC 100
27#define MAX_CODE_LENGTH 40
28
29const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
30
31typedef float real; // Precision of float numbers
32
33struct vocab_word {
34 long long cn;
35 int *point;
36 char *word, *code, codelen;
37};
38
39char train_file[MAX_STRING], output_file[MAX_STRING];
40char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
41char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
42struct vocab_word *vocab;
43int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
44 num_threads = 12, min_reduce = 1;
45int *vocab_hash;
46long long *threadPos;
47int *threadIters;
48long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
49long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
50 classes = 0;
51real alpha = 0.025, starting_alpha, sample = 1e-3;
52real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
53real avgWordLength = 0;
54clock_t start;
55
56real *syn1_window, *syn1neg_window, *syn1nce_window;
57int w_offset, window_layer_size;
58
59int window_hidden_size = 500;
60real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
61 *syn_hidden_word_nce;
62
63int hs = 0, negative = 5;
64const int table_size = 1e8;
65int *table;
66
67long cc = 0;
68
69// contrastive negative sampling
70char negative_classes_file[MAX_STRING];
71int *word_to_group;
72int *group_to_table; //group_size*table_size
73int class_number;
74
75//nce
76real* noise_distribution;
77int nce = 0;
78
79//param caps
80real CAP_VALUE = 50;
81int cap = 0;
82
83void capParam(real* array, int index) {
84 if (array[index] > CAP_VALUE)
85 array[index] = CAP_VALUE;
86 else if (array[index] < -CAP_VALUE)
87 array[index] = -CAP_VALUE;
88}
89
90real hardTanh(real x) {
91 if (x >= 1) {
92 return 1;
93 } else if (x <= -1) {
94 return -1;
95 } else {
96 return x;
97 }
98}
99
100real dHardTanh(real x, real g) {
101 if (x > 1 && g > 0) {
102 return 0;
103 }
104 if (x < -1 && g < 0) {
105 return 0;
106 }
107 return 1;
108}
109
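// Builds the global negative-sampling table: each word occupies a share of the
// table_size slots proportional to count^0.75 / sum(count^0.75), so drawing a
// uniform random slot samples words from the smoothed unigram distribution.
// The same smoothed distribution is kept in noise_distribution for NCE.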
110void InitUnigramTable() {
111 int a, i;
112 long long train_words_pow = 0;
113 real d1, power = 0.75;
114 table = (int *) malloc(table_size * sizeof(int));
115 for (a = 0; a < vocab_size; a++)
116 train_words_pow += pow(vocab[a].cn, power);
117 i = 0;
118 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
119 for (a = 0; a < table_size; a++) {
120 table[a] = i;
121 if (a / (real) table_size > d1) {
122 i++;
123 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
124 }
125 if (i >= vocab_size)
126 i = vocab_size - 1;
127 }
128
129 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
130 for (a = 0; a < vocab_size; a++)
131 noise_distribution[a] = pow(vocab[a].cn, power)
132 / (real) train_words_pow;
133}
134
135// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
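// Carriage returns are skipped, a newline is returned as the sentence marker "</s>"
// (and pushed back if it terminates a word), and words longer than MAX_STRING - 1
// characters are truncated.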
136void ReadWord(char *word, FILE *fin) {
137 int a = 0, ch;
138 while (!feof(fin)) {
139 ch = fgetc(fin);
140 if (ch == 13)
141 continue;
142 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
143 if (a > 0) {
144 if (ch == '\n')
145 ungetc(ch, fin);
146 break;
147 }
148 if (ch == '\n') {
149 strcpy(word, (char *) "</s>");
150 return;
151 } else
152 continue;
153 }
154 word[a] = ch;
155 a++;
156 if (a >= MAX_STRING - 1)
157 a--; // Truncate words that are too long
158 }
159 word[a] = 0;
160}
161
162// Returns hash value of a word
163int GetWordHash(char *word) {
164 unsigned long long a, hash = 0;
165 for (a = 0; a < strlen(word); a++)
166 hash = hash * 257 + word[a];
167 hash = hash % vocab_hash_size;
168 return hash;
169}
170
171// Returns position of a word in the vocabulary; if the word is not found, returns -1
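// The vocabulary hash table uses open addressing with linear probing: on a
// collision the search advances to the next slot until the word or an empty
// slot (-1) is found.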
172int SearchVocab(char *word) {
173 unsigned int hash = GetWordHash(word);
174 while (1) {
175 if (vocab_hash[hash] == -1)
176 return -1;
177 if (!strcmp(word, vocab[vocab_hash[hash]].word))
178 return vocab_hash[hash];
179 hash = (hash + 1) % vocab_hash_size;
180 }
181 return -1;
182}
183
184// Reads a word and returns its index in the vocabulary
185int ReadWordIndex(FILE *fin) {
186 char word[MAX_STRING];
187 ReadWord(word, fin);
188 if (feof(fin))
189 return -1;
190 return SearchVocab(word);
191}
192
193// Adds a word to the vocabulary
194int AddWordToVocab(char *word) {
195 unsigned int hash, length = strlen(word) + 1;
196 if (length > MAX_STRING)
197 length = MAX_STRING;
198 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
199 strcpy(vocab[vocab_size].word, word);
200 vocab[vocab_size].cn = 0;
201 vocab_size++;
202 // Reallocate memory if needed
203 if (vocab_size + 2 >= vocab_max_size) {
204 vocab_max_size += 1000;
205 vocab = (struct vocab_word *) realloc(vocab,
206 vocab_max_size * sizeof(struct vocab_word));
207 }
208 hash = GetWordHash(word);
209 while (vocab_hash[hash] != -1)
210 hash = (hash + 1) % vocab_hash_size;
211 vocab_hash[hash] = vocab_size - 1;
212 return vocab_size - 1;
213}
214
215// Used later for sorting by word counts
216int VocabCompare(const void *a, const void *b) {
217 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
218}
219
220// Sorts the vocabulary by frequency using word counts
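// Also discards words occurring fewer than min_count times, rebuilds the hash
// table, and accumulates avgWordLength (the count-weighted average token length
// including one separator character), which ReadVocab later uses to estimate the
// number of tokens in the training file.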
221void SortVocab() {
222 int a, size;
223 unsigned int hash;
224 // Sort the vocabulary and keep </s> at the first position
225 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
226 for (a = 0; a < vocab_hash_size; a++)
227 vocab_hash[a] = -1;
228 size = vocab_size;
229 train_words = 0;
230 for (a = 0; a < size; a++) {
231 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
232 // Words occurring less than min_count times will be discarded from the vocab
233 if ((vocab[a].cn < min_count) && (a != 0)) {
234 vocab_size--;
235 free(vocab[a].word);
236 } else {
237 // Hash will be recomputed, as it is no longer valid after sorting
238 hash = GetWordHash(vocab[a].word);
239 while (vocab_hash[hash] != -1)
240 hash = (hash + 1) % vocab_hash_size;
241 vocab_hash[hash] = a;
242 train_words += vocab[a].cn;
243 }
244 }
245 avgWordLength /= train_words;
246 vocab = (struct vocab_word *) realloc(vocab,
247 (vocab_size + 1) * sizeof(struct vocab_word));
248 // Allocate memory for the binary tree construction
249 for (a = 0; a < vocab_size; a++) {
250 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
251 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
252 }
253}
254
255// Reduces the vocabulary by removing infrequent tokens
256void ReduceVocab() {
257 int a, b = 0;
258 unsigned int hash;
259 for (a = 0; a < vocab_size; a++)
260 if (vocab[a].cn > min_reduce) {
261 vocab[b].cn = vocab[a].cn;
262 vocab[b].word = vocab[a].word;
263 b++;
264 } else
265 free(vocab[a].word);
266 vocab_size = b;
267 for (a = 0; a < vocab_hash_size; a++)
268 vocab_hash[a] = -1;
269 for (a = 0; a < vocab_size; a++) {
270 // Hash will be recomputed, as it is no longer valid
271 hash = GetWordHash(vocab[a].word);
272 while (vocab_hash[hash] != -1)
273 hash = (hash + 1) % vocab_hash_size;
274 vocab_hash[hash] = a;
275 }
276 fflush(stdout);
277 min_reduce++;
278}
279
280// Create binary Huffman tree using the word counts
281// Frequent words will have short, unique binary codes
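// The tree is built with the classic two-pointer merge over the count-sorted
// vocabulary: pos1 walks the word counts from smallest to largest, pos2 walks the
// newly created internal nodes, and at every step the two smallest available nodes
// are merged. vocab[a].code stores the left/right decisions and vocab[a].point the
// internal-node indices on the path from the root.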
282void CreateBinaryTree() {
283 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
284 char code[MAX_CODE_LENGTH];
285 long long *count = (long long *) calloc(vocab_size * 2 + 1,
286 sizeof(long long));
287 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
288 sizeof(long long));
289 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
290 sizeof(long long));
291 for (a = 0; a < vocab_size; a++)
292 count[a] = vocab[a].cn;
293 for (a = vocab_size; a < vocab_size * 2; a++)
294 count[a] = 1e15;
295 pos1 = vocab_size - 1;
296 pos2 = vocab_size;
297 // The following algorithm constructs the Huffman tree by adding one node at a time
298 for (a = 0; a < vocab_size - 1; a++) {
299 // First, find two smallest nodes 'min1, min2'
300 if (pos1 >= 0) {
301 if (count[pos1] < count[pos2]) {
302 min1i = pos1;
303 pos1--;
304 } else {
305 min1i = pos2;
306 pos2++;
307 }
308 } else {
309 min1i = pos2;
310 pos2++;
311 }
312 if (pos1 >= 0) {
313 if (count[pos1] < count[pos2]) {
314 min2i = pos1;
315 pos1--;
316 } else {
317 min2i = pos2;
318 pos2++;
319 }
320 } else {
321 min2i = pos2;
322 pos2++;
323 }
324 count[vocab_size + a] = count[min1i] + count[min2i];
325 parent_node[min1i] = vocab_size + a;
326 parent_node[min2i] = vocab_size + a;
327 binary[min2i] = 1;
328 }
329 // Now assign binary code to each vocabulary word
330 for (a = 0; a < vocab_size; a++) {
331 b = a;
332 i = 0;
333 while (1) {
334 code[i] = binary[b];
335 point[i] = b;
336 i++;
337 b = parent_node[b];
338 if (b == vocab_size * 2 - 2)
339 break;
340 }
341 vocab[a].codelen = i;
342 vocab[a].point[0] = vocab_size - 2;
343 for (b = 0; b < i; b++) {
344 vocab[a].code[i - b - 1] = code[b];
345 vocab[a].point[i - b] = point[b] - vocab_size;
346 }
347 }
348 free(count);
349 free(binary);
350 free(parent_node);
351}
352
353void LearnVocabFromTrainFile() {
354 char word[MAX_STRING];
355 FILE *fin;
356 long long a, i;
357 for (a = 0; a < vocab_hash_size; a++)
358 vocab_hash[a] = -1;
359 fin = fopen(train_file, "rb");
360 if (fin == NULL) {
361 printf("ERROR: training data file not found!\n");
362 exit(1);
363 }
364 vocab_size = 0;
365 AddWordToVocab((char *) "</s>");
366 while (1) {
367 ReadWord(word, fin);
368 if (feof(fin))
369 break;
370 train_words++;
371 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
372 printf("%lldK%c", train_words / 1000, 13);
373 fflush(stdout);
374 }
375 i = SearchVocab(word);
376 if (i == -1) {
377 a = AddWordToVocab(word);
378 vocab[a].cn = 1;
379 } else
380 vocab[i].cn++;
381 if (vocab_size > vocab_hash_size * 0.7)
382 ReduceVocab();
383 }
384 SortVocab();
385 if (debug_mode > 0) {
386 printf("Vocab size: %lld\n", vocab_size);
387 printf("Words in train file: %lld\n", train_words);
388 }
389 file_size = ftell(fin);
390 fclose(fin);
391}
392
393void SaveVocab() {
394 long long i;
395 FILE *fo = fopen(save_vocab_file, "wb");
396 for (i = 0; i < vocab_size; i++)
397 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
398 fclose(fo);
399}
400
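// Reads a precomputed "word count" list written by SaveVocab. Because those counts
// refer to the corpus the vocabulary was built from, the number of words in the
// current training file is afterwards estimated as file_size / avgWordLength.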
401void ReadVocab() {
402 long long a, i = 0;
403 char c;
404 char word[MAX_STRING];
405 FILE *fin = fopen(read_vocab_file, "rb");
406 if (fin == NULL) {
407 printf("Vocabulary file not found\n");
408 exit(1);
409 }
410 for (a = 0; a < vocab_hash_size; a++)
411 vocab_hash[a] = -1;
412 vocab_size = 0;
413 while (1) {
414 ReadWord(word, fin);
415 if (feof(fin))
416 break;
417 a = AddWordToVocab(word);
418 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
419 i++;
420 }
421 fclose(fin);
422 fin = fopen(train_file, "rb");
423 if (fin == NULL) {
424 printf("ERROR: training data file not found!\n");
425 exit(1);
426 }
427 fseek(fin, 0, SEEK_END);
428 file_size = ftell(fin);
429 fclose(fin);
430 SortVocab();
431 if (debug_mode > 0) {
432 printf("Vocab size: %lld\n", vocab_size);
433 if(*read_vocab_file) {
434 printf("Words in vocab's train file: %lld\n", train_words);
435 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
436 } else {
437 printf("Words in train file: %lld\n", train_words);
438 }
439 }
440 if(*read_vocab_file) {
441 train_words = file_size / avgWordLength;
442 if(debug_mode > 0)
443 printf("Estimated words in train file: %lld\n", train_words);
444 }
445}
446
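// Reads negative_classes_file and builds one negative-sampling table per class, so
// that negative samples for a word can be drawn from that word's own class. Each
// record appears to consist of a class label, a word, and a third field that is
// read but ignored (presumably a count); this format is inferred from the parsing
// code below.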
447void InitClassUnigramTable() {
448 long long a, c;
449 printf("loading class unigrams \n");
450 FILE *fin = fopen(negative_classes_file, "rb");
451 if (fin == NULL) {
452 printf("ERROR: class file not found!\n");
453 exit(1);
454 }
455 word_to_group = (int *) malloc(vocab_size * sizeof(int));
456 for (a = 0; a < vocab_size; a++)
457 word_to_group[a] = -1;
458 char class[MAX_STRING];
459 char prev_class[MAX_STRING];
460 prev_class[0] = 0;
461 char word[MAX_STRING];
462 class_number = -1;
463 while (1) {
464 if (feof(fin))
465 break;
466 ReadWord(class, fin);
467 ReadWord(word, fin);
468 int word_index = SearchVocab(word);
469 if (word_index != -1) {
470 if (strcmp(class, prev_class) != 0) {
471 class_number++;
472 strcpy(prev_class, class);
473 }
474 word_to_group[word_index] = class_number;
475 }
476 ReadWord(word, fin);
477 }
478 class_number++;
479 fclose(fin);
480
481 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
482 long long train_words_pow = 0;
483 real d1, power = 0.75;
484
485 for (c = 0; c < class_number; c++) {
486 long long offset = c * table_size;
487 train_words_pow = 0;
488 for (a = 0; a < vocab_size; a++)
489 if (word_to_group[a] == c)
490 train_words_pow += pow(vocab[a].cn, power);
491 int i = 0;
492 while (word_to_group[i] != c && i < vocab_size)
493 i++;
494 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
495 for (a = 0; a < table_size; a++) {
496 //printf("index %lld , word %d\n", a, i);
497 group_to_table[offset + a] = i;
498 if (a / (real) table_size > d1) {
499 i++;
500 while (word_to_group[i] != c && i < vocab_size)
501 i++;
502 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
503 }
504 if (i >= vocab_size)
505 while (word_to_group[i] != c && i >= 0)
506 i--;
507 }
508 }
509}
510
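// Writes the command-line arguments of the run to "<output_file>.args" so the
// training configuration can be looked up later.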
511void SaveArgs(int argc, char **argv) {
512 unsigned int i;
513 size_t len = 0;
514 char *_all_args, *all_args;
515 char *args_file = malloc(strlen(output_file) + strlen(".args") + 1);
516 sprintf(args_file, "%s.args", output_file);
517 FILE *fargs = fopen(args_file, "w");
518 if (fargs == NULL) {
519 printf("Cannot save args to %s.\n", args_file);
520 return;
521 }
522
523 for(i=1; i<argc; i++) {
524 len += strlen(argv[i]);
525 }
526
527 _all_args = all_args = (char *)malloc(len+argc-1);
528
529 for(i=1; i<argc; i++) {
530 memcpy(_all_args, argv[i], strlen(argv[i]));
531 _all_args += strlen(argv[i])+1;
532 *(_all_args-1) = ' ';
533 }
534 *(_all_args-1) = 0;
535
536 fprintf(fargs, "%s\n", all_args);
537 fclose(fargs);
538
539 free(all_args);
540
541 return;
542}
543
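// Dumps the raw network parameters (syn0 followed by syn1neg_window) so they can
// be reloaded later via read_net_file; only supported for type 3 (structured
// skip-gram) with negative sampling.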
544void SaveNet() {
545 if(type != 3 || negative <= 0) {
546 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
547 return;
548 }
549
550 FILE *fnet = fopen(save_net_file, "wb");
551 if (fnet == NULL) {
552 printf("Cannot open net parameter file for writing\n");
553 exit(1);
554 }
555 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
556 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
557 fclose(fnet);
558}
559
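// Allocates and initializes the network. syn0 holds one layer1_size-dimensional
// input vector per vocabulary word. The output-side matrices depend on the
// architecture; window_layer_size = layer1_size * window * 2 because the
// window-based models keep a separate output vector for each of the 2 * window
// context positions. syn0 is filled with small random values unless parameters
// are read back from read_net_file (type 3 with negative sampling only).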
560void InitNet() {
561 long long a, b;
562 unsigned long long next_random = 1;
563 long long read;
564
565 window_layer_size = layer1_size * window * 2;
566 a = posix_memalign((void **) &syn0, 128,
567 (long long) vocab_size * layer1_size * sizeof(real));
568 if (syn0 == NULL) {
569 printf("Memory allocation failed\n");
570 exit(1);
571 }
572
573 if (hs) {
574 a = posix_memalign((void **) &syn1, 128,
575 (long long) vocab_size * layer1_size * sizeof(real));
576 if (syn1 == NULL) {
577 printf("Memory allocation failed\n");
578 exit(1);
579 }
580 a = posix_memalign((void **) &syn1_window, 128,
581 (long long) vocab_size * window_layer_size * sizeof(real));
582 if (syn1_window == NULL) {
583 printf("Memory allocation failed\n");
584 exit(1);
585 }
586 a = posix_memalign((void **) &syn_hidden_word, 128,
587 (long long) vocab_size * window_hidden_size * sizeof(real));
588 if (syn_hidden_word == NULL) {
589 printf("Memory allocation failed\n");
590 exit(1);
591 }
592
593 for (a = 0; a < vocab_size; a++)
594 for (b = 0; b < layer1_size; b++)
595 syn1[a * layer1_size + b] = 0;
596 for (a = 0; a < vocab_size; a++)
597 for (b = 0; b < window_layer_size; b++)
598 syn1_window[a * window_layer_size + b] = 0;
599 for (a = 0; a < vocab_size; a++)
600 for (b = 0; b < window_hidden_size; b++)
601 syn_hidden_word[a * window_hidden_size + b] = 0;
602 }
603 if (negative > 0) {
604 if(type == 0) {
605 a = posix_memalign((void **) &syn1neg, 128,
606 (long long) vocab_size * layer1_size * sizeof(real));
607 if (syn1neg == NULL) {
608 printf("Memory allocation failed\n");
609 exit(1);
610 }
611 for (a = 0; a < vocab_size; a++)
612 for (b = 0; b < layer1_size; b++)
613 syn1neg[a * layer1_size + b] = 0;
614 } else if (type == 3) {
615 a = posix_memalign((void **) &syn1neg_window, 128,
616 (long long) vocab_size * window_layer_size * sizeof(real));
617 if (syn1neg_window == NULL) {
618 printf("Memory allocation failed\n");
619 exit(1);
620 }
621 for (a = 0; a < vocab_size; a++)
622 for (b = 0; b < window_layer_size; b++)
623 syn1neg_window[a * window_layer_size + b] = 0;
624 } else if (type == 4) {
625 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
626 (long long) vocab_size * window_hidden_size * sizeof(real));
627 if (syn_hidden_word_neg == NULL) {
628 printf("Memory allocation failed\n");
629 exit(1);
630 }
631 for (a = 0; a < vocab_size; a++)
632 for (b = 0; b < window_hidden_size; b++)
633 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
634 }
635 }
636 if (nce > 0) {
637 a = posix_memalign((void **) &syn1nce, 128,
638 (long long) vocab_size * layer1_size * sizeof(real));
639 if (syn1nce == NULL) {
640 printf("Memory allocation failed\n");
641 exit(1);
642 }
643 a = posix_memalign((void **) &syn1nce_window, 128,
644 (long long) vocab_size * window_layer_size * sizeof(real));
645 if (syn1nce_window == NULL) {
646 printf("Memory allocation failed\n");
647 exit(1);
648 }
649 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
650 (long long) vocab_size * window_hidden_size * sizeof(real));
651 if (syn_hidden_word_nce == NULL) {
652 printf("Memory allocation failed\n");
653 exit(1);
654 }
655
656 for (a = 0; a < vocab_size; a++)
657 for (b = 0; b < layer1_size; b++)
658 syn1nce[a * layer1_size + b] = 0;
659 for (a = 0; a < vocab_size; a++)
660 for (b = 0; b < window_layer_size; b++)
661 syn1nce_window[a * window_layer_size + b] = 0;
662 for (a = 0; a < vocab_size; a++)
663 for (b = 0; b < window_hidden_size; b++)
664 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
665 }
666
667 if(type == 4) {
668 a = posix_memalign((void **) &syn_window_hidden, 128,
669 window_hidden_size * window_layer_size * sizeof(real));
670 if (syn_window_hidden == NULL) {
671 printf("Memory allocation failed\n");
672 exit(1);
673 }
674 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
675 next_random = next_random * (unsigned long long) 25214903917 + 11;
676 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
677 - 0.5) / (window_hidden_size * window_layer_size);
678 }
679 }
680
681 if (read_net_file[0] == 0) {
682 for (a = 0; a < vocab_size; a++)
683 for (b = 0; b < layer1_size; b++) {
684 next_random = next_random * (unsigned long long) 25214903917
685 + 11;
686 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
687 / (real) 65536) - 0.5) / layer1_size;
688 }
689 } else if(type == 3 && negative > 0) {
690 FILE *fnet = fopen(read_net_file, "rb");
691 if (fnet == NULL) {
692 printf("Net parameter file not found\n");
693 exit(1);
694 }
695 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
696 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
697 if(read != vocab_size * layer1_size) {
698 fprintf(stderr, "read-net failed %lld\n", read);
699 exit(-1);
700 }
701 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
702 if(read != (long long) vocab_size * window_layer_size) {
703 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
704 (long long) sizeof(real) * vocab_size * window_layer_size);
705 exit(-1);
706 }
707 fgetc(fnet);
708 if(!feof(fnet)) {
709 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
710 exit(-1);
711 }
712 fclose(fnet);
713 } else {
714 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
715 exit(-1);
716 }
717
718 CreateBinaryTree();
719}
720
721char *currentDateTime(char *buf, real offset) {
722 time_t t;
723 time(&t);
724 t += (long) offset;
725 struct tm tstruct;
726 tstruct = *localtime(&t);
727 strftime(buf, 80, "%c", &tstruct);
728 return buf;
729}
730
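// Progress monitor: once per second it sums, over all live training threads, the
// bytes consumed in completed and current passes, and prints the learning rate,
// the portion done, throughput, elapsed time, estimated time to go and an ETA.
// It terminates once every thread has marked itself finished (threadPos == -1).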
731void *MonitorThread(void *id) {
732 char *timebuf = malloc(80);
733 int i, n=num_threads;
734 long long sum;
735 sleep(1);
736 while(n > 0) {
737 sleep(1);
738 sum = n = 0;
739 for(i=0; i < num_threads; i++) {
740 if(threadPos[i] >= 0) {
741 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
742 n++;
743 } else {
744 sum += iter * file_size / num_threads;
745 }
746 }
747 if(n == 0)
748 break;
749 real finished_portion = (real) sum / (float) (file_size * iter);
750 long long now = clock();
751 long long elapsed = (now - start) / CLOCKS_PER_SEC / num_threads;
752 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed) * ((real) num_threads / n) ;
753
754 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/t/s TE: %llds TTG: %llds ETA: %s\033[K",
755 alpha,
756 finished_portion * 100,
757 (float) sum / elapsed / num_threads / 1000,
758 elapsed,
759 ttg,
760 currentDateTime(timebuf, ttg)
761 );
762 fflush(stdout);
763 }
764 pthread_exit(NULL);
765}
766
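// One training worker. Each thread processes its own slice [start_pos, end_pos) of
// the training file for iter passes, decaying alpha linearly with the global word
// count. The architecture is selected by type:
//   0 = CBOW, 1 = skip-gram, 2 = continuous window (cwindow),
//   3 = structured skip-gram (position-dependent output vectors),
//   4 = cwindow with a non-linear hidden layer ("senna").
// Each of them can be combined with hierarchical softmax (hs), negative sampling
// (negative) and/or noise-contrastive estimation (nce).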
767void *TrainModelThread(void *id) {
768 long long a, b, d, cw, word, last_word, sentence_length = 0,
769 sentence_position = 0;
770 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
771 long long l1, l2, c, target, label, local_iter = iter;
772 unsigned long long next_random = (long long) id;
773 real f, g;
774 int input_len_1 = layer1_size;
775 int window_offset = -1;
776 if (type == 2 || type == 4) {
777 input_len_1 = window_layer_size;
778 }
779 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
780 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
781 threadIters[(long) id] = iter;
782
783 int input_len_2 = 0;
784 if (type == 4) {
785 input_len_2 = window_hidden_size;
786 }
787 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
788 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
789
790 FILE *fi = fopen(train_file, "rb");
791 long long start_pos = file_size / (long long) num_threads * (long long) id;
792 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) - 1;
793 long long current_pos = start_pos;
794 long long last_pos = start_pos;
795 fseek(fi, start_pos, SEEK_SET);
796 while (1) {
797 if (current_pos - last_pos > 100000) {
798 word_count_actual += word_count - last_word_count;
799 last_pos = current_pos;
800 last_word_count = word_count;
801 alpha = starting_alpha
802 * (1 - word_count_actual / (real) (iter * train_words + 1));
803 if (alpha < starting_alpha * 0.0001)
804 alpha = starting_alpha * 0.0001;
805 }
806 if (sentence_length == 0) {
807 while (1) {
808 word = ReadWordIndex(fi);
809 if (feof(fi))
810 break;
811 if (word == -1)
812 continue;
813 word_count++;
814 if (word == 0)
815 break;
816 // The subsampling randomly discards frequent words while keeping the ranking the same
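				// With f = cn / train_words and t = sample, the keep probability
				// implemented below is sqrt(t / f) + t / f (capped at 1). For type 3 a
				// discarded word is replaced by the marker -2 so that window positions
				// stay aligned.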
817 if (sample > 0) {
818 real ran = (sqrt(vocab[word].cn / (sample * train_words))
819 + 1) * (sample * train_words) / vocab[word].cn;
820 next_random = next_random * (unsigned long long) 25214903917
821 + 11;
822 if (ran < (next_random & 0xFFFF) / (real) 65536) {
823 if(type == 3) // in structured skipgrams
824 word = -2; // keep the window position correct
825 else
826 continue;
827 }
828 }
829 sen[sentence_length] = word;
830 sentence_length++;
831 if (sentence_length >= MAX_SENTENCE_LENGTH)
832 break;
833 }
834 sentence_position = 0;
835 }
836 current_pos = threadPos[(long) id] = ftell(fi);
837 if (feof(fi) || current_pos >= end_pos) {
838 word_count_actual += word_count - last_word_count;
839 threadIters[(long) id]--;
840 local_iter--;
841 if (local_iter == 0)
842 break;
843 word_count = 0;
844 current_pos = last_pos = start_pos;
845 last_word_count = 0;
846 sentence_length = 0;
847 fseek(fi, start_pos, SEEK_SET);
848 continue;
849 }
850 word = sen[sentence_position];
851 while (word == -2 && sentence_position < sentence_length)
852 word = sen[++sentence_position];
853 if (sentence_position >= sentence_length) {
854 sentence_length = 0;
855 continue;
856 }
857 if (word < 0)
858 continue;
859 for (c = 0; c < input_len_1; c++)
860 neu1[c] = 0;
861 for (c = 0; c < input_len_1; c++)
862 neu1e[c] = 0;
863 for (c = 0; c < input_len_2; c++)
864 neu2[c] = 0;
865 for (c = 0; c < input_len_2; c++)
866 neu2e[c] = 0;
867 next_random = next_random * (unsigned long long) 25214903917 + 11;
868 b = next_random % window;
869 if (type == 0) { //train the cbow architecture
870 // in -> hidden
871 cw = 0;
872 for (a = b; a < window * 2 + 1 - b; a++)
873 if (a != window) {
874 c = sentence_position - window + a;
875 if (c < 0)
876 continue;
877 if (c >= sentence_length)
878 continue;
879 last_word = sen[c];
880 if (last_word == -1)
881 continue;
882 for (c = 0; c < layer1_size; c++)
883 neu1[c] += syn0[c + last_word * layer1_size];
884 cw++;
885 }
886 if (cw) {
887 for (c = 0; c < layer1_size; c++)
888 neu1[c] /= cw;
889 if (hs)
890 for (d = 0; d < vocab[word].codelen; d++) {
891 f = 0;
892 l2 = vocab[word].point[d] * layer1_size;
893 // Propagate hidden -> output
894 for (c = 0; c < layer1_size; c++)
895 f += neu1[c] * syn1[c + l2];
896 if (f <= -MAX_EXP)
897 continue;
898 else if (f >= MAX_EXP)
899 continue;
900 else
901 f = expTable[(int) ((f + MAX_EXP)
902 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
903 // 'g' is the gradient multiplied by the learning rate
904 g = (1 - vocab[word].code[d] - f) * alpha;
905 // Propagate errors output -> hidden
906 for (c = 0; c < layer1_size; c++)
907 neu1e[c] += g * syn1[c + l2];
908 // Learn weights hidden -> output
909 for (c = 0; c < layer1_size; c++)
910 syn1[c + l2] += g * neu1[c];
911 if (cap == 1)
912 for (c = 0; c < layer1_size; c++)
913 capParam(syn1, c + l2);
914 }
915 // NEGATIVE SAMPLING
916 if (negative > 0)
917 for (d = 0; d < negative + 1; d++) {
918 if (d == 0) {
919 target = word;
920 label = 1;
921 } else {
922 next_random = next_random
923 * (unsigned long long) 25214903917 + 11;
924 if (word_to_group != NULL
925 && word_to_group[word] != -1) {
926 target = word;
927 while (target == word) {
928 target = group_to_table[word_to_group[word]
929 * table_size
930 + (next_random >> 16) % table_size];
931 next_random = next_random
932 * (unsigned long long) 25214903917
933 + 11;
934 }
935 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
936 } else {
937 target =
938 table[(next_random >> 16) % table_size];
939 }
940 if (target == 0)
941 target = next_random % (vocab_size - 1) + 1;
942 if (target == word)
943 continue;
944 label = 0;
945 }
946 l2 = target * layer1_size;
947 f = 0;
948 for (c = 0; c < layer1_size; c++)
949 f += neu1[c] * syn1neg[c + l2];
950 if (f > MAX_EXP)
951 g = (label - 1) * alpha;
952 else if (f < -MAX_EXP)
953 g = (label - 0) * alpha;
954 else
955 g = (label
956 - expTable[(int) ((f + MAX_EXP)
957 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
958 * alpha;
959 for (c = 0; c < layer1_size; c++)
960 neu1e[c] += g * syn1neg[c + l2];
961 for (c = 0; c < layer1_size; c++)
962 syn1neg[c + l2] += g * neu1[c];
963 if (cap == 1)
964 for (c = 0; c < layer1_size; c++)
965 capParam(syn1neg, c + l2);
966 }
967 // Noise Contrastive Estimation
968 if (nce > 0)
969 for (d = 0; d < nce + 1; d++) {
970 if (d == 0) {
971 target = word;
972 label = 1;
973 } else {
974 next_random = next_random
975 * (unsigned long long) 25214903917 + 11;
976 if (word_to_group != NULL
977 && word_to_group[word] != -1) {
978 target = word;
979 while (target == word) {
980 target = group_to_table[word_to_group[word]
981 * table_size
982 + (next_random >> 16) % table_size];
983 next_random = next_random
984 * (unsigned long long) 25214903917
985 + 11;
986 }
987 } else {
988 target =
989 table[(next_random >> 16) % table_size];
990 }
991 if (target == 0)
992 target = next_random % (vocab_size - 1) + 1;
993 if (target == word)
994 continue;
995 label = 0;
996 }
997 l2 = target * layer1_size;
998 f = 0;
999
1000 for (c = 0; c < layer1_size; c++)
1001 f += neu1[c] * syn1nce[c + l2];
1002 if (f > MAX_EXP)
1003 g = (label - 1) * alpha;
1004 else if (f < -MAX_EXP)
1005 g = (label - 0) * alpha;
1006 else {
1007 f = exp(f);
1008 g =
1009 (label
1010 - f
1011 / (noise_distribution[target]
1012 * nce + f)) * alpha;
1013 }
1014 for (c = 0; c < layer1_size; c++)
1015 neu1e[c] += g * syn1nce[c + l2];
1016 for (c = 0; c < layer1_size; c++)
1017 syn1nce[c + l2] += g * neu1[c];
1018 if (cap == 1)
1019 for (c = 0; c < layer1_size; c++)
1020 capParam(syn1nce, c + l2);
1021 }
1022 // hidden -> in
1023 for (a = b; a < window * 2 + 1 - b; a++)
1024 if (a != window) {
1025 c = sentence_position - window + a;
1026 if (c < 0)
1027 continue;
1028 if (c >= sentence_length)
1029 continue;
1030 last_word = sen[c];
1031 if (last_word == -1)
1032 continue;
1033 for (c = 0; c < layer1_size; c++)
1034 syn0[c + last_word * layer1_size] += neu1e[c];
1035 }
1036 }
1037 } else if (type == 1) { //train skip-gram
1038 for (a = b; a < window * 2 + 1 - b; a++)
1039 if (a != window) {
1040 c = sentence_position - window + a;
1041 if (c < 0)
1042 continue;
1043 if (c >= sentence_length)
1044 continue;
1045 last_word = sen[c];
1046 if (last_word == -1)
1047 continue;
1048 l1 = last_word * layer1_size;
1049 for (c = 0; c < layer1_size; c++)
1050 neu1e[c] = 0;
1051 // HIERARCHICAL SOFTMAX
1052 if (hs)
1053 for (d = 0; d < vocab[word].codelen; d++) {
1054 f = 0;
1055 l2 = vocab[word].point[d] * layer1_size;
1056 // Propagate hidden -> output
1057 for (c = 0; c < layer1_size; c++)
1058 f += syn0[c + l1] * syn1[c + l2];
1059 if (f <= -MAX_EXP)
1060 continue;
1061 else if (f >= MAX_EXP)
1062 continue;
1063 else
1064 f = expTable[(int) ((f + MAX_EXP)
1065 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1066 // 'g' is the gradient multiplied by the learning rate
1067 g = (1 - vocab[word].code[d] - f) * alpha;
1068 // Propagate errors output -> hidden
1069 for (c = 0; c < layer1_size; c++)
1070 neu1e[c] += g * syn1[c + l2];
1071 // Learn weights hidden -> output
1072 for (c = 0; c < layer1_size; c++)
1073 syn1[c + l2] += g * syn0[c + l1];
1074 if (cap == 1)
1075 for (c = 0; c < layer1_size; c++)
1076 capParam(syn1, c + l2);
1077 }
1078 // NEGATIVE SAMPLING
1079 if (negative > 0)
1080 for (d = 0; d < negative + 1; d++) {
1081 if (d == 0) {
1082 target = word;
1083 label = 1;
1084 } else {
1085 next_random = next_random
1086 * (unsigned long long) 25214903917 + 11;
1087 if (word_to_group != NULL
1088 && word_to_group[word] != -1) {
1089 target = word;
1090 while (target == word) {
1091 target =
1092 group_to_table[word_to_group[word]
1093 * table_size
1094 + (next_random >> 16)
1095 % table_size];
1096 next_random =
1097 next_random
1098 * (unsigned long long) 25214903917
1099 + 11;
1100 }
1101 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1102 } else {
1103 target = table[(next_random >> 16)
1104 % table_size];
1105 }
1106 if (target == 0)
1107 target = next_random % (vocab_size - 1) + 1;
1108 if (target == word)
1109 continue;
1110 label = 0;
1111 }
1112 l2 = target * layer1_size;
1113 f = 0;
1114 for (c = 0; c < layer1_size; c++)
1115 f += syn0[c + l1] * syn1neg[c + l2];
1116 if (f > MAX_EXP)
1117 g = (label - 1) * alpha;
1118 else if (f < -MAX_EXP)
1119 g = (label - 0) * alpha;
1120 else
1121 g =
1122 (label
1123 - expTable[(int) ((f + MAX_EXP)
1124 * (EXP_TABLE_SIZE
1125 / MAX_EXP / 2))])
1126 * alpha;
1127 for (c = 0; c < layer1_size; c++)
1128 neu1e[c] += g * syn1neg[c + l2];
1129 for (c = 0; c < layer1_size; c++)
1130 syn1neg[c + l2] += g * syn0[c + l1];
1131 if (cap == 1)
1132 for (c = 0; c < layer1_size; c++)
1133 capParam(syn1neg, c + l2);
1134 }
1135 //Noise Contrastive Estimation
1136 if (nce > 0)
1137 for (d = 0; d < nce + 1; d++) {
1138 if (d == 0) {
1139 target = word;
1140 label = 1;
1141 } else {
1142 next_random = next_random
1143 * (unsigned long long) 25214903917 + 11;
1144 if (word_to_group != NULL
1145 && word_to_group[word] != -1) {
1146 target = word;
1147 while (target == word) {
1148 target =
1149 group_to_table[word_to_group[word]
1150 * table_size
1151 + (next_random >> 16)
1152 % table_size];
1153 next_random =
1154 next_random
1155 * (unsigned long long) 25214903917
1156 + 11;
1157 }
1158 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1159 } else {
1160 target = table[(next_random >> 16)
1161 % table_size];
1162 }
1163 if (target == 0)
1164 target = next_random % (vocab_size - 1) + 1;
1165 if (target == word)
1166 continue;
1167 label = 0;
1168 }
1169 l2 = target * layer1_size;
1170 f = 0;
1171 for (c = 0; c < layer1_size; c++)
1172 f += syn0[c + l1] * syn1nce[c + l2];
1173 if (f > MAX_EXP)
1174 g = (label - 1) * alpha;
1175 else if (f < -MAX_EXP)
1176 g = (label - 0) * alpha;
1177 else {
1178 f = exp(f);
1179 g = (label
1180 - f
1181 / (noise_distribution[target]
1182 * nce + f)) * alpha;
1183 }
1184 for (c = 0; c < layer1_size; c++)
1185 neu1e[c] += g * syn1nce[c + l2];
1186 for (c = 0; c < layer1_size; c++)
1187 syn1nce[c + l2] += g * syn0[c + l1];
1188 if (cap == 1)
1189 for (c = 0; c < layer1_size; c++)
1190 capParam(syn1nce, c + l2);
1191 }
1192 // Learn weights input -> hidden
1193 for (c = 0; c < layer1_size; c++)
1194 syn0[c + l1] += neu1e[c];
1195 }
1196 } else if (type == 2) { //train the cwindow architecture
1197 // in -> hidden
1198 cw = 0;
1199 for (a = 0; a < window * 2 + 1; a++)
1200 if (a != window) {
1201 c = sentence_position - window + a;
1202 if (c < 0)
1203 continue;
1204 if (c >= sentence_length)
1205 continue;
1206 last_word = sen[c];
1207 if (last_word == -1)
1208 continue;
1209 window_offset = a * layer1_size;
1210 if (a > window)
1211 window_offset -= layer1_size;
1212 for (c = 0; c < layer1_size; c++)
1213 neu1[c + window_offset] += syn0[c
1214 + last_word * layer1_size];
1215 cw++;
1216 }
1217 if (cw) {
1218 if (hs)
1219 for (d = 0; d < vocab[word].codelen; d++) {
1220 f = 0;
1221 l2 = vocab[word].point[d] * window_layer_size;
1222 // Propagate hidden -> output
1223 for (c = 0; c < window_layer_size; c++)
1224 f += neu1[c] * syn1_window[c + l2];
1225 if (f <= -MAX_EXP)
1226 continue;
1227 else if (f >= MAX_EXP)
1228 continue;
1229 else
1230 f = expTable[(int) ((f + MAX_EXP)
1231 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1232 // 'g' is the gradient multiplied by the learning rate
1233 g = (1 - vocab[word].code[d] - f) * alpha;
1234 // Propagate errors output -> hidden
1235 for (c = 0; c < window_layer_size; c++)
1236 neu1e[c] += g * syn1_window[c + l2];
1237 // Learn weights hidden -> output
1238 for (c = 0; c < window_layer_size; c++)
1239 syn1_window[c + l2] += g * neu1[c];
1240 if (cap == 1)
1241 for (c = 0; c < window_layer_size; c++)
1242 capParam(syn1_window, c + l2);
1243 }
1244 // NEGATIVE SAMPLING
1245 if (negative > 0)
1246 for (d = 0; d < negative + 1; d++) {
1247 if (d == 0) {
1248 target = word;
1249 label = 1;
1250 } else {
1251 next_random = next_random
1252 * (unsigned long long) 25214903917 + 11;
1253 if (word_to_group != NULL
1254 && word_to_group[word] != -1) {
1255 target = word;
1256 while (target == word) {
1257 target = group_to_table[word_to_group[word]
1258 * table_size
1259 + (next_random >> 16) % table_size];
1260 next_random = next_random
1261 * (unsigned long long) 25214903917
1262 + 11;
1263 }
1264 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1265 } else {
1266 target =
1267 table[(next_random >> 16) % table_size];
1268 }
1269 if (target == 0)
1270 target = next_random % (vocab_size - 1) + 1;
1271 if (target == word)
1272 continue;
1273 label = 0;
1274 }
1275 l2 = target * window_layer_size;
1276 f = 0;
1277 for (c = 0; c < window_layer_size; c++)
1278 f += neu1[c] * syn1neg_window[c + l2];
1279 if (f > MAX_EXP)
1280 g = (label - 1) * alpha;
1281 else if (f < -MAX_EXP)
1282 g = (label - 0) * alpha;
1283 else
1284 g = (label
1285 - expTable[(int) ((f + MAX_EXP)
1286 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1287 * alpha;
1288 for (c = 0; c < window_layer_size; c++)
1289 neu1e[c] += g * syn1neg_window[c + l2];
1290 for (c = 0; c < window_layer_size; c++)
1291 syn1neg_window[c + l2] += g * neu1[c];
1292 if (cap == 1)
1293 for (c = 0; c < window_layer_size; c++)
1294 capParam(syn1neg_window, c + l2);
1295 }
1296 // Noise Contrastive Estimation
1297 if (nce > 0)
1298 for (d = 0; d < nce + 1; d++) {
1299 if (d == 0) {
1300 target = word;
1301 label = 1;
1302 } else {
1303 next_random = next_random
1304 * (unsigned long long) 25214903917 + 11;
1305 if (word_to_group != NULL
1306 && word_to_group[word] != -1) {
1307 target = word;
1308 while (target == word) {
1309 target = group_to_table[word_to_group[word]
1310 * table_size
1311 + (next_random >> 16) % table_size];
1312 next_random = next_random
1313 * (unsigned long long) 25214903917
1314 + 11;
1315 }
1316 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1317 } else {
1318 target =
1319 table[(next_random >> 16) % table_size];
1320 }
1321 if (target == 0)
1322 target = next_random % (vocab_size - 1) + 1;
1323 if (target == word)
1324 continue;
1325 label = 0;
1326 }
1327 l2 = target * window_layer_size;
1328 f = 0;
1329 for (c = 0; c < window_layer_size; c++)
1330 f += neu1[c] * syn1nce_window[c + l2];
1331 if (f > MAX_EXP)
1332 g = (label - 1) * alpha;
1333 else if (f < -MAX_EXP)
1334 g = (label - 0) * alpha;
1335 else {
1336 f = exp(f);
1337 g =
1338 (label
1339 - f
1340 / (noise_distribution[target]
1341 * nce + f)) * alpha;
1342 }
1343 for (c = 0; c < window_layer_size; c++)
1344 neu1e[c] += g * syn1nce_window[c + l2];
1345 for (c = 0; c < window_layer_size; c++)
1346 syn1nce_window[c + l2] += g * neu1[c];
1347 if (cap == 1)
1348 for (c = 0; c < window_layer_size; c++)
1349 capParam(syn1nce_window, c + l2);
1350 }
1351 // hidden -> in
1352 for (a = 0; a < window * 2 + 1; a++)
1353 if (a != window) {
1354 c = sentence_position - window + a;
1355 if (c < 0)
1356 continue;
1357 if (c >= sentence_length)
1358 continue;
1359 last_word = sen[c];
1360 if (last_word == -1)
1361 continue;
1362 window_offset = a * layer1_size;
1363 if (a > window)
1364 window_offset -= layer1_size;
1365 for (c = 0; c < layer1_size; c++)
1366 syn0[c + last_word * layer1_size] += neu1e[c
1367 + window_offset];
1368 }
1369 }
1370 } else if (type == 3) { //train structured skip-gram
1371 for (a = 0; a < window * 2 + 1; a++)
1372 if (a != window) {
1373 c = sentence_position - window + a;
1374 if (c < 0)
1375 continue;
1376 if (c >= sentence_length)
1377 continue;
1378 last_word = sen[c];
1379 if (last_word < 0)
1380 continue;
1381 l1 = last_word * layer1_size;
1382 window_offset = a * layer1_size;
1383 if (a > window)
1384 window_offset -= layer1_size;
1385 for (c = 0; c < layer1_size; c++)
1386 neu1e[c] = 0;
1387 // HIERARCHICAL SOFTMAX
1388 if (hs)
1389 for (d = 0; d < vocab[word].codelen; d++) {
1390 f = 0;
1391 l2 = vocab[word].point[d] * window_layer_size;
1392 // Propagate hidden -> output
1393 for (c = 0; c < layer1_size; c++)
1394 f += syn0[c + l1]
1395 * syn1_window[c + l2 + window_offset];
1396 if (f <= -MAX_EXP)
1397 continue;
1398 else if (f >= MAX_EXP)
1399 continue;
1400 else
1401 f = expTable[(int) ((f + MAX_EXP)
1402 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1403 // 'g' is the gradient multiplied by the learning rate
1404 g = (1 - vocab[word].code[d] - f) * alpha;
1405 // Propagate errors output -> hidden
1406 for (c = 0; c < layer1_size; c++)
1407 neu1e[c] += g
1408 * syn1_window[c + l2 + window_offset];
1409 // Learn weights hidden -> output
1410 for (c = 0; c < layer1_size; c++)
1411 syn1_window[c + l2 + window_offset] += g
1412 * syn0[c + l1];
1413 if (cap == 1)
1414 for (c = 0; c < layer1_size; c++)
1415 capParam(syn1_window, c + l2 + window_offset);
1416 }
1417 // NEGATIVE SAMPLING
1418 if (negative > 0)
1419 for (d = 0; d < negative + 1; d++) {
1420 if (d == 0) {
1421 target = word;
1422 label = 1;
1423 } else {
1424 next_random = next_random
1425 * (unsigned long long) 25214903917 + 11;
1426 if (word_to_group != NULL
1427 && word_to_group[word] != -1) {
1428 target = word;
1429 while (target == word) {
1430 target =
1431 group_to_table[word_to_group[word]
1432 * table_size
1433 + (next_random >> 16)
1434 % table_size];
1435 next_random =
1436 next_random
1437 * (unsigned long long) 25214903917
1438 + 11;
1439 }
1440 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1441 } else {
1442 target = table[(next_random >> 16)
1443 % table_size];
1444 }
1445 if (target == 0)
1446 target = next_random % (vocab_size - 1) + 1;
1447 if (target == word)
1448 continue;
1449 label = 0;
1450 }
1451 l2 = target * window_layer_size;
1452 f = 0;
1453 for (c = 0; c < layer1_size; c++)
1454 f +=
1455 syn0[c + l1]
1456 * syn1neg_window[c + l2
1457 + window_offset];
1458 if (f > MAX_EXP)
1459 g = (label - 1) * alpha;
1460 else if (f < -MAX_EXP)
1461 g = (label - 0) * alpha;
1462 else
1463 g =
1464 (label
1465 - expTable[(int) ((f + MAX_EXP)
1466 * (EXP_TABLE_SIZE
1467 / MAX_EXP / 2))])
1468 * alpha;
1469 if(debug_mode > 2 && ((long long) id) == 0) {
1470 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1471 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1472 }
1473 for (c = 0; c < layer1_size; c++)
1474 neu1e[c] +=
1475 g
1476 * syn1neg_window[c + l2
1477 + window_offset];
1478 for (c = 0; c < layer1_size; c++)
1479 syn1neg_window[c + l2 + window_offset] += g
1480 * syn0[c + l1];
1481 if (cap == 1)
1482 for (c = 0; c < layer1_size; c++)
1483 capParam(syn1neg_window,
1484 c + l2 + window_offset);
1485 }
1486 // Noise Contrastive Estimation
1487 if (nce > 0)
1488 for (d = 0; d < nce + 1; d++) {
1489 if (d == 0) {
1490 target = word;
1491 label = 1;
1492 } else {
1493 next_random = next_random
1494 * (unsigned long long) 25214903917 + 11;
1495 if (word_to_group != NULL
1496 && word_to_group[word] != -1) {
1497 target = word;
1498 while (target == word) {
1499 target =
1500 group_to_table[word_to_group[word]
1501 * table_size
1502 + (next_random >> 16)
1503 % table_size];
1504 next_random =
1505 next_random
1506 * (unsigned long long) 25214903917
1507 + 11;
1508 }
1509 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1510 } else {
1511 target = table[(next_random >> 16)
1512 % table_size];
1513 }
1514 if (target == 0)
1515 target = next_random % (vocab_size - 1) + 1;
1516 if (target == word)
1517 continue;
1518 label = 0;
1519 }
1520 l2 = target * window_layer_size;
1521 f = 0;
1522 for (c = 0; c < layer1_size; c++)
1523 f +=
1524 syn0[c + l1]
1525 * syn1nce_window[c + l2
1526 + window_offset];
1527 if (f > MAX_EXP)
1528 g = (label - 1) * alpha;
1529 else if (f < -MAX_EXP)
1530 g = (label - 0) * alpha;
1531 else {
1532 f = exp(f);
1533 g = (label
1534 - f
1535 / (noise_distribution[target]
1536 * nce + f)) * alpha;
1537 }
1538 for (c = 0; c < layer1_size; c++)
1539 neu1e[c] +=
1540 g
1541 * syn1nce_window[c + l2
1542 + window_offset];
1543 for (c = 0; c < layer1_size; c++)
1544 syn1nce_window[c + l2 + window_offset] += g
1545 * syn0[c + l1];
1546 if (cap == 1)
1547 for (c = 0; c < layer1_size; c++)
1548 capParam(syn1nce_window,
1549 c + l2 + window_offset);
1550 }
1551 // Learn weights input -> hidden
1552 for (c = 0; c < layer1_size; c++) {
1553 syn0[c + l1] += neu1e[c];
1554 if (syn0[c + l1] > 50)
1555 syn0[c + l1] = 50;
1556 if (syn0[c + l1] < -50)
1557 syn0[c + l1] = -50;
1558 }
1559 }
1560 } else if (type == 4) { //training senna
1561 // in -> hidden
1562 cw = 0;
1563 for (a = 0; a < window * 2 + 1; a++)
1564 if (a != window) {
1565 c = sentence_position - window + a;
1566 if (c < 0)
1567 continue;
1568 if (c >= sentence_length)
1569 continue;
1570 last_word = sen[c];
1571 if (last_word == -1)
1572 continue;
1573 window_offset = a * layer1_size;
1574 if (a > window)
1575 window_offset -= layer1_size;
1576 for (c = 0; c < layer1_size; c++)
1577 neu1[c + window_offset] += syn0[c
1578 + last_word * layer1_size];
1579 cw++;
1580 }
1581 if (cw) {
1582 for (a = 0; a < window_hidden_size; a++) {
1583 c = a * window_layer_size;
1584 for (b = 0; b < window_layer_size; b++) {
1585 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1586 }
1587 }
1588 if (hs)
1589 for (d = 0; d < vocab[word].codelen; d++) {
1590 f = 0;
1591 l2 = vocab[word].point[d] * window_hidden_size;
1592 // Propagate hidden -> output
1593 for (c = 0; c < window_hidden_size; c++)
1594 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1595 if (f <= -MAX_EXP)
1596 continue;
1597 else if (f >= MAX_EXP)
1598 continue;
1599 else
1600 f = expTable[(int) ((f + MAX_EXP)
1601 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1602 // 'g' is the gradient multiplied by the learning rate
1603 g = (1 - vocab[word].code[d] - f) * alpha;
1604 // Propagate errors output -> hidden
1605 for (c = 0; c < window_hidden_size; c++)
1606 neu2e[c] += dHardTanh(neu2[c], g) * g
1607 * syn_hidden_word[c + l2];
1608 // Learn weights hidden -> output
1609 for (c = 0; c < window_hidden_size; c++)
1610 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1611 * neu2[c];
1612 }
1613 // NEGATIVE SAMPLING
1614 if (negative > 0)
1615 for (d = 0; d < negative + 1; d++) {
1616 if (d == 0) {
1617 target = word;
1618 label = 1;
1619 } else {
1620 next_random = next_random
1621 * (unsigned long long) 25214903917 + 11;
1622 if (word_to_group != NULL
1623 && word_to_group[word] != -1) {
1624 target = word;
1625 while (target == word) {
1626 target = group_to_table[word_to_group[word]
1627 * table_size
1628 + (next_random >> 16) % table_size];
1629 next_random = next_random
1630 * (unsigned long long) 25214903917
1631 + 11;
1632 }
1633 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1634 } else {
1635 target =
1636 table[(next_random >> 16) % table_size];
1637 }
1638 if (target == 0)
1639 target = next_random % (vocab_size - 1) + 1;
1640 if (target == word)
1641 continue;
1642 label = 0;
1643 }
1644 l2 = target * window_hidden_size;
1645 f = 0;
1646 for (c = 0; c < window_hidden_size; c++)
1647 f += hardTanh(neu2[c])
1648 * syn_hidden_word_neg[c + l2];
1649 if (f > MAX_EXP)
1650 g = (label - 1) * alpha / negative;
1651 else if (f < -MAX_EXP)
1652 g = (label - 0) * alpha / negative;
1653 else
1654 g = (label
1655 - expTable[(int) ((f + MAX_EXP)
1656 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1657 * alpha / negative;
1658 for (c = 0; c < window_hidden_size; c++)
1659 neu2e[c] += dHardTanh(neu2[c], g) * g
1660 * syn_hidden_word_neg[c + l2];
1661 for (c = 0; c < window_hidden_size; c++)
1662 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1663 * g * neu2[c];
1664 }
1665 for (a = 0; a < window_hidden_size; a++)
1666 for (b = 0; b < window_layer_size; b++)
1667 neu1e[b] += neu2e[a]
1668 * syn_window_hidden[a * window_layer_size + b];
1669 for (a = 0; a < window_hidden_size; a++)
1670 for (b = 0; b < window_layer_size; b++)
1671 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1672 * neu1[b];
1673 // hidden -> in
1674 for (a = 0; a < window * 2 + 1; a++)
1675 if (a != window) {
1676 c = sentence_position - window + a;
1677 if (c < 0)
1678 continue;
1679 if (c >= sentence_length)
1680 continue;
1681 last_word = sen[c];
1682 if (last_word == -1)
1683 continue;
1684 window_offset = a * layer1_size;
1685 if (a > window)
1686 window_offset -= layer1_size;
1687 for (c = 0; c < layer1_size; c++)
1688 syn0[c + last_word * layer1_size] += neu1e[c
1689 + window_offset];
1690 }
1691 }
1692 } else {
1693 printf("unknown type %i", type);
1694 exit(0);
1695 }
1696 sentence_position++;
1697 if (sentence_position >= sentence_length) {
1698 sentence_length = 0;
1699 continue;
1700 }
1701 }
1702 fclose(fi);
1703 free(neu1);
1704 free(neu1e);
1705 threadPos[(long) id] = -1;
1706 pthread_exit(NULL);
1707}
1708
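// Prints, for every vocabulary word from index cc onwards, the most probable
// collocate at each window position (tabulated logistic of syn0[word] dot
// syn1neg_window[collocate, position]), an aggregate over all positions
// accumulated in noisy-OR fashion (target_sums), and the N strongest
// word/position pairs. Requires the type 3 negative-sampling weights.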
1709void ShowCollocations() {
1710 long a, b, c, d, e, window_offset, target, max_target = 0, maxmax_target;
1711 real f, max_f, maxmax_f;
1712 real *target_sums, bestf[MAX_CC], worstbest;
1713 long besti[MAX_CC];
1714 int N = 10, bestp[MAX_CC];
1715 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1716
1717 for (d = cc; d < vocab_size; d++) {
1718 for (b = 0; b < vocab_size; b++)
1719 target_sums[b]=0;
1720 for (b = 0; b < N; b++)
1721 bestf[b] = -1;
1722 worstbest = -1;
1723
1724 maxmax_f = -1;
1725 maxmax_target = 0;
1726 for (a = window * 2 + 1; a >= 0; a--) {
1727 if (a != window) {
1728 max_f = -1;
1729 window_offset = a * layer1_size;
1730 if (a > window)
1731 window_offset -= layer1_size;
1732 for(target = 0; target < vocab_size; target ++) {
1733 if(target == d)
1734 continue;
1735 f = 0;
1736 for (c = 0; c < layer1_size; c++)
1737 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1738 if (f < -MAX_EXP)
1739 continue;
1740 else if (f > MAX_EXP)
1741 continue;
1742 else
1743 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1744 if(f > max_f) {
1745 max_f = f;
1746 max_target = target;
1747 }
1748 target_sums[target] += (1 - target_sums[target]) * f;
1749 if(f > worstbest) {
1750 for (b = 0; b < N; b++) {
1751 if (f > bestf[b]) {
1752 for (e = N - 1; e > b; e--) {
1753 bestf[e] = bestf[e - 1];
1754 besti[e] = besti[e - 1];
1755 bestp[e] = bestp[e - 1];
1756 }
1757 bestf[b] = f;
1758 besti[b] = target;
1759 bestp[b] = window - a;
1760 break;
1761 }
1762 }
1763 worstbest = bestf[N-1];
1764 }
1765 }
1766 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1767 if(max_f > maxmax_f) {
1768 maxmax_f = max_f;
1769 maxmax_target = max_target;
1770 }
1771 } else {
1772 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1773 }
1774 }
1775 max_f = -1;
1776 for (b = 0; b < vocab_size; b++) {
1777 if(target_sums[b] > max_f) {
1778 max_f = target_sums[b];
1779 max_target = b;
1780 }
1781 }
1782 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1783 vocab[max_target].word, max_f,
1784 vocab[maxmax_target].word, maxmax_f);
1785 for(b = 0; b < N && bestf[b] > -1; b++)
1786 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1787 printf("\n");
1788 }
1789}
1790
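// Training driver: build or load the vocabulary, initialize the network,
// optionally print collocations (cc > 0), build the unigram tables, run the
// training threads plus the progress monitor, and finally write either the word
// vectors or, when classes > 0, K-means cluster assignments.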
1791void TrainModel() {
1792 long a, b, c, d;
1793 FILE *fo;
1794 pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // last slot holds the monitor thread
1795 threadPos = malloc(num_threads * sizeof(long long));
1796 threadIters = malloc(num_threads * sizeof(int));
1797 char *timebuf = malloc(80);
1798 printf("Starting training using file %s\n", train_file);
1799 starting_alpha = alpha;
1800 if (read_vocab_file[0] != 0)
1801 ReadVocab();
1802 else
1803 LearnVocabFromTrainFile();
1804 if (save_vocab_file[0] != 0)
1805 SaveVocab();
1806 if (output_file[0] == 0)
1807 return;
1808 InitNet();
1809 if(cc > 0)
1810 ShowCollocations();
1811 if (negative > 0 || nce > 0)
1812 InitUnigramTable();
1813 if (negative_classes_file[0] != 0)
1814 InitClassUnigramTable();
1815 start = clock();
1816 for (a = 0; a < num_threads; a++)
1817 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1818 if(debug_mode > 1)
1819 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
1820 for (a = 0; a < num_threads; a++)
1821 pthread_join(pt[a], NULL);
1822 if(debug_mode > 1) {
1823 pthread_join(pt[num_threads], NULL);
1824 clock_t now = clock();
1825 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now-start) / CLOCKS_PER_SEC, (now-start) / CLOCKS_PER_SEC / num_threads); // clock() sums CPU time over all threads: total = user, /num_threads ~ real
1826 printf("Saving vectors to %s ...", output_file);
1827 fflush(stdout);
1828 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001829 fo = fopen(output_file, "wb");
1830 if (classes == 0) {
1831 // Save the word vectors
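		// Vector file format: a header line "vocab_size layer1_size", then one record per word
		// consisting of the word itself followed by layer1_size values, either as raw floats
		// (-binary 1) or as space-separated text.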
1832 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1833 for (a = 0; a < vocab_size; a++) {
1834 fprintf(fo, "%s ", vocab[a].word);
1835 if (binary)
1836 for (b = 0; b < layer1_size; b++)
1837 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1838 else
1839 for (b = 0; b < layer1_size; b++)
1840 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1841 fprintf(fo, "\n");
1842 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001843 if(debug_mode > 1)
1844 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001845 } else {
1846 // Run K-means on the word vectors
1847 int clcn = classes, iter = 10, closeid;
1848 int *centcn = (int *) malloc(classes * sizeof(int));
1849 int *cl = (int *) calloc(vocab_size, sizeof(int));
1850 real closev, x;
1851 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
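		// K-means over the rows of syn0: words start out assigned round-robin; each of the 10 local
		// iterations (1) sums the member vectors of every cluster into cent, (2) averages and
		// L2-normalizes the centroids (centcn starts at 1, so an empty cluster cannot divide by zero),
		// and (3) reassigns every word to the centroid with the largest dot product.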
1852 for (a = 0; a < vocab_size; a++)
1853 cl[a] = a % clcn;
1854 for (a = 0; a < iter; a++) {
1855 for (b = 0; b < clcn * layer1_size; b++)
1856 cent[b] = 0;
1857 for (b = 0; b < clcn; b++)
1858 centcn[b] = 1;
1859 for (c = 0; c < vocab_size; c++) {
1860 for (d = 0; d < layer1_size; d++)
1861 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1862 centcn[cl[c]]++;
1863 }
1864 for (b = 0; b < clcn; b++) {
1865 closev = 0;
1866 for (c = 0; c < layer1_size; c++) {
1867 cent[layer1_size * b + c] /= centcn[b];
1868 closev += cent[layer1_size * b + c]
1869 * cent[layer1_size * b + c];
1870 }
1871 closev = sqrt(closev);
1872 for (c = 0; c < layer1_size; c++)
1873 cent[layer1_size * b + c] /= closev;
1874 }
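			// Reassignment step: since the centroids were normalized to unit length above, the dot
			// product below is the cosine similarity between word vector and centroid.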
1875 for (c = 0; c < vocab_size; c++) {
1876 closev = -10;
1877 closeid = 0;
1878 for (d = 0; d < clcn; d++) {
1879 x = 0;
1880 for (b = 0; b < layer1_size; b++)
1881 x += cent[layer1_size * d + b]
1882 * syn0[c * layer1_size + b];
1883 if (x > closev) {
1884 closev = x;
1885 closeid = d;
1886 }
1887 }
1888 cl[c] = closeid;
1889 }
1890 }
1891 // Save the K-means classes
1892 for (a = 0; a < vocab_size; a++)
1893 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1894 free(centcn);
1895 free(cent);
1896 free(cl);
1897 }
1898 fclose(fo);
1899 if (save_net_file[0] != 0)
1900 SaveNet();
1901}
1902
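// ArgPos: return the index of option <str> in argv so its value can be read from argv[index + 1];
// exits if the option is the last token (its value would be missing), returns -1 if it is absent.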
1903int ArgPos(char *str, int argc, char **argv) {
1904 int a;
1905 for (a = 1; a < argc; a++)
1906 if (!strcmp(str, argv[a])) {
1907 if (a == argc - 1) {
1908 printf("Argument missing for %s\n", str);
1909 exit(1);
1910 }
1911 return a;
1912 }
1913 return -1;
1914}
1915
1916int main(int argc, char **argv) {
1917 int i;
1918 if (argc == 1) {
1919 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1920 printf("Options:\n");
1921 printf("Parameters for training:\n");
1922 printf("\t-train <file>\n");
1923 printf("\t\tUse text data from <file> to train the model\n");
1924 printf("\t-output <file>\n");
1925 printf(
1926 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1927 printf("\t-size <int>\n");
1928 printf("\t\tSet size of word vectors; default is 100\n");
1929 printf("\t-window <int>\n");
1930 printf("\t\tSet max skip length between words; default is 5\n");
1931 printf("\t-sample <float>\n");
1932 printf(
1933 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1934 printf(
1935 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1936 printf("\t-hs <int>\n");
1937 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1938 printf("\t-negative <int>\n");
1939 printf(
1940 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1941 printf("\t-negative-classes <file>\n");
1942 printf("\t\tNegative classes to sample from\n");
1943 printf("\t-nce <int>\n");
1944 printf(
1945 "\t\tNumber of negative examples for noise-contrastive estimation (NCE); default is 0, common values are 3 - 10 (0 = not used)\n");
1946 printf("\t-threads <int>\n");
1947 printf("\t\tUse <int> threads (default 12)\n");
1948 printf("\t-iter <int>\n");
1949 printf("\t\tRun more training iterations (default 5)\n");
1950 printf("\t-min-count <int>\n");
1951 printf(
1952 "\t\tDiscard words that appear fewer than <int> times; default is 5\n");
1953 printf("\t-alpha <float>\n");
1954 printf(
1955 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1956 printf("\t-classes <int>\n");
1957 printf(
1958 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1959 printf("\t-debug <int>\n");
1960 printf(
1961 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1962 printf("\t-binary <int>\n");
1963 printf(
1964 "\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1965 printf("\t-save-vocab <file>\n");
1966 printf("\t\tThe vocabulary will be saved to <file>\n");
1967 printf("\t-read-vocab <file>\n");
1968 printf(
1969 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1970 printf("\t-read-net <file>\n");
1971 printf(
1972 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1973 printf("\t-save-net <file>\n");
1974 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001975 printf("\t-show-cc <int>\n");
1976 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001977 printf("\t-type <int>\n");
1978 printf(
1979 "\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
1980 printf("\t-cap <int>\n");
1981 printf(
1982 "\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1983 printf("\nExamples:\n");
1984 printf(
1985 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1986 return 0;
1987 }
1988 output_file[0] = 0;
1989 save_vocab_file[0] = 0;
1990 read_vocab_file[0] = 0;
1991 save_net_file[0] = 0;
1992 read_net_file[0] = 0;
1993 negative_classes_file[0] = 0;
1994 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1995 layer1_size = atoi(argv[i + 1]);
1996 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1997 strcpy(train_file, argv[i + 1]);
1998 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1999 strcpy(save_vocab_file, argv[i + 1]);
2000 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2001 strcpy(read_vocab_file, argv[i + 1]);
2002 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2003 strcpy(save_net_file, argv[i + 1]);
2004 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2005 strcpy(read_net_file, argv[i + 1]);
2006 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2007 debug_mode = atoi(argv[i + 1]);
2008 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2009 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002010 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2011 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002012 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2013 type = atoi(argv[i + 1]);
2014 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2015 strcpy(output_file, argv[i + 1]);
2016 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2017 window = atoi(argv[i + 1]);
2018 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2019 sample = atof(argv[i + 1]);
2020 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2021 hs = atoi(argv[i + 1]);
2022 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2023 negative = atoi(argv[i + 1]);
2024 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2025 strcpy(negative_classes_file, argv[i + 1]);
2026 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2027 nce = atoi(argv[i + 1]);
2028 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2029 num_threads = atoi(argv[i + 1]);
2030 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2031 iter = atoi(argv[i + 1]);
2032 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2033 min_count = atoi(argv[i + 1]);
2034 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2035 classes = atoi(argv[i + 1]);
2036 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2037 cap = atoi(argv[i + 1]);
2038 if (type == 0 || type == 2 || type == 4)
2039 alpha = 0.05;
2040 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2041 alpha = atof(argv[i + 1]);
2042 vocab = (struct vocab_word *) calloc(vocab_max_size,
2043 sizeof(struct vocab_word));
2044 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2045 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
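	// The +1 slot keeps an index of exactly EXP_TABLE_SIZE (reached when f == MAX_EXP) inside the
	// allocation; the loop below fills indices 0 .. EXP_TABLE_SIZE-1.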
2046 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2047 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2048 expTable[i] = expTable[i] / (expTable[i] + 1); // Turn it into the sigmoid: exp(x) / (exp(x) + 1)
2049 }
Marc Kupietz210b9d52016-04-02 21:48:13 +02002050 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002051 TrainModel();
2052 return 0;
2053}
2054