Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
 21#include <pthread.h>
#include <time.h> // clock(), time_t, localtime(), strftime() are used below
22
23#define MAX_STRING 100
24#define EXP_TABLE_SIZE 1000
25#define MAX_EXP 6
26#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010027#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010028#define MAX_CODE_LENGTH 40
29
30const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
31
32typedef float real; // Precision of float numbers
33
34struct vocab_word {
35 long long cn;
36 int *point;
37 char *word, *code, codelen;
38};
39
40char train_file[MAX_STRING], output_file[MAX_STRING];
41char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
42char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
43struct vocab_word *vocab;
44int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietzc2731b22016-07-14 08:56:14 +020045 num_threads = 12, min_reduce = 1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010046int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020047long long *threadPos;
48int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010049long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
50long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
51 classes = 0;
52real alpha = 0.025, starting_alpha, sample = 1e-3;
53real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020054real avgWordLength=0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010055clock_t start;
56
57real *syn1_window, *syn1neg_window, *syn1nce_window;
58int w_offset, window_layer_size;
59
60int window_hidden_size = 500;
61real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
62 *syn_hidden_word_nce;
63
64int hs = 0, negative = 5;
65const int table_size = 1e8;
66int *table;
67
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010068long cc = 0;
69
Marc Kupietzd6f9c712016-03-16 11:50:56 +010070//contrastive negative sampling
71char negative_classes_file[MAX_STRING];
72int *word_to_group;
73int *group_to_table; //group_size*table_size
74int class_number;
75
76//nce
77real* noise_distribution;
78int nce = 0;
79
80//param caps
81real CAP_VALUE = 50;
82int cap = 0;
83
84void capParam(real* array, int index) {
85 if (array[index] > CAP_VALUE)
86 array[index] = CAP_VALUE;
87 else if (array[index] < -CAP_VALUE)
88 array[index] = -CAP_VALUE;
89}
90
91real hardTanh(real x) {
92 if (x >= 1) {
93 return 1;
94 } else if (x <= -1) {
95 return -1;
96 } else {
97 return x;
98 }
99}
100
101real dHardTanh(real x, real g) {
102 if (x > 1 && g > 0) {
103 return 0;
104 }
105 if (x < -1 && g < 0) {
106 return 0;
107 }
108 return 1;
109}
110
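// The unigram table below implements negative sampling from P(w) proportional
// to count(w)^0.75: each word owns a share of the table_size slots proportional
// to its smoothed count, so drawing a uniformly random slot draws a word from
// that distribution. Illustrative example with hypothetical counts {100, 10, 1}:
// the weights are 100^0.75 : 10^0.75 : 1 ≈ 31.6 : 5.6 : 1, i.e. frequent words
// are sampled noticeably less often than their raw frequency would suggest.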
111void InitUnigramTable() {
112 int a, i;
113 long long train_words_pow = 0;
114 real d1, power = 0.75;
115 table = (int *) malloc(table_size * sizeof(int));
116 for (a = 0; a < vocab_size; a++)
117 train_words_pow += pow(vocab[a].cn, power);
118 i = 0;
119 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
120 for (a = 0; a < table_size; a++) {
121 table[a] = i;
122 if (a / (real) table_size > d1) {
123 i++;
124 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
125 }
126 if (i >= vocab_size)
127 i = vocab_size - 1;
128 }
129
130 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
131 for (a = 0; a < vocab_size; a++)
132 noise_distribution[a] = pow(vocab[a].cn, power)
133 / (real) train_words_pow;
134}
135
136// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
137void ReadWord(char *word, FILE *fin) {
138 int a = 0, ch;
139 while (!feof(fin)) {
140 ch = fgetc(fin);
141 if (ch == 13)
142 continue;
143 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
144 if (a > 0) {
145 if (ch == '\n')
146 ungetc(ch, fin);
147 break;
148 }
149 if (ch == '\n') {
150 strcpy(word, (char *) "</s>");
151 return;
152 } else
153 continue;
154 }
155 word[a] = ch;
156 a++;
157 if (a >= MAX_STRING - 1)
158 a--; // Truncate too long words
159 }
160 word[a] = 0;
161}
162
163// Returns hash value of a word
164int GetWordHash(char *word) {
165 unsigned long long a, hash = 0;
166 for (a = 0; a < strlen(word); a++)
167 hash = hash * 257 + word[a];
168 hash = hash % vocab_hash_size;
169 return hash;
170}
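// GetWordHash() is a simple polynomial rolling hash (hash = hash * 257 + byte,
// modulo vocab_hash_size); collisions are resolved by open addressing with
// linear probing: SearchVocab() and AddWordToVocab() below step through
// vocab_hash until they hit the word or an empty (-1) slot.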
171
172// Returns position of a word in the vocabulary; if the word is not found, returns -1
173int SearchVocab(char *word) {
174 unsigned int hash = GetWordHash(word);
175 while (1) {
176 if (vocab_hash[hash] == -1)
177 return -1;
178 if (!strcmp(word, vocab[vocab_hash[hash]].word))
179 return vocab_hash[hash];
180 hash = (hash + 1) % vocab_hash_size;
181 }
182 return -1;
183}
184
185// Reads a word and returns its index in the vocabulary
186int ReadWordIndex(FILE *fin) {
187 char word[MAX_STRING];
188 ReadWord(word, fin);
189 if (feof(fin))
190 return -1;
191 return SearchVocab(word);
192}
193
194// Adds a word to the vocabulary
195int AddWordToVocab(char *word) {
196 unsigned int hash, length = strlen(word) + 1;
197 if (length > MAX_STRING)
198 length = MAX_STRING;
199 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
200 strcpy(vocab[vocab_size].word, word);
201 vocab[vocab_size].cn = 0;
202 vocab_size++;
203 // Reallocate memory if needed
204 if (vocab_size + 2 >= vocab_max_size) {
205 vocab_max_size += 1000;
206 vocab = (struct vocab_word *) realloc(vocab,
207 vocab_max_size * sizeof(struct vocab_word));
208 }
209 hash = GetWordHash(word);
210 while (vocab_hash[hash] != -1)
211 hash = (hash + 1) % vocab_hash_size;
212 vocab_hash[hash] = vocab_size - 1;
213 return vocab_size - 1;
214}
215
216// Used later for sorting by word counts
217int VocabCompare(const void *a, const void *b) {
218 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
219}
220
221// Sorts the vocabulary by frequency using word counts
222void SortVocab() {
223 int a, size;
224 unsigned int hash;
225 // Sort the vocabulary and keep </s> at the first position
226 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
227 for (a = 0; a < vocab_hash_size; a++)
228 vocab_hash[a] = -1;
229 size = vocab_size;
230 train_words = 0;
231 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200232 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100233 // Words occurring less than min_count times will be discarded from the vocab
234 if ((vocab[a].cn < min_count) && (a != 0)) {
235 vocab_size--;
236 free(vocab[a].word);
237 } else {
 238 // Hash will be re-computed, as it is no longer valid after sorting
239 hash = GetWordHash(vocab[a].word);
240 while (vocab_hash[hash] != -1)
241 hash = (hash + 1) % vocab_hash_size;
242 vocab_hash[hash] = a;
243 train_words += vocab[a].cn;
244 }
245 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200246 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100247 vocab = (struct vocab_word *) realloc(vocab,
248 (vocab_size + 1) * sizeof(struct vocab_word));
249 // Allocate memory for the binary tree construction
250 for (a = 0; a < vocab_size; a++) {
251 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
252 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
253 }
254}
255
256// Reduces the vocabulary by removing infrequent tokens
257void ReduceVocab() {
258 int a, b = 0;
259 unsigned int hash;
260 for (a = 0; a < vocab_size; a++)
261 if (vocab[a].cn > min_reduce) {
262 vocab[b].cn = vocab[a].cn;
263 vocab[b].word = vocab[a].word;
264 b++;
265 } else
266 free(vocab[a].word);
267 vocab_size = b;
268 for (a = 0; a < vocab_hash_size; a++)
269 vocab_hash[a] = -1;
270 for (a = 0; a < vocab_size; a++) {
 271 // Hash will be re-computed, as it is no longer valid
272 hash = GetWordHash(vocab[a].word);
273 while (vocab_hash[hash] != -1)
274 hash = (hash + 1) % vocab_hash_size;
275 vocab_hash[hash] = a;
276 }
277 fflush(stdout);
278 min_reduce++;
279}
280
281// Create binary Huffman tree using the word counts
 282// Frequent words will have short unique binary codes
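// Sketch of the construction: count[] holds the leaf counts in descending order
// followed by the internal-node counts created so far; pos1 walks left over the
// leaves and pos2 walks right over the internal nodes, so the two smallest
// remaining nodes can be found without a priority queue. For example, with
// counts {5, 2, 1, 1} the two rare words are merged first (1+1=2), then {2, 2},
// then {5, 4}: the most frequent word ends up with a 1-bit code and the two
// rarest words with 3-bit codes.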
283void CreateBinaryTree() {
284 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
285 char code[MAX_CODE_LENGTH];
286 long long *count = (long long *) calloc(vocab_size * 2 + 1,
287 sizeof(long long));
288 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
289 sizeof(long long));
290 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
291 sizeof(long long));
292 for (a = 0; a < vocab_size; a++)
293 count[a] = vocab[a].cn;
294 for (a = vocab_size; a < vocab_size * 2; a++)
295 count[a] = 1e15;
296 pos1 = vocab_size - 1;
297 pos2 = vocab_size;
 298 // The following algorithm constructs the Huffman tree by adding one node at a time
299 for (a = 0; a < vocab_size - 1; a++) {
300 // First, find two smallest nodes 'min1, min2'
301 if (pos1 >= 0) {
302 if (count[pos1] < count[pos2]) {
303 min1i = pos1;
304 pos1--;
305 } else {
306 min1i = pos2;
307 pos2++;
308 }
309 } else {
310 min1i = pos2;
311 pos2++;
312 }
313 if (pos1 >= 0) {
314 if (count[pos1] < count[pos2]) {
315 min2i = pos1;
316 pos1--;
317 } else {
318 min2i = pos2;
319 pos2++;
320 }
321 } else {
322 min2i = pos2;
323 pos2++;
324 }
325 count[vocab_size + a] = count[min1i] + count[min2i];
326 parent_node[min1i] = vocab_size + a;
327 parent_node[min2i] = vocab_size + a;
328 binary[min2i] = 1;
329 }
330 // Now assign binary code to each vocabulary word
331 for (a = 0; a < vocab_size; a++) {
332 b = a;
333 i = 0;
334 while (1) {
335 code[i] = binary[b];
336 point[i] = b;
337 i++;
338 b = parent_node[b];
339 if (b == vocab_size * 2 - 2)
340 break;
341 }
342 vocab[a].codelen = i;
343 vocab[a].point[0] = vocab_size - 2;
344 for (b = 0; b < i; b++) {
345 vocab[a].code[i - b - 1] = code[b];
346 vocab[a].point[i - b] = point[b] - vocab_size;
347 }
348 }
349 free(count);
350 free(binary);
351 free(parent_node);
352}
353
354void LearnVocabFromTrainFile() {
355 char word[MAX_STRING];
356 FILE *fin;
357 long long a, i;
358 for (a = 0; a < vocab_hash_size; a++)
359 vocab_hash[a] = -1;
360 fin = fopen(train_file, "rb");
361 if (fin == NULL) {
362 printf("ERROR: training data file not found!\n");
363 exit(1);
364 }
365 vocab_size = 0;
366 AddWordToVocab((char *) "</s>");
367 while (1) {
368 ReadWord(word, fin);
369 if (feof(fin))
370 break;
371 train_words++;
372 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
373 printf("%lldK%c", train_words / 1000, 13);
374 fflush(stdout);
375 }
376 i = SearchVocab(word);
377 if (i == -1) {
378 a = AddWordToVocab(word);
379 vocab[a].cn = 1;
380 } else
381 vocab[i].cn++;
382 if (vocab_size > vocab_hash_size * 0.7)
383 ReduceVocab();
384 }
385 SortVocab();
386 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200387 printf("Vocab size: %'lld\n", vocab_size);
388 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100389 }
390 file_size = ftell(fin);
391 fclose(fin);
392}
393
394void SaveVocab() {
395 long long i;
396 FILE *fo = fopen(save_vocab_file, "wb");
397 for (i = 0; i < vocab_size; i++)
398 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
399 fclose(fo);
400}
401
402void ReadVocab() {
403 long long a, i = 0;
404 char c;
405 char word[MAX_STRING];
406 FILE *fin = fopen(read_vocab_file, "rb");
407 if (fin == NULL) {
408 printf("Vocabulary file not found\n");
409 exit(1);
410 }
411 for (a = 0; a < vocab_hash_size; a++)
412 vocab_hash[a] = -1;
413 vocab_size = 0;
414 while (1) {
415 ReadWord(word, fin);
416 if (feof(fin))
417 break;
418 a = AddWordToVocab(word);
419 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
420 i++;
421 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200422 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100423 fin = fopen(train_file, "rb");
424 if (fin == NULL) {
425 printf("ERROR: training data file not found!\n");
426 exit(1);
427 }
428 fseek(fin, 0, SEEK_END);
429 file_size = ftell(fin);
430 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200431 SortVocab();
432 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200433 printf("Vocab size: %'lld\n", vocab_size);
434 printf("Words in vocab's train file: %'lld\n", train_words);
435 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200436 }
Marc Kupietze23c5402016-07-14 11:10:09 +0200437 train_words = file_size / avgWordLength;
438 if(debug_mode > 0)
439 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100440}
441
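// Builds one unigram table per word class from negative_classes_file so that
// negative samples can be drawn from the same class as the current word.
// Judging from the read loop below, each record consists of three
// whitespace-separated tokens: the class label, the word, and a third token
// that is read but ignored.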
442void InitClassUnigramTable() {
443 long long a, c;
444 printf("loading class unigrams \n");
445 FILE *fin = fopen(negative_classes_file, "rb");
446 if (fin == NULL) {
447 printf("ERROR: class file not found!\n");
448 exit(1);
449 }
450 word_to_group = (int *) malloc(vocab_size * sizeof(int));
451 for (a = 0; a < vocab_size; a++)
452 word_to_group[a] = -1;
453 char class[MAX_STRING];
454 char prev_class[MAX_STRING];
455 prev_class[0] = 0;
456 char word[MAX_STRING];
457 class_number = -1;
458 while (1) {
459 if (feof(fin))
460 break;
461 ReadWord(class, fin);
462 ReadWord(word, fin);
463 int word_index = SearchVocab(word);
464 if (word_index != -1) {
465 if (strcmp(class, prev_class) != 0) {
466 class_number++;
467 strcpy(prev_class, class);
468 }
469 word_to_group[word_index] = class_number;
470 }
471 ReadWord(word, fin);
472 }
473 class_number++;
474 fclose(fin);
475
476 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
477 long long train_words_pow = 0;
478 real d1, power = 0.75;
479
480 for (c = 0; c < class_number; c++) {
481 long long offset = c * table_size;
482 train_words_pow = 0;
483 for (a = 0; a < vocab_size; a++)
484 if (word_to_group[a] == c)
485 train_words_pow += pow(vocab[a].cn, power);
486 int i = 0;
 487 while (i < vocab_size && word_to_group[i] != c)
488 i++;
489 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
490 for (a = 0; a < table_size; a++) {
491 //printf("index %lld , word %d\n", a, i);
492 group_to_table[offset + a] = i;
493 if (a / (real) table_size > d1) {
494 i++;
 495 while (i < vocab_size && word_to_group[i] != c)
496 i++;
497 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
498 }
 499 if (i >= vocab_size) {
 i = vocab_size - 1; // step back inside the array before searching
 500 while (i >= 0 && word_to_group[i] != c)
 501 i--;
 }
502 }
503 }
504}
505
Marc Kupietz210b9d52016-04-02 21:48:13 +0200506void SaveArgs(int argc, char **argv) {
507 unsigned int i;
508 size_t len = 0;
509 char *_all_args, *all_args;
 510 char *args_file = (char *) malloc(strlen(output_file) + strlen(".args") + 1);
 strcpy(args_file, output_file);
 511 strcat(args_file, ".args");
512 FILE *fargs = fopen(args_file, "w");
513 if (fargs == NULL) {
514 printf("Cannot save args to %s.\n", args_file);
515 return;
516 }
517
518 for(i=1; i<argc; i++) {
519 len += strlen(argv[i]);
520 }
521
522 _all_args = all_args = (char *)malloc(len+argc-1);
523
524 for(i=1; i<argc; i++) {
525 memcpy(_all_args, argv[i], strlen(argv[i]));
526 _all_args += strlen(argv[i])+1;
527 *(_all_args-1) = ' ';
528 }
529 *(_all_args-1) = 0;
530
531 fprintf(fargs, "%s\n", all_args);
532 fclose(fargs);
533
534 free(all_args);
535
536 return;
537}
538
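// Net file layout shared by SaveNet() and the read path in InitNet(), only
// supported for type 3 with negative sampling: the raw syn0 matrix
// (vocab_size * layer1_size reals) followed directly by syn1neg_window
// (vocab_size * window_layer_size reals), with no header.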
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100539void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100540 if(type != 3 || negative <= 0) {
541 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
542 return;
543 }
544
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100545 FILE *fnet = fopen(save_net_file, "wb");
546 if (fnet == NULL) {
547 printf("Net parameter file not found\n");
548 exit(1);
549 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100550 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100551 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100552 fclose(fnet);
553}
554
555void InitNet() {
556 long long a, b;
557 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100558 long long read;
559
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100560 window_layer_size = layer1_size * window * 2;
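	// For the position-aware models every word gets 2 * window sub-vectors of
	// layer1_size each; window_offset (computed in TrainModelThread) selects the
	// sub-vector belonging to relative context position a, skipping a == window,
	// which is the centre word itself.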
561 a = posix_memalign((void **) &syn0, 128,
562 (long long) vocab_size * layer1_size * sizeof(real));
563 if (syn0 == NULL) {
564 printf("Memory allocation failed\n");
565 exit(1);
566 }
567
568 if (hs) {
569 a = posix_memalign((void **) &syn1, 128,
570 (long long) vocab_size * layer1_size * sizeof(real));
571 if (syn1 == NULL) {
572 printf("Memory allocation failed\n");
573 exit(1);
574 }
575 a = posix_memalign((void **) &syn1_window, 128,
576 (long long) vocab_size * window_layer_size * sizeof(real));
577 if (syn1_window == NULL) {
578 printf("Memory allocation failed\n");
579 exit(1);
580 }
581 a = posix_memalign((void **) &syn_hidden_word, 128,
582 (long long) vocab_size * window_hidden_size * sizeof(real));
583 if (syn_hidden_word == NULL) {
584 printf("Memory allocation failed\n");
585 exit(1);
586 }
587
588 for (a = 0; a < vocab_size; a++)
589 for (b = 0; b < layer1_size; b++)
590 syn1[a * layer1_size + b] = 0;
591 for (a = 0; a < vocab_size; a++)
592 for (b = 0; b < window_layer_size; b++)
593 syn1_window[a * window_layer_size + b] = 0;
594 for (a = 0; a < vocab_size; a++)
595 for (b = 0; b < window_hidden_size; b++)
596 syn_hidden_word[a * window_hidden_size + b] = 0;
597 }
598 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100599 if(type == 0) {
600 a = posix_memalign((void **) &syn1neg, 128,
601 (long long) vocab_size * layer1_size * sizeof(real));
602 if (syn1neg == NULL) {
603 printf("Memory allocation failed\n");
604 exit(1);
605 }
606 for (a = 0; a < vocab_size; a++)
607 for (b = 0; b < layer1_size; b++)
608 syn1neg[a * layer1_size + b] = 0;
609 } else if (type == 3) {
610 a = posix_memalign((void **) &syn1neg_window, 128,
611 (long long) vocab_size * window_layer_size * sizeof(real));
612 if (syn1neg_window == NULL) {
613 printf("Memory allocation failed\n");
614 exit(1);
615 }
616 for (a = 0; a < vocab_size; a++)
617 for (b = 0; b < window_layer_size; b++)
618 syn1neg_window[a * window_layer_size + b] = 0;
619 } else if (type == 4) {
620 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
621 (long long) vocab_size * window_hidden_size * sizeof(real));
622 if (syn_hidden_word_neg == NULL) {
623 printf("Memory allocation failed\n");
624 exit(1);
625 }
626 for (a = 0; a < vocab_size; a++)
627 for (b = 0; b < window_hidden_size; b++)
628 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100629 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100630 }
631 if (nce > 0) {
632 a = posix_memalign((void **) &syn1nce, 128,
633 (long long) vocab_size * layer1_size * sizeof(real));
634 if (syn1nce == NULL) {
635 printf("Memory allocation failed\n");
636 exit(1);
637 }
638 a = posix_memalign((void **) &syn1nce_window, 128,
639 (long long) vocab_size * window_layer_size * sizeof(real));
640 if (syn1nce_window == NULL) {
641 printf("Memory allocation failed\n");
642 exit(1);
643 }
644 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
645 (long long) vocab_size * window_hidden_size * sizeof(real));
646 if (syn_hidden_word_nce == NULL) {
647 printf("Memory allocation failed\n");
648 exit(1);
649 }
650
651 for (a = 0; a < vocab_size; a++)
652 for (b = 0; b < layer1_size; b++)
653 syn1nce[a * layer1_size + b] = 0;
654 for (a = 0; a < vocab_size; a++)
655 for (b = 0; b < window_layer_size; b++)
656 syn1nce_window[a * window_layer_size + b] = 0;
657 for (a = 0; a < vocab_size; a++)
658 for (b = 0; b < window_hidden_size; b++)
659 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
660 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100661
Marc Kupietz1006a272016-03-16 15:50:20 +0100662 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100663 a = posix_memalign((void **) &syn_window_hidden, 128,
664 window_hidden_size * window_layer_size * sizeof(real));
665 if (syn_window_hidden == NULL) {
666 printf("Memory allocation failed\n");
667 exit(1);
668 }
669 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
670 next_random = next_random * (unsigned long long) 25214903917 + 11;
671 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
672 - 0.5) / (window_hidden_size * window_layer_size);
673 }
674 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100675
676 if (read_net_file[0] == 0) {
677 for (a = 0; a < vocab_size; a++)
678 for (b = 0; b < layer1_size; b++) {
679 next_random = next_random * (unsigned long long) 25214903917
680 + 11;
681 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
682 / (real) 65536) - 0.5) / layer1_size;
683 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100684 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100685 FILE *fnet = fopen(read_net_file, "rb");
686 if (fnet == NULL) {
687 printf("Net parameter file not found\n");
688 exit(1);
689 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100690 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
691 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
692 if(read != vocab_size * layer1_size) {
693 fprintf(stderr, "read-net failed %lld\n", read);
694 exit(-1);
695 }
696 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
697 if(read != (long long) vocab_size * window_layer_size) {
 698 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read,
 699 (long long) vocab_size * window_layer_size);
700 exit(-1);
701 }
702 fgetc(fnet);
703 if(!feof(fnet)) {
704 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
705 exit(-1);
706 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100707 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100708 } else {
709 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
710 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100711 }
712
713 CreateBinaryTree();
714}
715
Marc Kupietz202723e2016-07-14 09:12:00 +0200716char *currentDateTime(char *buf, real offset) {
717 time_t t;
718 time(&t);
719 t += (long) offset;
720 struct tm tstruct;
721 tstruct = *localtime(&t);
722 strftime(buf, 80, "%c", &tstruct);
723 return buf;
724}
725
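// Progress monitor, started as an extra thread when debug_mode > 1: once per
// second it sums, over all worker threads, the bytes of the training file
// processed so far (threadPos) plus the passes already completed
// (iter - threadIters), and derives from that the finished portion, throughput,
// elapsed time, estimated time to go and the ETA printed below.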
726void *MonitorThread(void *id) {
727 char *timebuf = malloc(80);;
728 int i, n=num_threads;
729 long long sum;
730 sleep(1);
731 while(n > 0) {
732 sleep(1);
733 sum = n = 0;
734 for(i=0; i < num_threads; i++) {
735 if(threadPos[i] >= 0) {
736 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
737 n++;
738 } else {
739 sum += iter * file_size / num_threads;
740 }
741 }
742 if(n == 0)
743 break;
744 real finished_portion = (real) sum / (float) (file_size * iter);
745 long long now = clock();
746 long long elapsed = (now - start) / CLOCKS_PER_SEC / num_threads;
747 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed) * ((real) num_threads / n) ;
748
749 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/t/s TE: %llds TTG: %llds ETA: %s\033[K",
750 alpha,
751 finished_portion * 100,
752 (float) sum / elapsed / num_threads / 1000,
753 elapsed,
754 ttg,
755 currentDateTime(timebuf, ttg)
756 );
757 fflush(stdout);
758 }
759 pthread_exit(NULL);
760}
761
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100762void *TrainModelThread(void *id) {
763 long long a, b, d, cw, word, last_word, sentence_length = 0,
764 sentence_position = 0;
765 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
766 long long l1, l2, c, target, label, local_iter = iter;
767 unsigned long long next_random = (long long) id;
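	// Per-thread pseudo-random stream: a linear congruential generator seeded
	// with the thread id; the constants 25214903917 (0x5DEECE66D) and 11 are the
	// same ones used by drand48/java.util.Random.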
768 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100769 int input_len_1 = layer1_size;
770 int window_offset = -1;
771 if (type == 2 || type == 4) {
772 input_len_1 = window_layer_size;
773 }
774 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
775 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200776 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100777
778 int input_len_2 = 0;
779 if (type == 4) {
780 input_len_2 = window_hidden_size;
781 }
782 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
783 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
784
785 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200786 long long start_pos = file_size / (long long) num_threads * (long long) id;
787 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
788 long long current_pos = start_pos;
 789 long long last_pos = start_pos;
790 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100791 while (1) {
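		// Every ~100k bytes of progress, fold the local word count into the global
		// word_count_actual and decay the learning rate linearly with the number of
		// words processed, from starting_alpha down to a floor of
		// starting_alpha * 0.0001 over iter * train_words words.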
Marc Kupietz202723e2016-07-14 09:12:00 +0200792 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100793 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200794 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100795 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100796 alpha = starting_alpha
797 * (1 - word_count_actual / (real) (iter * train_words + 1));
798 if (alpha < starting_alpha * 0.0001)
799 alpha = starting_alpha * 0.0001;
800 }
801 if (sentence_length == 0) {
802 while (1) {
803 word = ReadWordIndex(fi);
804 if (feof(fi))
805 break;
806 if (word == -1)
807 continue;
808 word_count++;
809 if (word == 0)
810 break;
811 // The subsampling randomly discards frequent words while keeping the ranking same
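				// With relative frequency z = cn / train_words and threshold t = sample,
				// the keep probability below is p = (sqrt(z/t) + 1) * t/z. E.g. with the
				// default t = 1e-3 a word making up 1% of the corpus is kept with
				// p = (sqrt(10) + 1) * 0.1, roughly 0.42, while words rarer than t are
				// always kept.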
812 if (sample > 0) {
813 real ran = (sqrt(vocab[word].cn / (sample * train_words))
814 + 1) * (sample * train_words) / vocab[word].cn;
815 next_random = next_random * (unsigned long long) 25214903917
816 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100817 if (ran < (next_random & 0xFFFF) / (real) 65536) {
818 if(type == 3) // in structured skipgrams
819 word = -2; // keep the window position correct
820 else
821 continue;
822 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100823 }
824 sen[sentence_length] = word;
825 sentence_length++;
826 if (sentence_length >= MAX_SENTENCE_LENGTH)
827 break;
828 }
829 sentence_position = 0;
830 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200831 current_pos = threadPos[(long) id] = ftell(fi);
832 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100833 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200834 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100835 local_iter--;
836 if (local_iter == 0)
837 break;
838 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200839 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100840 last_word_count = 0;
841 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200842 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100843 continue;
844 }
845 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200846 while (word == -2 && sentence_position<sentence_length)
847 word = sen[++sentence_position];
848 if (sentence_position>=sentence_length) {
849 sentence_length=0;
850 continue;
851 }
852 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100853 continue;
854 for (c = 0; c < input_len_1; c++)
855 neu1[c] = 0;
856 for (c = 0; c < input_len_1; c++)
857 neu1e[c] = 0;
858 for (c = 0; c < input_len_2; c++)
859 neu2[c] = 0;
860 for (c = 0; c < input_len_2; c++)
861 neu2e[c] = 0;
862 next_random = next_random * (unsigned long long) 25214903917 + 11;
863 b = next_random % window;
864 if (type == 0) { //train the cbow architecture
865 // in -> hidden
866 cw = 0;
867 for (a = b; a < window * 2 + 1 - b; a++)
868 if (a != window) {
869 c = sentence_position - window + a;
870 if (c < 0)
871 continue;
872 if (c >= sentence_length)
873 continue;
874 last_word = sen[c];
875 if (last_word == -1)
876 continue;
877 for (c = 0; c < layer1_size; c++)
878 neu1[c] += syn0[c + last_word * layer1_size];
879 cw++;
880 }
881 if (cw) {
882 for (c = 0; c < layer1_size; c++)
883 neu1[c] /= cw;
884 if (hs)
885 for (d = 0; d < vocab[word].codelen; d++) {
886 f = 0;
887 l2 = vocab[word].point[d] * layer1_size;
888 // Propagate hidden -> output
889 for (c = 0; c < layer1_size; c++)
890 f += neu1[c] * syn1[c + l2];
891 if (f <= -MAX_EXP)
892 continue;
893 else if (f >= MAX_EXP)
894 continue;
895 else
896 f = expTable[(int) ((f + MAX_EXP)
897 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
898 // 'g' is the gradient multiplied by the learning rate
899 g = (1 - vocab[word].code[d] - f) * alpha;
900 // Propagate errors output -> hidden
901 for (c = 0; c < layer1_size; c++)
902 neu1e[c] += g * syn1[c + l2];
903 // Learn weights hidden -> output
904 for (c = 0; c < layer1_size; c++)
905 syn1[c + l2] += g * neu1[c];
906 if (cap == 1)
907 for (c = 0; c < layer1_size; c++)
908 capParam(syn1, c + l2);
909 }
910 // NEGATIVE SAMPLING
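				// Each of the `negative` noise words plus the true target is trained as a
				// logistic regression on the score f: g = (label - sigmoid(f)) * alpha,
				// with sigmoid looked up in expTable over [-MAX_EXP, MAX_EXP]; outside
				// that range the sigmoid saturates, giving the (label - 1) and (label - 0)
				// branches. The same scheme recurs in the other architectures below.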
911 if (negative > 0)
912 for (d = 0; d < negative + 1; d++) {
913 if (d == 0) {
914 target = word;
915 label = 1;
916 } else {
917 next_random = next_random
918 * (unsigned long long) 25214903917 + 11;
919 if (word_to_group != NULL
920 && word_to_group[word] != -1) {
921 target = word;
922 while (target == word) {
923 target = group_to_table[word_to_group[word]
924 * table_size
925 + (next_random >> 16) % table_size];
926 next_random = next_random
927 * (unsigned long long) 25214903917
928 + 11;
929 }
930 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
931 } else {
932 target =
933 table[(next_random >> 16) % table_size];
934 }
935 if (target == 0)
936 target = next_random % (vocab_size - 1) + 1;
937 if (target == word)
938 continue;
939 label = 0;
940 }
941 l2 = target * layer1_size;
942 f = 0;
943 for (c = 0; c < layer1_size; c++)
944 f += neu1[c] * syn1neg[c + l2];
945 if (f > MAX_EXP)
946 g = (label - 1) * alpha;
947 else if (f < -MAX_EXP)
948 g = (label - 0) * alpha;
949 else
950 g = (label
951 - expTable[(int) ((f + MAX_EXP)
952 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
953 * alpha;
954 for (c = 0; c < layer1_size; c++)
955 neu1e[c] += g * syn1neg[c + l2];
956 for (c = 0; c < layer1_size; c++)
957 syn1neg[c + l2] += g * neu1[c];
958 if (cap == 1)
959 for (c = 0; c < layer1_size; c++)
960 capParam(syn1neg, c + l2);
961 }
962 // Noise Contrastive Estimation
963 if (nce > 0)
964 for (d = 0; d < nce + 1; d++) {
965 if (d == 0) {
966 target = word;
967 label = 1;
968 } else {
969 next_random = next_random
970 * (unsigned long long) 25214903917 + 11;
971 if (word_to_group != NULL
972 && word_to_group[word] != -1) {
973 target = word;
974 while (target == word) {
975 target = group_to_table[word_to_group[word]
976 * table_size
977 + (next_random >> 16) % table_size];
978 next_random = next_random
979 * (unsigned long long) 25214903917
980 + 11;
981 }
982 } else {
983 target =
984 table[(next_random >> 16) % table_size];
985 }
986 if (target == 0)
987 target = next_random % (vocab_size - 1) + 1;
988 if (target == word)
989 continue;
990 label = 0;
991 }
992 l2 = target * layer1_size;
993 f = 0;
994
995 for (c = 0; c < layer1_size; c++)
996 f += neu1[c] * syn1nce[c + l2];
997 if (f > MAX_EXP)
998 g = (label - 1) * alpha;
999 else if (f < -MAX_EXP)
1000 g = (label - 0) * alpha;
1001 else {
1002 f = exp(f);
1003 g =
1004 (label
1005 - f
1006 / (noise_distribution[target]
1007 * nce + f)) * alpha;
1008 }
1009 for (c = 0; c < layer1_size; c++)
1010 neu1e[c] += g * syn1nce[c + l2];
1011 for (c = 0; c < layer1_size; c++)
1012 syn1nce[c + l2] += g * neu1[c];
1013 if (cap == 1)
1014 for (c = 0; c < layer1_size; c++)
1015 capParam(syn1nce, c + l2);
1016 }
1017 // hidden -> in
1018 for (a = b; a < window * 2 + 1 - b; a++)
1019 if (a != window) {
1020 c = sentence_position - window + a;
1021 if (c < 0)
1022 continue;
1023 if (c >= sentence_length)
1024 continue;
1025 last_word = sen[c];
1026 if (last_word == -1)
1027 continue;
1028 for (c = 0; c < layer1_size; c++)
1029 syn0[c + last_word * layer1_size] += neu1e[c];
1030 }
1031 }
1032 } else if (type == 1) { //train skip-gram
1033 for (a = b; a < window * 2 + 1 - b; a++)
1034 if (a != window) {
1035 c = sentence_position - window + a;
1036 if (c < 0)
1037 continue;
1038 if (c >= sentence_length)
1039 continue;
1040 last_word = sen[c];
1041 if (last_word == -1)
1042 continue;
1043 l1 = last_word * layer1_size;
1044 for (c = 0; c < layer1_size; c++)
1045 neu1e[c] = 0;
1046 // HIERARCHICAL SOFTMAX
1047 if (hs)
1048 for (d = 0; d < vocab[word].codelen; d++) {
1049 f = 0;
1050 l2 = vocab[word].point[d] * layer1_size;
1051 // Propagate hidden -> output
1052 for (c = 0; c < layer1_size; c++)
1053 f += syn0[c + l1] * syn1[c + l2];
1054 if (f <= -MAX_EXP)
1055 continue;
1056 else if (f >= MAX_EXP)
1057 continue;
1058 else
1059 f = expTable[(int) ((f + MAX_EXP)
1060 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1061 // 'g' is the gradient multiplied by the learning rate
1062 g = (1 - vocab[word].code[d] - f) * alpha;
1063 // Propagate errors output -> hidden
1064 for (c = 0; c < layer1_size; c++)
1065 neu1e[c] += g * syn1[c + l2];
1066 // Learn weights hidden -> output
1067 for (c = 0; c < layer1_size; c++)
1068 syn1[c + l2] += g * syn0[c + l1];
1069 if (cap == 1)
1070 for (c = 0; c < layer1_size; c++)
1071 capParam(syn1, c + l2);
1072 }
1073 // NEGATIVE SAMPLING
1074 if (negative > 0)
1075 for (d = 0; d < negative + 1; d++) {
1076 if (d == 0) {
1077 target = word;
1078 label = 1;
1079 } else {
1080 next_random = next_random
1081 * (unsigned long long) 25214903917 + 11;
1082 if (word_to_group != NULL
1083 && word_to_group[word] != -1) {
1084 target = word;
1085 while (target == word) {
1086 target =
1087 group_to_table[word_to_group[word]
1088 * table_size
1089 + (next_random >> 16)
1090 % table_size];
1091 next_random =
1092 next_random
1093 * (unsigned long long) 25214903917
1094 + 11;
1095 }
1096 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1097 } else {
1098 target = table[(next_random >> 16)
1099 % table_size];
1100 }
1101 if (target == 0)
1102 target = next_random % (vocab_size - 1) + 1;
1103 if (target == word)
1104 continue;
1105 label = 0;
1106 }
1107 l2 = target * layer1_size;
1108 f = 0;
1109 for (c = 0; c < layer1_size; c++)
1110 f += syn0[c + l1] * syn1neg[c + l2];
1111 if (f > MAX_EXP)
1112 g = (label - 1) * alpha;
1113 else if (f < -MAX_EXP)
1114 g = (label - 0) * alpha;
1115 else
1116 g =
1117 (label
1118 - expTable[(int) ((f + MAX_EXP)
1119 * (EXP_TABLE_SIZE
1120 / MAX_EXP / 2))])
1121 * alpha;
1122 for (c = 0; c < layer1_size; c++)
1123 neu1e[c] += g * syn1neg[c + l2];
1124 for (c = 0; c < layer1_size; c++)
1125 syn1neg[c + l2] += g * syn0[c + l1];
1126 if (cap == 1)
1127 for (c = 0; c < layer1_size; c++)
1128 capParam(syn1neg, c + l2);
1129 }
1130 //Noise Contrastive Estimation
1131 if (nce > 0)
1132 for (d = 0; d < nce + 1; d++) {
1133 if (d == 0) {
1134 target = word;
1135 label = 1;
1136 } else {
1137 next_random = next_random
1138 * (unsigned long long) 25214903917 + 11;
1139 if (word_to_group != NULL
1140 && word_to_group[word] != -1) {
1141 target = word;
1142 while (target == word) {
1143 target =
1144 group_to_table[word_to_group[word]
1145 * table_size
1146 + (next_random >> 16)
1147 % table_size];
1148 next_random =
1149 next_random
1150 * (unsigned long long) 25214903917
1151 + 11;
1152 }
1153 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1154 } else {
1155 target = table[(next_random >> 16)
1156 % table_size];
1157 }
1158 if (target == 0)
1159 target = next_random % (vocab_size - 1) + 1;
1160 if (target == word)
1161 continue;
1162 label = 0;
1163 }
1164 l2 = target * layer1_size;
1165 f = 0;
1166 for (c = 0; c < layer1_size; c++)
1167 f += syn0[c + l1] * syn1nce[c + l2];
1168 if (f > MAX_EXP)
1169 g = (label - 1) * alpha;
1170 else if (f < -MAX_EXP)
1171 g = (label - 0) * alpha;
1172 else {
1173 f = exp(f);
1174 g = (label
1175 - f
1176 / (noise_distribution[target]
1177 * nce + f)) * alpha;
1178 }
1179 for (c = 0; c < layer1_size; c++)
1180 neu1e[c] += g * syn1nce[c + l2];
1181 for (c = 0; c < layer1_size; c++)
1182 syn1nce[c + l2] += g * syn0[c + l1];
1183 if (cap == 1)
1184 for (c = 0; c < layer1_size; c++)
1185 capParam(syn1nce, c + l2);
1186 }
1187 // Learn weights input -> hidden
1188 for (c = 0; c < layer1_size; c++)
1189 syn0[c + l1] += neu1e[c];
1190 }
1191 } else if (type == 2) { //train the cwindow architecture
1192 // in -> hidden
1193 cw = 0;
1194 for (a = 0; a < window * 2 + 1; a++)
1195 if (a != window) {
1196 c = sentence_position - window + a;
1197 if (c < 0)
1198 continue;
1199 if (c >= sentence_length)
1200 continue;
1201 last_word = sen[c];
1202 if (last_word == -1)
1203 continue;
1204 window_offset = a * layer1_size;
1205 if (a > window)
1206 window_offset -= layer1_size;
1207 for (c = 0; c < layer1_size; c++)
1208 neu1[c + window_offset] += syn0[c
1209 + last_word * layer1_size];
1210 cw++;
1211 }
1212 if (cw) {
1213 if (hs)
1214 for (d = 0; d < vocab[word].codelen; d++) {
1215 f = 0;
1216 l2 = vocab[word].point[d] * window_layer_size;
1217 // Propagate hidden -> output
1218 for (c = 0; c < window_layer_size; c++)
1219 f += neu1[c] * syn1_window[c + l2];
1220 if (f <= -MAX_EXP)
1221 continue;
1222 else if (f >= MAX_EXP)
1223 continue;
1224 else
1225 f = expTable[(int) ((f + MAX_EXP)
1226 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1227 // 'g' is the gradient multiplied by the learning rate
1228 g = (1 - vocab[word].code[d] - f) * alpha;
1229 // Propagate errors output -> hidden
1230 for (c = 0; c < window_layer_size; c++)
1231 neu1e[c] += g * syn1_window[c + l2];
1232 // Learn weights hidden -> output
1233 for (c = 0; c < window_layer_size; c++)
1234 syn1_window[c + l2] += g * neu1[c];
1235 if (cap == 1)
1236 for (c = 0; c < window_layer_size; c++)
1237 capParam(syn1_window, c + l2);
1238 }
1239 // NEGATIVE SAMPLING
1240 if (negative > 0)
1241 for (d = 0; d < negative + 1; d++) {
1242 if (d == 0) {
1243 target = word;
1244 label = 1;
1245 } else {
1246 next_random = next_random
1247 * (unsigned long long) 25214903917 + 11;
1248 if (word_to_group != NULL
1249 && word_to_group[word] != -1) {
1250 target = word;
1251 while (target == word) {
1252 target = group_to_table[word_to_group[word]
1253 * table_size
1254 + (next_random >> 16) % table_size];
1255 next_random = next_random
1256 * (unsigned long long) 25214903917
1257 + 11;
1258 }
1259 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1260 } else {
1261 target =
1262 table[(next_random >> 16) % table_size];
1263 }
1264 if (target == 0)
1265 target = next_random % (vocab_size - 1) + 1;
1266 if (target == word)
1267 continue;
1268 label = 0;
1269 }
1270 l2 = target * window_layer_size;
1271 f = 0;
1272 for (c = 0; c < window_layer_size; c++)
1273 f += neu1[c] * syn1neg_window[c + l2];
1274 if (f > MAX_EXP)
1275 g = (label - 1) * alpha;
1276 else if (f < -MAX_EXP)
1277 g = (label - 0) * alpha;
1278 else
1279 g = (label
1280 - expTable[(int) ((f + MAX_EXP)
1281 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1282 * alpha;
1283 for (c = 0; c < window_layer_size; c++)
1284 neu1e[c] += g * syn1neg_window[c + l2];
1285 for (c = 0; c < window_layer_size; c++)
1286 syn1neg_window[c + l2] += g * neu1[c];
1287 if (cap == 1)
1288 for (c = 0; c < window_layer_size; c++)
1289 capParam(syn1neg_window, c + l2);
1290 }
1291 // Noise Contrastive Estimation
1292 if (nce > 0)
1293 for (d = 0; d < nce + 1; d++) {
1294 if (d == 0) {
1295 target = word;
1296 label = 1;
1297 } else {
1298 next_random = next_random
1299 * (unsigned long long) 25214903917 + 11;
1300 if (word_to_group != NULL
1301 && word_to_group[word] != -1) {
1302 target = word;
1303 while (target == word) {
1304 target = group_to_table[word_to_group[word]
1305 * table_size
1306 + (next_random >> 16) % table_size];
1307 next_random = next_random
1308 * (unsigned long long) 25214903917
1309 + 11;
1310 }
1311 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1312 } else {
1313 target =
1314 table[(next_random >> 16) % table_size];
1315 }
1316 if (target == 0)
1317 target = next_random % (vocab_size - 1) + 1;
1318 if (target == word)
1319 continue;
1320 label = 0;
1321 }
1322 l2 = target * window_layer_size;
1323 f = 0;
1324 for (c = 0; c < window_layer_size; c++)
1325 f += neu1[c] * syn1nce_window[c + l2];
1326 if (f > MAX_EXP)
1327 g = (label - 1) * alpha;
1328 else if (f < -MAX_EXP)
1329 g = (label - 0) * alpha;
1330 else {
1331 f = exp(f);
1332 g =
1333 (label
1334 - f
1335 / (noise_distribution[target]
1336 * nce + f)) * alpha;
1337 }
1338 for (c = 0; c < window_layer_size; c++)
1339 neu1e[c] += g * syn1nce_window[c + l2];
1340 for (c = 0; c < window_layer_size; c++)
1341 syn1nce_window[c + l2] += g * neu1[c];
1342 if (cap == 1)
1343 for (c = 0; c < window_layer_size; c++)
1344 capParam(syn1nce_window, c + l2);
1345 }
1346 // hidden -> in
1347 for (a = 0; a < window * 2 + 1; a++)
1348 if (a != window) {
1349 c = sentence_position - window + a;
1350 if (c < 0)
1351 continue;
1352 if (c >= sentence_length)
1353 continue;
1354 last_word = sen[c];
1355 if (last_word == -1)
1356 continue;
1357 window_offset = a * layer1_size;
1358 if (a > window)
1359 window_offset -= layer1_size;
1360 for (c = 0; c < layer1_size; c++)
1361 syn0[c + last_word * layer1_size] += neu1e[c
1362 + window_offset];
1363 }
1364 }
1365 } else if (type == 3) { //train structured skip-gram
1366 for (a = 0; a < window * 2 + 1; a++)
1367 if (a != window) {
1368 c = sentence_position - window + a;
1369 if (c < 0)
1370 continue;
1371 if (c >= sentence_length)
1372 continue;
1373 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001374 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001375 continue;
1376 l1 = last_word * layer1_size;
1377 window_offset = a * layer1_size;
1378 if (a > window)
1379 window_offset -= layer1_size;
1380 for (c = 0; c < layer1_size; c++)
1381 neu1e[c] = 0;
1382 // HIERARCHICAL SOFTMAX
1383 if (hs)
1384 for (d = 0; d < vocab[word].codelen; d++) {
1385 f = 0;
1386 l2 = vocab[word].point[d] * window_layer_size;
1387 // Propagate hidden -> output
1388 for (c = 0; c < layer1_size; c++)
1389 f += syn0[c + l1]
1390 * syn1_window[c + l2 + window_offset];
1391 if (f <= -MAX_EXP)
1392 continue;
1393 else if (f >= MAX_EXP)
1394 continue;
1395 else
1396 f = expTable[(int) ((f + MAX_EXP)
1397 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1398 // 'g' is the gradient multiplied by the learning rate
1399 g = (1 - vocab[word].code[d] - f) * alpha;
1400 // Propagate errors output -> hidden
1401 for (c = 0; c < layer1_size; c++)
1402 neu1e[c] += g
1403 * syn1_window[c + l2 + window_offset];
1404 // Learn weights hidden -> output
1405 for (c = 0; c < layer1_size; c++)
 1406 syn1_window[c + l2 + window_offset] += g
 1407 * syn0[c + l1];
 1408 if (cap == 1)
 1409 for (c = 0; c < layer1_size; c++)
 1410 capParam(syn1_window, c + l2 + window_offset);
1411 }
1412 // NEGATIVE SAMPLING
1413 if (negative > 0)
1414 for (d = 0; d < negative + 1; d++) {
1415 if (d == 0) {
1416 target = word;
1417 label = 1;
1418 } else {
1419 next_random = next_random
1420 * (unsigned long long) 25214903917 + 11;
1421 if (word_to_group != NULL
1422 && word_to_group[word] != -1) {
1423 target = word;
1424 while (target == word) {
1425 target =
1426 group_to_table[word_to_group[word]
1427 * table_size
1428 + (next_random >> 16)
1429 % table_size];
1430 next_random =
1431 next_random
1432 * (unsigned long long) 25214903917
1433 + 11;
1434 }
1435 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1436 } else {
1437 target = table[(next_random >> 16)
1438 % table_size];
1439 }
1440 if (target == 0)
1441 target = next_random % (vocab_size - 1) + 1;
1442 if (target == word)
1443 continue;
1444 label = 0;
1445 }
1446 l2 = target * window_layer_size;
1447 f = 0;
1448 for (c = 0; c < layer1_size; c++)
1449 f +=
1450 syn0[c + l1]
1451 * syn1neg_window[c + l2
1452 + window_offset];
1453 if (f > MAX_EXP)
1454 g = (label - 1) * alpha;
1455 else if (f < -MAX_EXP)
1456 g = (label - 0) * alpha;
1457 else
1458 g =
1459 (label
1460 - expTable[(int) ((f + MAX_EXP)
1461 * (EXP_TABLE_SIZE
1462 / MAX_EXP / 2))])
1463 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001464 if(debug_mode > 2 && ((long long) id) == 0) {
1465 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1466 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1467 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001468 for (c = 0; c < layer1_size; c++)
1469 neu1e[c] +=
1470 g
1471 * syn1neg_window[c + l2
1472 + window_offset];
1473 for (c = 0; c < layer1_size; c++)
1474 syn1neg_window[c + l2 + window_offset] += g
1475 * syn0[c + l1];
1476 if (cap == 1)
1477 for (c = 0; c < layer1_size; c++)
1478 capParam(syn1neg_window,
1479 c + l2 + window_offset);
1480 }
 1481 // Noise Contrastive Estimation
1482 if (nce > 0)
1483 for (d = 0; d < nce + 1; d++) {
1484 if (d == 0) {
1485 target = word;
1486 label = 1;
1487 } else {
1488 next_random = next_random
1489 * (unsigned long long) 25214903917 + 11;
1490 if (word_to_group != NULL
1491 && word_to_group[word] != -1) {
1492 target = word;
1493 while (target == word) {
1494 target =
1495 group_to_table[word_to_group[word]
1496 * table_size
1497 + (next_random >> 16)
1498 % table_size];
1499 next_random =
1500 next_random
1501 * (unsigned long long) 25214903917
1502 + 11;
1503 }
1504 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1505 } else {
1506 target = table[(next_random >> 16)
1507 % table_size];
1508 }
1509 if (target == 0)
1510 target = next_random % (vocab_size - 1) + 1;
1511 if (target == word)
1512 continue;
1513 label = 0;
1514 }
1515 l2 = target * window_layer_size;
1516 f = 0;
1517 for (c = 0; c < layer1_size; c++)
1518 f +=
1519 syn0[c + l1]
1520 * syn1nce_window[c + l2
1521 + window_offset];
1522 if (f > MAX_EXP)
1523 g = (label - 1) * alpha;
1524 else if (f < -MAX_EXP)
1525 g = (label - 0) * alpha;
1526 else {
1527 f = exp(f);
1528 g = (label
1529 - f
1530 / (noise_distribution[target]
1531 * nce + f)) * alpha;
1532 }
1533 for (c = 0; c < layer1_size; c++)
1534 neu1e[c] +=
1535 g
1536 * syn1nce_window[c + l2
1537 + window_offset];
1538 for (c = 0; c < layer1_size; c++)
1539 syn1nce_window[c + l2 + window_offset] += g
1540 * syn0[c + l1];
1541 if (cap == 1)
1542 for (c = 0; c < layer1_size; c++)
1543 capParam(syn1nce_window,
1544 c + l2 + window_offset);
1545 }
1546 // Learn weights input -> hidden
1547 for (c = 0; c < layer1_size; c++) {
1548 syn0[c + l1] += neu1e[c];
1549 if (syn0[c + l1] > 50)
1550 syn0[c + l1] = 50;
1551 if (syn0[c + l1] < -50)
1552 syn0[c + l1] = -50;
1553 }
1554 }
1555 } else if (type == 4) { //training senna
1556 // in -> hidden
1557 cw = 0;
1558 for (a = 0; a < window * 2 + 1; a++)
1559 if (a != window) {
1560 c = sentence_position - window + a;
1561 if (c < 0)
1562 continue;
1563 if (c >= sentence_length)
1564 continue;
1565 last_word = sen[c];
1566 if (last_word == -1)
1567 continue;
1568 window_offset = a * layer1_size;
1569 if (a > window)
1570 window_offset -= layer1_size;
1571 for (c = 0; c < layer1_size; c++)
1572 neu1[c + window_offset] += syn0[c
1573 + last_word * layer1_size];
1574 cw++;
1575 }
1576 if (cw) {
1577 for (a = 0; a < window_hidden_size; a++) {
1578 c = a * window_layer_size;
1579 for (b = 0; b < window_layer_size; b++) {
1580 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1581 }
1582 }
1583 if (hs)
1584 for (d = 0; d < vocab[word].codelen; d++) {
1585 f = 0;
1586 l2 = vocab[word].point[d] * window_hidden_size;
1587 // Propagate hidden -> output
1588 for (c = 0; c < window_hidden_size; c++)
1589 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1590 if (f <= -MAX_EXP)
1591 continue;
1592 else if (f >= MAX_EXP)
1593 continue;
1594 else
1595 f = expTable[(int) ((f + MAX_EXP)
1596 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1597 // 'g' is the gradient multiplied by the learning rate
1598 g = (1 - vocab[word].code[d] - f) * alpha;
1599 // Propagate errors output -> hidden
1600 for (c = 0; c < window_hidden_size; c++)
1601 neu2e[c] += dHardTanh(neu2[c], g) * g
1602 * syn_hidden_word[c + l2];
1603 // Learn weights hidden -> output
1604 for (c = 0; c < window_hidden_size; c++)
1605 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1606 * neu2[c];
1607 }
1608 // NEGATIVE SAMPLING
1609 if (negative > 0)
1610 for (d = 0; d < negative + 1; d++) {
1611 if (d == 0) {
1612 target = word;
1613 label = 1;
1614 } else {
1615 next_random = next_random
1616 * (unsigned long long) 25214903917 + 11;
1617 if (word_to_group != NULL
1618 && word_to_group[word] != -1) {
1619 target = word;
1620 while (target == word) {
1621 target = group_to_table[word_to_group[word]
1622 * table_size
1623 + (next_random >> 16) % table_size];
1624 next_random = next_random
1625 * (unsigned long long) 25214903917
1626 + 11;
1627 }
1628 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1629 } else {
1630 target =
1631 table[(next_random >> 16) % table_size];
1632 }
1633 if (target == 0)
1634 target = next_random % (vocab_size - 1) + 1;
1635 if (target == word)
1636 continue;
1637 label = 0;
1638 }
1639 l2 = target * window_hidden_size;
1640 f = 0;
1641 for (c = 0; c < window_hidden_size; c++)
1642 f += hardTanh(neu2[c])
1643 * syn_hidden_word_neg[c + l2];
1644 if (f > MAX_EXP)
1645 g = (label - 1) * alpha / negative;
1646 else if (f < -MAX_EXP)
1647 g = (label - 0) * alpha / negative;
1648 else
1649 g = (label
1650 - expTable[(int) ((f + MAX_EXP)
1651 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1652 * alpha / negative;
1653 for (c = 0; c < window_hidden_size; c++)
1654 neu2e[c] += dHardTanh(neu2[c], g) * g
1655 * syn_hidden_word_neg[c + l2];
1656 for (c = 0; c < window_hidden_size; c++)
1657 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1658 * g * neu2[c];
1659 }
1660 for (a = 0; a < window_hidden_size; a++)
1661 for (b = 0; b < window_layer_size; b++)
1662 neu1e[b] += neu2e[a]
1663 * syn_window_hidden[a * window_layer_size + b];
1664 for (a = 0; a < window_hidden_size; a++)
1665 for (b = 0; b < window_layer_size; b++)
1666 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1667 * neu1[b];
1668 // hidden -> in
1669 for (a = 0; a < window * 2 + 1; a++)
1670 if (a != window) {
1671 c = sentence_position - window + a;
1672 if (c < 0)
1673 continue;
1674 if (c >= sentence_length)
1675 continue;
1676 last_word = sen[c];
1677 if (last_word == -1)
1678 continue;
1679 window_offset = a * layer1_size;
1680 if (a > window)
1681 window_offset -= layer1_size;
1682 for (c = 0; c < layer1_size; c++)
1683 syn0[c + last_word * layer1_size] += neu1e[c
1684 + window_offset];
1685 }
1686 }
1687 } else {
1688 printf("unknown type %i", type);
1689 exit(0);
1690 }
1691 sentence_position++;
1692 if (sentence_position >= sentence_length) {
1693 sentence_length = 0;
1694 continue;
1695 }
1696 }
1697 fclose(fi);
1698 free(neu1);
1699 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001700 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001701 pthread_exit(NULL);
1702}
1703
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001704void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001705 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001706 real f, max_f, maxmax_f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001707 real *target_sums, bestf[MAX_CC], worstbest;
1708 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001709 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001710 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1711
1712 for (d = cc; d < vocab_size; d++) {
1713 for (b = 0; b < vocab_size; b++)
1714 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001715 for (b = 0; b < N; b++)
1716 bestf[b]=-1;
1717 worstbest = -1;
1718
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001719 maxmax_f = -1;
1720 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001721 for (a = window * 2; a >= 0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001722 if (a != window) {
1723 max_f = -1;
1724 window_offset = a * layer1_size;
1725 if (a > window)
1726 window_offset -= layer1_size;
1727 for(target = 0; target < vocab_size; target ++) {
1728 if(target == d)
1729 continue;
1730 f = 0;
1731 for (c = 0; c < layer1_size; c++)
1732 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1733 if (f < -MAX_EXP)
1734 continue;
1735 else if (f > MAX_EXP)
1736 continue;
1737 else
1738 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1739 if(f > max_f) {
1740 max_f = f;
1741 max_target = target;
1742 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001743 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001744 if(f > worstbest) {
1745 for (b = 0; b < N; b++) {
1746 if (f > bestf[b]) {
1747 for (e = N - 1; e > b; e--) {
1748 bestf[e] = bestf[e - 1];
1749 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001750 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001751 }
1752 bestf[b] = f;
1753 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001754 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001755 break;
1756 }
1757 }
1758 worstbest = bestf[N-1];
1759 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001760 }
1761 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1762 if(max_f > maxmax_f) {
1763 maxmax_f = max_f;
1764 maxmax_target = max_target;
1765 }
1766 } else {
1767 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1768 }
1769 }
1770 max_f = -1;
1771 for (b = 0; b < vocab_size; b++) {
1772 if(target_sums[b] > max_f) {
1773 max_f = target_sums[b];
1774 max_target = b;
1775 }
1776 }
1777 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001778 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001779 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001780 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001781 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001782 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001783 }
1784}
1785
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001786void TrainModel() {
1787 long a, b, c, d;
1788 FILE *fo;
1789 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
Marc Kupietz202723e2016-07-14 09:12:00 +02001790 threadPos = malloc(num_threads * sizeof(long long));
1791 threadIters = malloc(num_threads * sizeof(int));
1792 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001793 printf("Starting training using file %s\n", train_file);
1794 starting_alpha = alpha;
1795 if (read_vocab_file[0] != 0)
1796 ReadVocab();
1797 else
1798 LearnVocabFromTrainFile();
1799 if (save_vocab_file[0] != 0)
1800 SaveVocab();
1801 if (output_file[0] == 0)
1802 return;
1803 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001804 if(cc > 0)
1805 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001806 if (negative > 0 || nce > 0)
1807 InitUnigramTable();
1808 if (negative_classes_file[0] != 0)
1809 InitClassUnigramTable();
1810 start = clock();
1811 for (a = 0; a < num_threads; a++)
1812 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
Marc Kupietz202723e2016-07-14 09:12:00 +02001813 if(debug_mode > 1)
1814 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001815 for (a = 0; a < num_threads; a++)
1816 pthread_join(pt[a], NULL);
Marc Kupietz202723e2016-07-14 09:12:00 +02001817 if(debug_mode > 1) {
1818 pthread_join(pt[num_threads], NULL);
1819 clock_t now = clock();
1820		printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now-start) / CLOCKS_PER_SEC, (now-start) / CLOCKS_PER_SEC / num_threads); // clock() sums CPU time over all threads; dividing by num_threads approximates wall-clock time
1821 printf("Saving vectors to %s ...", output_file);
1822 fflush(stdout);
1823 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001824 fo = fopen(output_file, "wb");
1825 if (classes == 0) {
1826 // Save the word vectors
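		// Format: header line "<vocab_size> <layer1_size>", then one line per word;
		// with -binary 1 the vector components are written as raw floats, otherwise as text.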
1827 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1828 for (a = 0; a < vocab_size; a++) {
1829 fprintf(fo, "%s ", vocab[a].word);
1830 if (binary)
1831 for (b = 0; b < layer1_size; b++)
1832 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1833 else
1834 for (b = 0; b < layer1_size; b++)
1835 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1836 fprintf(fo, "\n");
1837 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001838 if(debug_mode > 1)
1839 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001840 } else {
1841 // Run K-means on the word vectors
1842 int clcn = classes, iter = 10, closeid;
1843 int *centcn = (int *) malloc(classes * sizeof(int));
1844 int *cl = (int *) calloc(vocab_size, sizeof(int));
1845 real closev, x;
1846 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1847 for (a = 0; a < vocab_size; a++)
1848 cl[a] = a % clcn;
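		// Each iteration: accumulate member vectors into their centroid, L2-normalise the
		// centroids, then reassign every word to the centroid with the largest dot product.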
1849 for (a = 0; a < iter; a++) {
1850 for (b = 0; b < clcn * layer1_size; b++)
1851 cent[b] = 0;
1852 for (b = 0; b < clcn; b++)
1853 centcn[b] = 1;
1854 for (c = 0; c < vocab_size; c++) {
1855 for (d = 0; d < layer1_size; d++)
1856 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1857 centcn[cl[c]]++;
1858 }
1859 for (b = 0; b < clcn; b++) {
1860 closev = 0;
1861 for (c = 0; c < layer1_size; c++) {
1862 cent[layer1_size * b + c] /= centcn[b];
1863 closev += cent[layer1_size * b + c]
1864 * cent[layer1_size * b + c];
1865 }
1866 closev = sqrt(closev);
1867 for (c = 0; c < layer1_size; c++)
1868 cent[layer1_size * b + c] /= closev;
1869 }
1870 for (c = 0; c < vocab_size; c++) {
1871 closev = -10;
1872 closeid = 0;
1873 for (d = 0; d < clcn; d++) {
1874 x = 0;
1875 for (b = 0; b < layer1_size; b++)
1876 x += cent[layer1_size * d + b]
1877 * syn0[c * layer1_size + b];
1878 if (x > closev) {
1879 closev = x;
1880 closeid = d;
1881 }
1882 }
1883 cl[c] = closeid;
1884 }
1885 }
1886 // Save the K-means classes
1887 for (a = 0; a < vocab_size; a++)
1888 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1889 free(centcn);
1890 free(cent);
1891 free(cl);
1892 }
1893 fclose(fo);
1894 if (save_net_file[0] != 0)
1895 SaveNet();
1896}
1897
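/* ArgPos returns the index of <str> in argv, or -1 if the flag is absent; it exits if the
 * flag is the last argument and therefore has no value. Typical use (see main() below):
 *
 *   if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
 *       layer1_size = atoi(argv[i + 1]);
 */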
1898int ArgPos(char *str, int argc, char **argv) {
1899 int a;
1900 for (a = 1; a < argc; a++)
1901 if (!strcmp(str, argv[a])) {
1902 if (a == argc - 1) {
1903 printf("Argument missing for %s\n", str);
1904 exit(1);
1905 }
1906 return a;
1907 }
1908 return -1;
1909}
1910
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001911void print_help() {
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001912 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1913 printf("Options:\n");
1914 printf("Parameters for training:\n");
1915 printf("\t-train <file>\n");
1916 printf("\t\tUse text data from <file> to train the model\n");
1917 printf("\t-output <file>\n");
1918 printf(
1919 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1920 printf("\t-size <int>\n");
1921 printf("\t\tSet size of word vectors; default is 100\n");
1922 printf("\t-window <int>\n");
1923 printf("\t\tSet max skip length between words; default is 5\n");
1924 printf("\t-sample <float>\n");
1925 printf(
1926 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1927 printf(
1928 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1929 printf("\t-hs <int>\n");
1930 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1931 printf("\t-negative <int>\n");
1932 printf(
1933 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1934 printf("\t-negative-classes <file>\n");
1935 printf("\t\tNegative classes to sample from\n");
1936 printf("\t-nce <int>\n");
1937 printf(
1938 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1939 printf("\t-threads <int>\n");
1940 printf("\t\tUse <int> threads (default 12)\n");
1941 printf("\t-iter <int>\n");
1942 printf("\t\tRun more training iterations (default 5)\n");
1943 printf("\t-min-count <int>\n");
1944 printf(
1945 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1946 printf("\t-alpha <float>\n");
1947 printf(
1948 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1949 printf("\t-classes <int>\n");
1950 printf(
1951 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1952 printf("\t-debug <int>\n");
1953 printf(
1954 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1955 printf("\t-binary <int>\n");
1956 printf(
1957			"\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1958 printf("\t-save-vocab <file>\n");
1959 printf("\t\tThe vocabulary will be saved to <file>\n");
1960 printf("\t-read-vocab <file>\n");
1961 printf(
1962 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1963 printf("\t-read-net <file>\n");
1964 printf(
1965 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1966 printf("\t-save-net <file>\n");
1967 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001968 printf("\t-show-cc <int>\n");
1969 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001970 printf("\t-type <int>\n");
1971 printf(
1972			"\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
1973 printf("\t-cap <int>\n");
1974 printf(
1975			"\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1976 printf("\nExamples:\n");
1977 printf(
1978 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001979}
1980
1981int main(int argc, char **argv) {
1982 int i;
1983 setlocale(LC_ALL, "");
1984 if (argc == 1) {
1985 print_help();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001986 return 0;
1987 }
1988 output_file[0] = 0;
1989 save_vocab_file[0] = 0;
1990 read_vocab_file[0] = 0;
1991 save_net_file[0] = 0;
1992 read_net_file[0] = 0;
1993 negative_classes_file[0] = 0;
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001994 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
1995 print_help();
1996 return(0);
1997 }
1998 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
1999 print_help();
2000 return(0);
2001 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002002 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2003 layer1_size = atoi(argv[i + 1]);
2004 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2005 strcpy(train_file, argv[i + 1]);
2006 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2007 strcpy(save_vocab_file, argv[i + 1]);
2008 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2009 strcpy(read_vocab_file, argv[i + 1]);
2010 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2011 strcpy(save_net_file, argv[i + 1]);
2012 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2013 strcpy(read_net_file, argv[i + 1]);
2014 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2015 debug_mode = atoi(argv[i + 1]);
2016 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2017 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002018 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2019 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002020 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2021 type = atoi(argv[i + 1]);
2022 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2023 strcpy(output_file, argv[i + 1]);
2024 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2025 window = atoi(argv[i + 1]);
2026 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2027 sample = atof(argv[i + 1]);
2028 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2029 hs = atoi(argv[i + 1]);
2030 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2031 negative = atoi(argv[i + 1]);
2032 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2033 strcpy(negative_classes_file, argv[i + 1]);
2034 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2035 nce = atoi(argv[i + 1]);
2036 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2037 num_threads = atoi(argv[i + 1]);
2038 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2039 iter = atoi(argv[i + 1]);
2040 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2041 min_count = atoi(argv[i + 1]);
2042 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2043 classes = atoi(argv[i + 1]);
2044 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2045 cap = atoi(argv[i + 1]);
2046 if (type == 0 || type == 2 || type == 4)
2047 alpha = 0.05;
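	// CBOW-style types (0, 2, 4) default to the larger learning rate 0.05;
	// an explicit -alpha (parsed next) overrides this.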
2048 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2049 alpha = atof(argv[i + 1]);
2050 vocab = (struct vocab_word *) calloc(vocab_max_size,
2051 sizeof(struct vocab_word));
2052 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2053 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2054 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2055 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2056		expTable[i] = expTable[i] / (expTable[i] + 1); // i.e. the sigmoid e^x / (e^x + 1)
2057 }
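	/* Lookup sketch (the pattern used throughout this file):
	 *   if (f > -MAX_EXP && f < MAX_EXP)
	 *       f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
	 * maps f from [-MAX_EXP, MAX_EXP] to a table index, yielding an approximation of sigmoid(f).
	 */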
Marc Kupietz210b9d52016-04-02 21:48:13 +02002058 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002059 TrainModel();
2060 return 0;
2061}
2062