Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
21#include <pthread.h>
22
23#define MAX_STRING 100
24#define EXP_TABLE_SIZE 1000
25#define MAX_EXP 6
26#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010027#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010028#define MAX_CODE_LENGTH 40
29
30const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
31
32typedef float real; // Precision of float numbers
33
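// For each vocabulary entry: cn is the corpus frequency, point and code hold
// the Huffman-tree path and binary code used by hierarchical softmax, and
// codelen is the length of that code.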
34struct vocab_word {
35 long long cn;
36 int *point;
37 char *word, *code, codelen;
38};
39
40char train_file[MAX_STRING], output_file[MAX_STRING];
41char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
42char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
43struct vocab_word *vocab;
44int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietzc2731b22016-07-14 08:56:14 +020045 num_threads = 12, min_reduce = 1;
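// 'type' selects the architecture trained in TrainModelThread below:
// 0 = CBOW, 1 = skip-gram, 2 = CWINDOW (position-dependent context vectors),
// 3 = structured skip-gram, 4 = SENNA-style window network with a hidden layer.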
Marc Kupietzd6f9c712016-03-16 11:50:56 +010046int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020047long long *threadPos;
48int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010049long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
50long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
51 classes = 0;
52real alpha = 0.025, starting_alpha, sample = 1e-3;
53real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020054real avgWordLength=0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010055clock_t start;
56
57real *syn1_window, *syn1neg_window, *syn1nce_window;
58int w_offset, window_layer_size;
59
60int window_hidden_size = 500;
61real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
62 *syn_hidden_word_nce;
63
64int hs = 0, negative = 5;
65const int table_size = 1e8;
66int *table;
67
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010068long cc = 0;
69
70//contrastive negative sampling
71char negative_classes_file[MAX_STRING];
72int *word_to_group;
73int *group_to_table; //group_size*table_size
74int class_number;
75
76//nce
77real* noise_distribution;
78int nce = 0;
79
80//param caps
81real CAP_VALUE = 50;
82int cap = 0;
83
84void capParam(real* array, int index) {
85 if (array[index] > CAP_VALUE)
86 array[index] = CAP_VALUE;
87 else if (array[index] < -CAP_VALUE)
88 array[index] = -CAP_VALUE;
89}
90
91real hardTanh(real x) {
92 if (x >= 1) {
93 return 1;
94 } else if (x <= -1) {
95 return -1;
96 } else {
97 return x;
98 }
99}
100
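// dHardTanh gates the gradient for the hard-tanh units of the type-4 network:
// it returns 0 when the unit is saturated and the gradient would push it
// further outside [-1, 1], and 1 otherwise.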
101real dHardTanh(real x, real g) {
102 if (x > 1 && g > 0) {
103 return 0;
104 }
105 if (x < -1 && g < 0) {
106 return 0;
107 }
108 return 1;
109}
110
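// The unigram table realizes the negative-sampling noise distribution: each
// word fills a share of the table proportional to cn^0.75, so drawing a
// uniform random slot, e.g. table[(next_random >> 16) % table_size], samples
// word w with probability cn(w)^0.75 / sum_v cn(v)^0.75.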
111void InitUnigramTable() {
112 int a, i;
113 long long train_words_pow = 0;
114 real d1, power = 0.75;
115 table = (int *) malloc(table_size * sizeof(int));
116 for (a = 0; a < vocab_size; a++)
117 train_words_pow += pow(vocab[a].cn, power);
118 i = 0;
119 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
120 for (a = 0; a < table_size; a++) {
121 table[a] = i;
122 if (a / (real) table_size > d1) {
123 i++;
124 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
125 }
126 if (i >= vocab_size)
127 i = vocab_size - 1;
128 }
129
130 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
131 for (a = 0; a < vocab_size; a++)
132 noise_distribution[a] = pow(vocab[a].cn, power)
133 / (real) train_words_pow;
134}
135
136// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
137void ReadWord(char *word, FILE *fin) {
138 int a = 0, ch;
139 while (!feof(fin)) {
140 ch = fgetc(fin);
141 if (ch == 13)
142 continue;
143 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
144 if (a > 0) {
145 if (ch == '\n')
146 ungetc(ch, fin);
147 break;
148 }
149 if (ch == '\n') {
150 strcpy(word, (char *) "</s>");
151 return;
152 } else
153 continue;
154 }
155 word[a] = ch;
156 a++;
157 if (a >= MAX_STRING - 1)
158 a--; // Truncate too long words
159 }
160 word[a] = 0;
161}
162
163// Returns hash value of a word
164int GetWordHash(char *word) {
165 unsigned long long a, hash = 0;
166 for (a = 0; a < strlen(word); a++)
167 hash = hash * 257 + word[a];
168 hash = hash % vocab_hash_size;
169 return hash;
170}
171
172// Returns position of a word in the vocabulary; if the word is not found, returns -1
173int SearchVocab(char *word) {
174 unsigned int hash = GetWordHash(word);
175 while (1) {
176 if (vocab_hash[hash] == -1)
177 return -1;
178 if (!strcmp(word, vocab[vocab_hash[hash]].word))
179 return vocab_hash[hash];
180 hash = (hash + 1) % vocab_hash_size;
181 }
182 return -1;
183}
184
185// Reads a word and returns its index in the vocabulary
186int ReadWordIndex(FILE *fin) {
187 char word[MAX_STRING];
188 ReadWord(word, fin);
189 if (feof(fin))
190 return -1;
191 return SearchVocab(word);
192}
193
194// Adds a word to the vocabulary
195int AddWordToVocab(char *word) {
196 unsigned int hash, length = strlen(word) + 1;
197 if (length > MAX_STRING)
198 length = MAX_STRING;
199 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
200 strcpy(vocab[vocab_size].word, word);
201 vocab[vocab_size].cn = 0;
202 vocab_size++;
203 // Reallocate memory if needed
204 if (vocab_size + 2 >= vocab_max_size) {
205 vocab_max_size += 1000;
206 vocab = (struct vocab_word *) realloc(vocab,
207 vocab_max_size * sizeof(struct vocab_word));
208 }
209 hash = GetWordHash(word);
210 while (vocab_hash[hash] != -1)
211 hash = (hash + 1) % vocab_hash_size;
212 vocab_hash[hash] = vocab_size - 1;
213 return vocab_size - 1;
214}
215
216// Used later for sorting by word counts
217int VocabCompare(const void *a, const void *b) {
218 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
219}
220
221// Sorts the vocabulary by frequency using word counts
222void SortVocab() {
223 int a, size;
224 unsigned int hash;
225 // Sort the vocabulary and keep </s> at the first position
226 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
227 for (a = 0; a < vocab_hash_size; a++)
228 vocab_hash[a] = -1;
229 size = vocab_size;
230 train_words = 0;
231 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200232 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
233		// Words occurring less than min_count times will be discarded from the vocab
234 if ((vocab[a].cn < min_count) && (a != 0)) {
235 vocab_size--;
236 free(vocab[a].word);
237 } else {
238			// Hash will be re-computed, as it is no longer valid after sorting
239 hash = GetWordHash(vocab[a].word);
240 while (vocab_hash[hash] != -1)
241 hash = (hash + 1) % vocab_hash_size;
242 vocab_hash[hash] = a;
243 train_words += vocab[a].cn;
244 }
245 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200246 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100247 vocab = (struct vocab_word *) realloc(vocab,
248 (vocab_size + 1) * sizeof(struct vocab_word));
249 // Allocate memory for the binary tree construction
250 for (a = 0; a < vocab_size; a++) {
251 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
252 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
253 }
254}
255
256// Reduces the vocabulary by removing infrequent tokens
257void ReduceVocab() {
258 int a, b = 0;
259 unsigned int hash;
260 for (a = 0; a < vocab_size; a++)
261 if (vocab[a].cn > min_reduce) {
262 vocab[b].cn = vocab[a].cn;
263 vocab[b].word = vocab[a].word;
264 b++;
265 } else
266 free(vocab[a].word);
267 vocab_size = b;
268 for (a = 0; a < vocab_hash_size; a++)
269 vocab_hash[a] = -1;
270 for (a = 0; a < vocab_size; a++) {
271		// Hash will be re-computed, as it is no longer valid
272 hash = GetWordHash(vocab[a].word);
273 while (vocab_hash[hash] != -1)
274 hash = (hash + 1) % vocab_hash_size;
275 vocab_hash[hash] = a;
276 }
277 fflush(stdout);
278 min_reduce++;
279}
280
281// Create binary Huffman tree using the word counts
282// Frequent words will have short unique binary codes
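// The construction keeps two frontiers: pos1 walks down the count-sorted
// vocabulary while pos2 walks up the newly created internal nodes, so the two
// smallest remaining nodes can be found without a priority queue.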
283void CreateBinaryTree() {
284 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
285 char code[MAX_CODE_LENGTH];
286 long long *count = (long long *) calloc(vocab_size * 2 + 1,
287 sizeof(long long));
288 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
289 sizeof(long long));
290 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
291 sizeof(long long));
292 for (a = 0; a < vocab_size; a++)
293 count[a] = vocab[a].cn;
294 for (a = vocab_size; a < vocab_size * 2; a++)
295 count[a] = 1e15;
296 pos1 = vocab_size - 1;
297 pos2 = vocab_size;
298 // Following algorithm constructs the Huffman tree by adding one node at a time
299 for (a = 0; a < vocab_size - 1; a++) {
300 // First, find two smallest nodes 'min1, min2'
301 if (pos1 >= 0) {
302 if (count[pos1] < count[pos2]) {
303 min1i = pos1;
304 pos1--;
305 } else {
306 min1i = pos2;
307 pos2++;
308 }
309 } else {
310 min1i = pos2;
311 pos2++;
312 }
313 if (pos1 >= 0) {
314 if (count[pos1] < count[pos2]) {
315 min2i = pos1;
316 pos1--;
317 } else {
318 min2i = pos2;
319 pos2++;
320 }
321 } else {
322 min2i = pos2;
323 pos2++;
324 }
325 count[vocab_size + a] = count[min1i] + count[min2i];
326 parent_node[min1i] = vocab_size + a;
327 parent_node[min2i] = vocab_size + a;
328 binary[min2i] = 1;
329 }
330 // Now assign binary code to each vocabulary word
331 for (a = 0; a < vocab_size; a++) {
332 b = a;
333 i = 0;
334 while (1) {
335 code[i] = binary[b];
336 point[i] = b;
337 i++;
338 b = parent_node[b];
339 if (b == vocab_size * 2 - 2)
340 break;
341 }
342 vocab[a].codelen = i;
343 vocab[a].point[0] = vocab_size - 2;
344 for (b = 0; b < i; b++) {
345 vocab[a].code[i - b - 1] = code[b];
346 vocab[a].point[i - b] = point[b] - vocab_size;
347 }
348 }
349 free(count);
350 free(binary);
351 free(parent_node);
352}
353
354void LearnVocabFromTrainFile() {
355 char word[MAX_STRING];
356 FILE *fin;
357 long long a, i;
358 for (a = 0; a < vocab_hash_size; a++)
359 vocab_hash[a] = -1;
360 fin = fopen(train_file, "rb");
361 if (fin == NULL) {
362 printf("ERROR: training data file not found!\n");
363 exit(1);
364 }
365 vocab_size = 0;
366 AddWordToVocab((char *) "</s>");
367 while (1) {
368 ReadWord(word, fin);
369 if (feof(fin))
370 break;
371 train_words++;
372 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
373 printf("%lldK%c", train_words / 1000, 13);
374 fflush(stdout);
375 }
376 i = SearchVocab(word);
377 if (i == -1) {
378 a = AddWordToVocab(word);
379 vocab[a].cn = 1;
380 } else
381 vocab[i].cn++;
382 if (vocab_size > vocab_hash_size * 0.7)
383 ReduceVocab();
384 }
385 SortVocab();
386 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200387 printf("Vocab size: %'lld\n", vocab_size);
388 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100389 }
390 file_size = ftell(fin);
391 fclose(fin);
392}
393
394void SaveVocab() {
395 long long i;
396 FILE *fo = fopen(save_vocab_file, "wb");
397 for (i = 0; i < vocab_size; i++)
398 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
399 fclose(fo);
400}
401
402void ReadVocab() {
403 long long a, i = 0;
404 char c;
405 char word[MAX_STRING];
406 FILE *fin = fopen(read_vocab_file, "rb");
407 if (fin == NULL) {
408 printf("Vocabulary file not found\n");
409 exit(1);
410 }
411 for (a = 0; a < vocab_hash_size; a++)
412 vocab_hash[a] = -1;
413 vocab_size = 0;
414 while (1) {
415 ReadWord(word, fin);
416 if (feof(fin))
417 break;
418 a = AddWordToVocab(word);
419 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
420 i++;
421 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200422 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100423 fin = fopen(train_file, "rb");
424 if (fin == NULL) {
425 printf("ERROR: training data file not found!\n");
426 exit(1);
427 }
428 fseek(fin, 0, SEEK_END);
429 file_size = ftell(fin);
430 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200431 SortVocab();
432 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200433 printf("Vocab size: %'lld\n", vocab_size);
434 printf("Words in vocab's train file: %'lld\n", train_words);
435 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200436 }
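	// The training file is not scanned when the vocabulary is read from disk, so
	// the token count is estimated from the file size and the count-weighted
	// average word length computed in SortVocab.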
Marc Kupietze23c5402016-07-14 11:10:09 +0200437 train_words = file_size / avgWordLength;
438 if(debug_mode > 0)
439 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100440}
441
442void InitClassUnigramTable() {
443 long long a, c;
444 printf("loading class unigrams \n");
445 FILE *fin = fopen(negative_classes_file, "rb");
446 if (fin == NULL) {
447 printf("ERROR: class file not found!\n");
448 exit(1);
449 }
450 word_to_group = (int *) malloc(vocab_size * sizeof(int));
451 for (a = 0; a < vocab_size; a++)
452 word_to_group[a] = -1;
453 char class[MAX_STRING];
454 char prev_class[MAX_STRING];
455 prev_class[0] = 0;
456 char word[MAX_STRING];
457 class_number = -1;
458 while (1) {
459 if (feof(fin))
460 break;
461 ReadWord(class, fin);
462 ReadWord(word, fin);
463 int word_index = SearchVocab(word);
464 if (word_index != -1) {
465 if (strcmp(class, prev_class) != 0) {
466 class_number++;
467 strcpy(prev_class, class);
468 }
469 word_to_group[word_index] = class_number;
470 }
471 ReadWord(word, fin);
472 }
473 class_number++;
474 fclose(fin);
475
476 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
477 long long train_words_pow = 0;
478 real d1, power = 0.75;
479
480 for (c = 0; c < class_number; c++) {
481 long long offset = c * table_size;
482 train_words_pow = 0;
483 for (a = 0; a < vocab_size; a++)
484 if (word_to_group[a] == c)
485 train_words_pow += pow(vocab[a].cn, power);
486 int i = 0;
487		while (i < vocab_size && word_to_group[i] != c) // check bounds before indexing
488 i++;
489 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
490 for (a = 0; a < table_size; a++) {
491 //printf("index %lld , word %d\n", a, i);
492 group_to_table[offset + a] = i;
493 if (a / (real) table_size > d1) {
494 i++;
495				while (i < vocab_size && word_to_group[i] != c)
496 i++;
497 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
498 }
499 if (i >= vocab_size)
500 while (word_to_group[i] != c && i >= 0)
501 i--;
502 }
503 }
504}
505
Marc Kupietz210b9d52016-04-02 21:48:13 +0200506void SaveArgs(int argc, char **argv) {
507 unsigned int i;
Marc Kupietz44136742017-12-22 17:52:56 +0100508 char args_file[MAX_STRING];
509 strcpy(args_file, output_file);
Marc Kupietz210b9d52016-04-02 21:48:13 +0200510 strcat(args_file, ".args");
511 FILE *fargs = fopen(args_file, "w");
512 if (fargs == NULL) {
513 printf("Cannot save args to %s.\n", args_file);
514 return;
515 }
516
Marc Kupietz44136742017-12-22 17:52:56 +0100517 for(i=1; i<argc; i++)
518 fprintf(fargs, "%s ", argv[i]);
519
520 fprintf(fargs, "\n");
Marc Kupietz210b9d52016-04-02 21:48:13 +0200521 fclose(fargs);
Marc Kupietz44136742017-12-22 17:52:56 +0100522
Marc Kupietz210b9d52016-04-02 21:48:13 +0200523 return;
524}
525
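// SaveNet dumps the raw weight matrices (syn0 and syn1neg_window) so that a
// later run can restore them via read_net_file in InitNet; only the structured
// skip-gram (type 3) with negative sampling is supported.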
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100526void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100527 if(type != 3 || negative <= 0) {
528 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
529 return;
530 }
531
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100532 FILE *fnet = fopen(save_net_file, "wb");
533 if (fnet == NULL) {
534 printf("Net parameter file not found\n");
535 exit(1);
536 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100537 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100538 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100539 fclose(fnet);
540}
541
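// InitNet allocates the input embeddings (syn0) and, depending on hs,
// negative, nce and the architecture, the matching output weight matrices.
// syn0 is filled with small random values unless a previously saved net is
// read back (type 3 with negative sampling only).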
542void InitNet() {
543 long long a, b;
544 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100545 long long read;
546
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100547 window_layer_size = layer1_size * window * 2;
548 a = posix_memalign((void **) &syn0, 128,
549 (long long) vocab_size * layer1_size * sizeof(real));
550 if (syn0 == NULL) {
551 printf("Memory allocation failed\n");
552 exit(1);
553 }
554
555 if (hs) {
556 a = posix_memalign((void **) &syn1, 128,
557 (long long) vocab_size * layer1_size * sizeof(real));
558 if (syn1 == NULL) {
559 printf("Memory allocation failed\n");
560 exit(1);
561 }
562 a = posix_memalign((void **) &syn1_window, 128,
563 (long long) vocab_size * window_layer_size * sizeof(real));
564 if (syn1_window == NULL) {
565 printf("Memory allocation failed\n");
566 exit(1);
567 }
568 a = posix_memalign((void **) &syn_hidden_word, 128,
569 (long long) vocab_size * window_hidden_size * sizeof(real));
570 if (syn_hidden_word == NULL) {
571 printf("Memory allocation failed\n");
572 exit(1);
573 }
574
575 for (a = 0; a < vocab_size; a++)
576 for (b = 0; b < layer1_size; b++)
577 syn1[a * layer1_size + b] = 0;
578 for (a = 0; a < vocab_size; a++)
579 for (b = 0; b < window_layer_size; b++)
580 syn1_window[a * window_layer_size + b] = 0;
581 for (a = 0; a < vocab_size; a++)
582 for (b = 0; b < window_hidden_size; b++)
583 syn_hidden_word[a * window_hidden_size + b] = 0;
584 }
585 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100586 if(type == 0) {
587 a = posix_memalign((void **) &syn1neg, 128,
588 (long long) vocab_size * layer1_size * sizeof(real));
589 if (syn1neg == NULL) {
590 printf("Memory allocation failed\n");
591 exit(1);
592 }
593 for (a = 0; a < vocab_size; a++)
594 for (b = 0; b < layer1_size; b++)
595 syn1neg[a * layer1_size + b] = 0;
596 } else if (type == 3) {
597 a = posix_memalign((void **) &syn1neg_window, 128,
598 (long long) vocab_size * window_layer_size * sizeof(real));
599 if (syn1neg_window == NULL) {
600 printf("Memory allocation failed\n");
601 exit(1);
602 }
603 for (a = 0; a < vocab_size; a++)
604 for (b = 0; b < window_layer_size; b++)
605 syn1neg_window[a * window_layer_size + b] = 0;
606 } else if (type == 4) {
607 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
608 (long long) vocab_size * window_hidden_size * sizeof(real));
609 if (syn_hidden_word_neg == NULL) {
610 printf("Memory allocation failed\n");
611 exit(1);
612 }
613 for (a = 0; a < vocab_size; a++)
614 for (b = 0; b < window_hidden_size; b++)
615 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100616 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100617 }
618 if (nce > 0) {
619 a = posix_memalign((void **) &syn1nce, 128,
620 (long long) vocab_size * layer1_size * sizeof(real));
621 if (syn1nce == NULL) {
622 printf("Memory allocation failed\n");
623 exit(1);
624 }
625 a = posix_memalign((void **) &syn1nce_window, 128,
626 (long long) vocab_size * window_layer_size * sizeof(real));
627 if (syn1nce_window == NULL) {
628 printf("Memory allocation failed\n");
629 exit(1);
630 }
631 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
632 (long long) vocab_size * window_hidden_size * sizeof(real));
633 if (syn_hidden_word_nce == NULL) {
634 printf("Memory allocation failed\n");
635 exit(1);
636 }
637
638 for (a = 0; a < vocab_size; a++)
639 for (b = 0; b < layer1_size; b++)
640 syn1nce[a * layer1_size + b] = 0;
641 for (a = 0; a < vocab_size; a++)
642 for (b = 0; b < window_layer_size; b++)
643 syn1nce_window[a * window_layer_size + b] = 0;
644 for (a = 0; a < vocab_size; a++)
645 for (b = 0; b < window_hidden_size; b++)
646 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
647 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100648
Marc Kupietz1006a272016-03-16 15:50:20 +0100649 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100650 a = posix_memalign((void **) &syn_window_hidden, 128,
651 window_hidden_size * window_layer_size * sizeof(real));
652 if (syn_window_hidden == NULL) {
653 printf("Memory allocation failed\n");
654 exit(1);
655 }
656 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
657 next_random = next_random * (unsigned long long) 25214903917 + 11;
658 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
659 - 0.5) / (window_hidden_size * window_layer_size);
660 }
661 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100662
663 if (read_net_file[0] == 0) {
664 for (a = 0; a < vocab_size; a++)
665 for (b = 0; b < layer1_size; b++) {
666 next_random = next_random * (unsigned long long) 25214903917
667 + 11;
668 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
669 / (real) 65536) - 0.5) / layer1_size;
670 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100671 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100672 FILE *fnet = fopen(read_net_file, "rb");
673 if (fnet == NULL) {
674 printf("Net parameter file not found\n");
675 exit(1);
676 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100677 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
678 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
679 if(read != vocab_size * layer1_size) {
680 fprintf(stderr, "read-net failed %lld\n", read);
681 exit(-1);
682 }
683 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
684 if(read != (long long) vocab_size * window_layer_size) {
685	  fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read,
686		  (long long) vocab_size * window_layer_size);
687 exit(-1);
688 }
689 fgetc(fnet);
690 if(!feof(fnet)) {
691 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
692 exit(-1);
693 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100694 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100695 } else {
696 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
697 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100698 }
699
700 CreateBinaryTree();
701}
702
Marc Kupietz202723e2016-07-14 09:12:00 +0200703char *currentDateTime(char *buf, real offset) {
704 time_t t;
705 time(&t);
706 t += (long) offset;
707 struct tm tstruct;
708 tstruct = *localtime(&t);
709 strftime(buf, 80, "%c", &tstruct);
710 return buf;
711}
712
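// MonitorThread derives overall progress once per second from each training
// thread's file position (threadPos) and remaining iterations (threadIters)
// and prints learning rate, throughput, elapsed time, time to go and an ETA.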
713void *MonitorThread(void *id) {
714	char *timebuf = malloc(80);
715 int i, n=num_threads;
716 long long sum;
717 sleep(1);
718 while(n > 0) {
719 sleep(1);
720 sum = n = 0;
721 for(i=0; i < num_threads; i++) {
722 if(threadPos[i] >= 0) {
723 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
724 n++;
725 } else {
726 sum += iter * file_size / num_threads;
727 }
728 }
729 if(n == 0)
730 break;
731 real finished_portion = (real) sum / (float) (file_size * iter);
732 long long now = clock();
733 long long elapsed = (now - start) / CLOCKS_PER_SEC / num_threads;
734 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed) * ((real) num_threads / n) ;
735
736 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/t/s TE: %llds TTG: %llds ETA: %s\033[K",
737 alpha,
738 finished_portion * 100,
739 (float) sum / elapsed / num_threads / 1000,
740 elapsed,
741 ttg,
742 currentDateTime(timebuf, ttg)
743 );
744 fflush(stdout);
745 }
746 pthread_exit(NULL);
747}
748
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100749void *TrainModelThread(void *id) {
750 long long a, b, d, cw, word, last_word, sentence_length = 0,
751 sentence_position = 0;
752 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
753 long long l1, l2, c, target, label, local_iter = iter;
754 unsigned long long next_random = (long long) id;
755 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100756 int input_len_1 = layer1_size;
757 int window_offset = -1;
758 if (type == 2 || type == 4) {
759 input_len_1 = window_layer_size;
760 }
761 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
762 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200763 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100764
765 int input_len_2 = 0;
766 if (type == 4) {
767 input_len_2 = window_hidden_size;
768 }
769 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
770 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
771
772 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200773 long long start_pos = file_size / (long long) num_threads * (long long) id;
774 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
775 long long current_pos = start_pos;
776	long long last_pos = start_pos;
777 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100778 while (1) {
Marc Kupietz202723e2016-07-14 09:12:00 +0200779 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100780 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200781 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100782 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100783 alpha = starting_alpha
784 * (1 - word_count_actual / (real) (iter * train_words + 1));
785 if (alpha < starting_alpha * 0.0001)
786 alpha = starting_alpha * 0.0001;
787 }
788 if (sentence_length == 0) {
789 while (1) {
790 word = ReadWordIndex(fi);
791 if (feof(fi))
792 break;
793 if (word == -1)
794 continue;
795 word_count++;
796 if (word == 0)
797 break;
798 // The subsampling randomly discards frequent words while keeping the ranking same
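				// A word with relative frequency f = cn / train_words is kept with
				// probability (sqrt(f / sample) + 1) * sample / f, so very frequent
				// words are aggressively thinned out while rare words are always kept.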
799 if (sample > 0) {
800 real ran = (sqrt(vocab[word].cn / (sample * train_words))
801 + 1) * (sample * train_words) / vocab[word].cn;
802 next_random = next_random * (unsigned long long) 25214903917
803 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100804 if (ran < (next_random & 0xFFFF) / (real) 65536) {
805 if(type == 3) // in structured skipgrams
806 word = -2; // keep the window position correct
807 else
808 continue;
809 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100810 }
811 sen[sentence_length] = word;
812 sentence_length++;
813 if (sentence_length >= MAX_SENTENCE_LENGTH)
814 break;
815 }
816 sentence_position = 0;
817 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200818 current_pos = threadPos[(long) id] = ftell(fi);
819 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100820 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200821 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100822 local_iter--;
823 if (local_iter == 0)
824 break;
825 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200826 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100827 last_word_count = 0;
828 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200829 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100830 continue;
831 }
832 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200833 while (word == -2 && sentence_position<sentence_length)
834 word = sen[++sentence_position];
835 if (sentence_position>=sentence_length) {
836 sentence_length=0;
837 continue;
838 }
839 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100840 continue;
841 for (c = 0; c < input_len_1; c++)
842 neu1[c] = 0;
843 for (c = 0; c < input_len_1; c++)
844 neu1e[c] = 0;
845 for (c = 0; c < input_len_2; c++)
846 neu2[c] = 0;
847 for (c = 0; c < input_len_2; c++)
848 neu2e[c] = 0;
849 next_random = next_random * (unsigned long long) 25214903917 + 11;
850 b = next_random % window;
851 if (type == 0) { //train the cbow architecture
852 // in -> hidden
853 cw = 0;
854 for (a = b; a < window * 2 + 1 - b; a++)
855 if (a != window) {
856 c = sentence_position - window + a;
857 if (c < 0)
858 continue;
859 if (c >= sentence_length)
860 continue;
861 last_word = sen[c];
862 if (last_word == -1)
863 continue;
864 for (c = 0; c < layer1_size; c++)
865 neu1[c] += syn0[c + last_word * layer1_size];
866 cw++;
867 }
868 if (cw) {
869 for (c = 0; c < layer1_size; c++)
870 neu1[c] /= cw;
871 if (hs)
872 for (d = 0; d < vocab[word].codelen; d++) {
873 f = 0;
874 l2 = vocab[word].point[d] * layer1_size;
875 // Propagate hidden -> output
876 for (c = 0; c < layer1_size; c++)
877 f += neu1[c] * syn1[c + l2];
878 if (f <= -MAX_EXP)
879 continue;
880 else if (f >= MAX_EXP)
881 continue;
882 else
883 f = expTable[(int) ((f + MAX_EXP)
884 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
885 // 'g' is the gradient multiplied by the learning rate
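					// f already holds sigmoid(f) from the expTable lookup, and
					// (1 - code - f) is the gradient of this Huffman node's binary
					// log-likelihood with respect to f.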
886 g = (1 - vocab[word].code[d] - f) * alpha;
887 // Propagate errors output -> hidden
888 for (c = 0; c < layer1_size; c++)
889 neu1e[c] += g * syn1[c + l2];
890 // Learn weights hidden -> output
891 for (c = 0; c < layer1_size; c++)
892 syn1[c + l2] += g * neu1[c];
893 if (cap == 1)
894 for (c = 0; c < layer1_size; c++)
895 capParam(syn1, c + l2);
896 }
897 // NEGATIVE SAMPLING
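				// One positive example (d == 0, the current word) plus 'negative' words
				// drawn from the unigram table; each update uses the logistic-regression
				// gradient g = (label - sigmoid(f)) * alpha, with sigmoid(f) taken from expTable.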
898 if (negative > 0)
899 for (d = 0; d < negative + 1; d++) {
900 if (d == 0) {
901 target = word;
902 label = 1;
903 } else {
904 next_random = next_random
905 * (unsigned long long) 25214903917 + 11;
906 if (word_to_group != NULL
907 && word_to_group[word] != -1) {
908 target = word;
909 while (target == word) {
910 target = group_to_table[word_to_group[word]
911 * table_size
912 + (next_random >> 16) % table_size];
913 next_random = next_random
914 * (unsigned long long) 25214903917
915 + 11;
916 }
917 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
918 } else {
919 target =
920 table[(next_random >> 16) % table_size];
921 }
922 if (target == 0)
923 target = next_random % (vocab_size - 1) + 1;
924 if (target == word)
925 continue;
926 label = 0;
927 }
928 l2 = target * layer1_size;
929 f = 0;
930 for (c = 0; c < layer1_size; c++)
931 f += neu1[c] * syn1neg[c + l2];
932 if (f > MAX_EXP)
933 g = (label - 1) * alpha;
934 else if (f < -MAX_EXP)
935 g = (label - 0) * alpha;
936 else
937 g = (label
938 - expTable[(int) ((f + MAX_EXP)
939 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
940 * alpha;
941 for (c = 0; c < layer1_size; c++)
942 neu1e[c] += g * syn1neg[c + l2];
943 for (c = 0; c < layer1_size; c++)
944 syn1neg[c + l2] += g * neu1[c];
945 if (cap == 1)
946 for (c = 0; c < layer1_size; c++)
947 capParam(syn1neg, c + l2);
948 }
949 // Noise Contrastive Estimation
950 if (nce > 0)
951 for (d = 0; d < nce + 1; d++) {
952 if (d == 0) {
953 target = word;
954 label = 1;
955 } else {
956 next_random = next_random
957 * (unsigned long long) 25214903917 + 11;
958 if (word_to_group != NULL
959 && word_to_group[word] != -1) {
960 target = word;
961 while (target == word) {
962 target = group_to_table[word_to_group[word]
963 * table_size
964 + (next_random >> 16) % table_size];
965 next_random = next_random
966 * (unsigned long long) 25214903917
967 + 11;
968 }
969 } else {
970 target =
971 table[(next_random >> 16) % table_size];
972 }
973 if (target == 0)
974 target = next_random % (vocab_size - 1) + 1;
975 if (target == word)
976 continue;
977 label = 0;
978 }
979 l2 = target * layer1_size;
980 f = 0;
981
982 for (c = 0; c < layer1_size; c++)
983 f += neu1[c] * syn1nce[c + l2];
984 if (f > MAX_EXP)
985 g = (label - 1) * alpha;
986 else if (f < -MAX_EXP)
987 g = (label - 0) * alpha;
988 else {
989 f = exp(f);
990 g =
991 (label
992 - f
993 / (noise_distribution[target]
994 * nce + f)) * alpha;
995 }
996 for (c = 0; c < layer1_size; c++)
997 neu1e[c] += g * syn1nce[c + l2];
998 for (c = 0; c < layer1_size; c++)
999 syn1nce[c + l2] += g * neu1[c];
1000 if (cap == 1)
1001 for (c = 0; c < layer1_size; c++)
1002 capParam(syn1nce, c + l2);
1003 }
1004 // hidden -> in
1005 for (a = b; a < window * 2 + 1 - b; a++)
1006 if (a != window) {
1007 c = sentence_position - window + a;
1008 if (c < 0)
1009 continue;
1010 if (c >= sentence_length)
1011 continue;
1012 last_word = sen[c];
1013 if (last_word == -1)
1014 continue;
1015 for (c = 0; c < layer1_size; c++)
1016 syn0[c + last_word * layer1_size] += neu1e[c];
1017 }
1018 }
1019 } else if (type == 1) { //train skip-gram
1020 for (a = b; a < window * 2 + 1 - b; a++)
1021 if (a != window) {
1022 c = sentence_position - window + a;
1023 if (c < 0)
1024 continue;
1025 if (c >= sentence_length)
1026 continue;
1027 last_word = sen[c];
1028 if (last_word == -1)
1029 continue;
1030 l1 = last_word * layer1_size;
1031 for (c = 0; c < layer1_size; c++)
1032 neu1e[c] = 0;
1033 // HIERARCHICAL SOFTMAX
1034 if (hs)
1035 for (d = 0; d < vocab[word].codelen; d++) {
1036 f = 0;
1037 l2 = vocab[word].point[d] * layer1_size;
1038 // Propagate hidden -> output
1039 for (c = 0; c < layer1_size; c++)
1040 f += syn0[c + l1] * syn1[c + l2];
1041 if (f <= -MAX_EXP)
1042 continue;
1043 else if (f >= MAX_EXP)
1044 continue;
1045 else
1046 f = expTable[(int) ((f + MAX_EXP)
1047 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1048 // 'g' is the gradient multiplied by the learning rate
1049 g = (1 - vocab[word].code[d] - f) * alpha;
1050 // Propagate errors output -> hidden
1051 for (c = 0; c < layer1_size; c++)
1052 neu1e[c] += g * syn1[c + l2];
1053 // Learn weights hidden -> output
1054 for (c = 0; c < layer1_size; c++)
1055 syn1[c + l2] += g * syn0[c + l1];
1056 if (cap == 1)
1057 for (c = 0; c < layer1_size; c++)
1058 capParam(syn1, c + l2);
1059 }
1060 // NEGATIVE SAMPLING
1061 if (negative > 0)
1062 for (d = 0; d < negative + 1; d++) {
1063 if (d == 0) {
1064 target = word;
1065 label = 1;
1066 } else {
1067 next_random = next_random
1068 * (unsigned long long) 25214903917 + 11;
1069 if (word_to_group != NULL
1070 && word_to_group[word] != -1) {
1071 target = word;
1072 while (target == word) {
1073 target =
1074 group_to_table[word_to_group[word]
1075 * table_size
1076 + (next_random >> 16)
1077 % table_size];
1078 next_random =
1079 next_random
1080 * (unsigned long long) 25214903917
1081 + 11;
1082 }
1083 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1084 } else {
1085 target = table[(next_random >> 16)
1086 % table_size];
1087 }
1088 if (target == 0)
1089 target = next_random % (vocab_size - 1) + 1;
1090 if (target == word)
1091 continue;
1092 label = 0;
1093 }
1094 l2 = target * layer1_size;
1095 f = 0;
1096 for (c = 0; c < layer1_size; c++)
1097 f += syn0[c + l1] * syn1neg[c + l2];
1098 if (f > MAX_EXP)
1099 g = (label - 1) * alpha;
1100 else if (f < -MAX_EXP)
1101 g = (label - 0) * alpha;
1102 else
1103 g =
1104 (label
1105 - expTable[(int) ((f + MAX_EXP)
1106 * (EXP_TABLE_SIZE
1107 / MAX_EXP / 2))])
1108 * alpha;
1109 for (c = 0; c < layer1_size; c++)
1110 neu1e[c] += g * syn1neg[c + l2];
1111 for (c = 0; c < layer1_size; c++)
1112 syn1neg[c + l2] += g * syn0[c + l1];
1113 if (cap == 1)
1114 for (c = 0; c < layer1_size; c++)
1115 capParam(syn1neg, c + l2);
1116 }
1117 //Noise Contrastive Estimation
1118 if (nce > 0)
1119 for (d = 0; d < nce + 1; d++) {
1120 if (d == 0) {
1121 target = word;
1122 label = 1;
1123 } else {
1124 next_random = next_random
1125 * (unsigned long long) 25214903917 + 11;
1126 if (word_to_group != NULL
1127 && word_to_group[word] != -1) {
1128 target = word;
1129 while (target == word) {
1130 target =
1131 group_to_table[word_to_group[word]
1132 * table_size
1133 + (next_random >> 16)
1134 % table_size];
1135 next_random =
1136 next_random
1137 * (unsigned long long) 25214903917
1138 + 11;
1139 }
1140 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1141 } else {
1142 target = table[(next_random >> 16)
1143 % table_size];
1144 }
1145 if (target == 0)
1146 target = next_random % (vocab_size - 1) + 1;
1147 if (target == word)
1148 continue;
1149 label = 0;
1150 }
1151 l2 = target * layer1_size;
1152 f = 0;
1153 for (c = 0; c < layer1_size; c++)
1154 f += syn0[c + l1] * syn1nce[c + l2];
1155 if (f > MAX_EXP)
1156 g = (label - 1) * alpha;
1157 else if (f < -MAX_EXP)
1158 g = (label - 0) * alpha;
1159 else {
1160 f = exp(f);
1161 g = (label
1162 - f
1163 / (noise_distribution[target]
1164 * nce + f)) * alpha;
1165 }
1166 for (c = 0; c < layer1_size; c++)
1167 neu1e[c] += g * syn1nce[c + l2];
1168 for (c = 0; c < layer1_size; c++)
1169 syn1nce[c + l2] += g * syn0[c + l1];
1170 if (cap == 1)
1171 for (c = 0; c < layer1_size; c++)
1172 capParam(syn1nce, c + l2);
1173 }
1174 // Learn weights input -> hidden
1175 for (c = 0; c < layer1_size; c++)
1176 syn0[c + l1] += neu1e[c];
1177 }
1178 } else if (type == 2) { //train the cwindow architecture
1179 // in -> hidden
1180 cw = 0;
1181 for (a = 0; a < window * 2 + 1; a++)
1182 if (a != window) {
1183 c = sentence_position - window + a;
1184 if (c < 0)
1185 continue;
1186 if (c >= sentence_length)
1187 continue;
1188 last_word = sen[c];
1189 if (last_word == -1)
1190 continue;
1191 window_offset = a * layer1_size;
1192 if (a > window)
1193 window_offset -= layer1_size;
1194 for (c = 0; c < layer1_size; c++)
1195 neu1[c + window_offset] += syn0[c
1196 + last_word * layer1_size];
1197 cw++;
1198 }
1199 if (cw) {
1200 if (hs)
1201 for (d = 0; d < vocab[word].codelen; d++) {
1202 f = 0;
1203 l2 = vocab[word].point[d] * window_layer_size;
1204 // Propagate hidden -> output
1205 for (c = 0; c < window_layer_size; c++)
1206 f += neu1[c] * syn1_window[c + l2];
1207 if (f <= -MAX_EXP)
1208 continue;
1209 else if (f >= MAX_EXP)
1210 continue;
1211 else
1212 f = expTable[(int) ((f + MAX_EXP)
1213 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1214 // 'g' is the gradient multiplied by the learning rate
1215 g = (1 - vocab[word].code[d] - f) * alpha;
1216 // Propagate errors output -> hidden
1217 for (c = 0; c < window_layer_size; c++)
1218 neu1e[c] += g * syn1_window[c + l2];
1219 // Learn weights hidden -> output
1220 for (c = 0; c < window_layer_size; c++)
1221 syn1_window[c + l2] += g * neu1[c];
1222 if (cap == 1)
1223 for (c = 0; c < window_layer_size; c++)
1224 capParam(syn1_window, c + l2);
1225 }
1226 // NEGATIVE SAMPLING
1227 if (negative > 0)
1228 for (d = 0; d < negative + 1; d++) {
1229 if (d == 0) {
1230 target = word;
1231 label = 1;
1232 } else {
1233 next_random = next_random
1234 * (unsigned long long) 25214903917 + 11;
1235 if (word_to_group != NULL
1236 && word_to_group[word] != -1) {
1237 target = word;
1238 while (target == word) {
1239 target = group_to_table[word_to_group[word]
1240 * table_size
1241 + (next_random >> 16) % table_size];
1242 next_random = next_random
1243 * (unsigned long long) 25214903917
1244 + 11;
1245 }
1246 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1247 } else {
1248 target =
1249 table[(next_random >> 16) % table_size];
1250 }
1251 if (target == 0)
1252 target = next_random % (vocab_size - 1) + 1;
1253 if (target == word)
1254 continue;
1255 label = 0;
1256 }
1257 l2 = target * window_layer_size;
1258 f = 0;
1259 for (c = 0; c < window_layer_size; c++)
1260 f += neu1[c] * syn1neg_window[c + l2];
1261 if (f > MAX_EXP)
1262 g = (label - 1) * alpha;
1263 else if (f < -MAX_EXP)
1264 g = (label - 0) * alpha;
1265 else
1266 g = (label
1267 - expTable[(int) ((f + MAX_EXP)
1268 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1269 * alpha;
1270 for (c = 0; c < window_layer_size; c++)
1271 neu1e[c] += g * syn1neg_window[c + l2];
1272 for (c = 0; c < window_layer_size; c++)
1273 syn1neg_window[c + l2] += g * neu1[c];
1274 if (cap == 1)
1275 for (c = 0; c < window_layer_size; c++)
1276 capParam(syn1neg_window, c + l2);
1277 }
1278 // Noise Contrastive Estimation
1279 if (nce > 0)
1280 for (d = 0; d < nce + 1; d++) {
1281 if (d == 0) {
1282 target = word;
1283 label = 1;
1284 } else {
1285 next_random = next_random
1286 * (unsigned long long) 25214903917 + 11;
1287 if (word_to_group != NULL
1288 && word_to_group[word] != -1) {
1289 target = word;
1290 while (target == word) {
1291 target = group_to_table[word_to_group[word]
1292 * table_size
1293 + (next_random >> 16) % table_size];
1294 next_random = next_random
1295 * (unsigned long long) 25214903917
1296 + 11;
1297 }
1298 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1299 } else {
1300 target =
1301 table[(next_random >> 16) % table_size];
1302 }
1303 if (target == 0)
1304 target = next_random % (vocab_size - 1) + 1;
1305 if (target == word)
1306 continue;
1307 label = 0;
1308 }
1309 l2 = target * window_layer_size;
1310 f = 0;
1311 for (c = 0; c < window_layer_size; c++)
1312 f += neu1[c] * syn1nce_window[c + l2];
1313 if (f > MAX_EXP)
1314 g = (label - 1) * alpha;
1315 else if (f < -MAX_EXP)
1316 g = (label - 0) * alpha;
1317 else {
1318 f = exp(f);
1319 g =
1320 (label
1321 - f
1322 / (noise_distribution[target]
1323 * nce + f)) * alpha;
1324 }
1325 for (c = 0; c < window_layer_size; c++)
1326 neu1e[c] += g * syn1nce_window[c + l2];
1327 for (c = 0; c < window_layer_size; c++)
1328 syn1nce_window[c + l2] += g * neu1[c];
1329 if (cap == 1)
1330 for (c = 0; c < window_layer_size; c++)
1331 capParam(syn1nce_window, c + l2);
1332 }
1333 // hidden -> in
1334 for (a = 0; a < window * 2 + 1; a++)
1335 if (a != window) {
1336 c = sentence_position - window + a;
1337 if (c < 0)
1338 continue;
1339 if (c >= sentence_length)
1340 continue;
1341 last_word = sen[c];
1342 if (last_word == -1)
1343 continue;
1344 window_offset = a * layer1_size;
1345 if (a > window)
1346 window_offset -= layer1_size;
1347 for (c = 0; c < layer1_size; c++)
1348 syn0[c + last_word * layer1_size] += neu1e[c
1349 + window_offset];
1350 }
1351 }
1352 } else if (type == 3) { //train structured skip-gram
1353 for (a = 0; a < window * 2 + 1; a++)
1354 if (a != window) {
1355 c = sentence_position - window + a;
1356 if (c < 0)
1357 continue;
1358 if (c >= sentence_length)
1359 continue;
1360 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001361 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001362 continue;
1363 l1 = last_word * layer1_size;
1364 window_offset = a * layer1_size;
1365 if (a > window)
1366 window_offset -= layer1_size;
1367 for (c = 0; c < layer1_size; c++)
1368 neu1e[c] = 0;
1369 // HIERARCHICAL SOFTMAX
1370 if (hs)
1371 for (d = 0; d < vocab[word].codelen; d++) {
1372 f = 0;
1373 l2 = vocab[word].point[d] * window_layer_size;
1374 // Propagate hidden -> output
1375 for (c = 0; c < layer1_size; c++)
1376 f += syn0[c + l1]
1377 * syn1_window[c + l2 + window_offset];
1378 if (f <= -MAX_EXP)
1379 continue;
1380 else if (f >= MAX_EXP)
1381 continue;
1382 else
1383 f = expTable[(int) ((f + MAX_EXP)
1384 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1385 // 'g' is the gradient multiplied by the learning rate
1386 g = (1 - vocab[word].code[d] - f) * alpha;
1387 // Propagate errors output -> hidden
1388 for (c = 0; c < layer1_size; c++)
1389 neu1e[c] += g
1390 * syn1_window[c + l2 + window_offset];
1391 // Learn weights hidden -> output
1392 for (c = 0; c < layer1_size; c++)
1393 syn1[c + l2 + window_offset] += g
1394 * syn0[c + l1];
1395 if (cap == 1)
1396 for (c = 0; c < layer1_size; c++)
1397 capParam(syn1, c + l2 + window_offset);
1398 }
1399 // NEGATIVE SAMPLING
1400 if (negative > 0)
1401 for (d = 0; d < negative + 1; d++) {
1402 if (d == 0) {
1403 target = word;
1404 label = 1;
1405 } else {
1406 next_random = next_random
1407 * (unsigned long long) 25214903917 + 11;
1408 if (word_to_group != NULL
1409 && word_to_group[word] != -1) {
1410 target = word;
1411 while (target == word) {
1412 target =
1413 group_to_table[word_to_group[word]
1414 * table_size
1415 + (next_random >> 16)
1416 % table_size];
1417 next_random =
1418 next_random
1419 * (unsigned long long) 25214903917
1420 + 11;
1421 }
1422 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1423 } else {
1424 target = table[(next_random >> 16)
1425 % table_size];
1426 }
1427 if (target == 0)
1428 target = next_random % (vocab_size - 1) + 1;
1429 if (target == word)
1430 continue;
1431 label = 0;
1432 }
1433 l2 = target * window_layer_size;
1434 f = 0;
1435 for (c = 0; c < layer1_size; c++)
1436 f +=
1437 syn0[c + l1]
1438 * syn1neg_window[c + l2
1439 + window_offset];
1440 if (f > MAX_EXP)
1441 g = (label - 1) * alpha;
1442 else if (f < -MAX_EXP)
1443 g = (label - 0) * alpha;
1444 else
1445 g =
1446 (label
1447 - expTable[(int) ((f + MAX_EXP)
1448 * (EXP_TABLE_SIZE
1449 / MAX_EXP / 2))])
1450 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001451 if(debug_mode > 2 && ((long long) id) == 0) {
1452 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1453 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1454 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001455 for (c = 0; c < layer1_size; c++)
1456 neu1e[c] +=
1457 g
1458 * syn1neg_window[c + l2
1459 + window_offset];
1460 for (c = 0; c < layer1_size; c++)
1461 syn1neg_window[c + l2 + window_offset] += g
1462 * syn0[c + l1];
1463 if (cap == 1)
1464 for (c = 0; c < layer1_size; c++)
1465 capParam(syn1neg_window,
1466 c + l2 + window_offset);
1467 }
1468				// Noise Contrastive Estimation
1469 if (nce > 0)
1470 for (d = 0; d < nce + 1; d++) {
1471 if (d == 0) {
1472 target = word;
1473 label = 1;
1474 } else {
1475 next_random = next_random
1476 * (unsigned long long) 25214903917 + 11;
1477 if (word_to_group != NULL
1478 && word_to_group[word] != -1) {
1479 target = word;
1480 while (target == word) {
1481 target =
1482 group_to_table[word_to_group[word]
1483 * table_size
1484 + (next_random >> 16)
1485 % table_size];
1486 next_random =
1487 next_random
1488 * (unsigned long long) 25214903917
1489 + 11;
1490 }
1491 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1492 } else {
1493 target = table[(next_random >> 16)
1494 % table_size];
1495 }
1496 if (target == 0)
1497 target = next_random % (vocab_size - 1) + 1;
1498 if (target == word)
1499 continue;
1500 label = 0;
1501 }
1502 l2 = target * window_layer_size;
1503 f = 0;
1504 for (c = 0; c < layer1_size; c++)
1505 f +=
1506 syn0[c + l1]
1507 * syn1nce_window[c + l2
1508 + window_offset];
1509 if (f > MAX_EXP)
1510 g = (label - 1) * alpha;
1511 else if (f < -MAX_EXP)
1512 g = (label - 0) * alpha;
1513 else {
1514 f = exp(f);
1515 g = (label
1516 - f
1517 / (noise_distribution[target]
1518 * nce + f)) * alpha;
1519 }
1520 for (c = 0; c < layer1_size; c++)
1521 neu1e[c] +=
1522 g
1523 * syn1nce_window[c + l2
1524 + window_offset];
1525 for (c = 0; c < layer1_size; c++)
1526 syn1nce_window[c + l2 + window_offset] += g
1527 * syn0[c + l1];
1528 if (cap == 1)
1529 for (c = 0; c < layer1_size; c++)
1530 capParam(syn1nce_window,
1531 c + l2 + window_offset);
1532 }
1533 // Learn weights input -> hidden
1534 for (c = 0; c < layer1_size; c++) {
1535 syn0[c + l1] += neu1e[c];
1536 if (syn0[c + l1] > 50)
1537 syn0[c + l1] = 50;
1538 if (syn0[c + l1] < -50)
1539 syn0[c + l1] = -50;
1540 }
1541 }
1542 } else if (type == 4) { //training senna
1543 // in -> hidden
1544 cw = 0;
1545 for (a = 0; a < window * 2 + 1; a++)
1546 if (a != window) {
1547 c = sentence_position - window + a;
1548 if (c < 0)
1549 continue;
1550 if (c >= sentence_length)
1551 continue;
1552 last_word = sen[c];
1553 if (last_word == -1)
1554 continue;
1555 window_offset = a * layer1_size;
1556 if (a > window)
1557 window_offset -= layer1_size;
1558 for (c = 0; c < layer1_size; c++)
1559 neu1[c + window_offset] += syn0[c
1560 + last_word * layer1_size];
1561 cw++;
1562 }
1563 if (cw) {
1564 for (a = 0; a < window_hidden_size; a++) {
1565 c = a * window_layer_size;
1566 for (b = 0; b < window_layer_size; b++) {
1567 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1568 }
1569 }
1570 if (hs)
1571 for (d = 0; d < vocab[word].codelen; d++) {
1572 f = 0;
1573 l2 = vocab[word].point[d] * window_hidden_size;
1574 // Propagate hidden -> output
1575 for (c = 0; c < window_hidden_size; c++)
1576 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1577 if (f <= -MAX_EXP)
1578 continue;
1579 else if (f >= MAX_EXP)
1580 continue;
1581 else
1582 f = expTable[(int) ((f + MAX_EXP)
1583 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1584 // 'g' is the gradient multiplied by the learning rate
1585 g = (1 - vocab[word].code[d] - f) * alpha;
1586 // Propagate errors output -> hidden
1587 for (c = 0; c < window_hidden_size; c++)
1588 neu2e[c] += dHardTanh(neu2[c], g) * g
1589 * syn_hidden_word[c + l2];
1590 // Learn weights hidden -> output
1591 for (c = 0; c < window_hidden_size; c++)
1592 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1593 * neu2[c];
1594 }
1595 // NEGATIVE SAMPLING
1596 if (negative > 0)
1597 for (d = 0; d < negative + 1; d++) {
1598 if (d == 0) {
1599 target = word;
1600 label = 1;
1601 } else {
1602 next_random = next_random
1603 * (unsigned long long) 25214903917 + 11;
1604 if (word_to_group != NULL
1605 && word_to_group[word] != -1) {
1606 target = word;
1607 while (target == word) {
1608 target = group_to_table[word_to_group[word]
1609 * table_size
1610 + (next_random >> 16) % table_size];
1611 next_random = next_random
1612 * (unsigned long long) 25214903917
1613 + 11;
1614 }
1615 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1616 } else {
1617 target =
1618 table[(next_random >> 16) % table_size];
1619 }
1620 if (target == 0)
1621 target = next_random % (vocab_size - 1) + 1;
1622 if (target == word)
1623 continue;
1624 label = 0;
1625 }
1626 l2 = target * window_hidden_size;
1627 f = 0;
1628 for (c = 0; c < window_hidden_size; c++)
1629 f += hardTanh(neu2[c])
1630 * syn_hidden_word_neg[c + l2];
1631 if (f > MAX_EXP)
1632 g = (label - 1) * alpha / negative;
1633 else if (f < -MAX_EXP)
1634 g = (label - 0) * alpha / negative;
1635 else
1636 g = (label
1637 - expTable[(int) ((f + MAX_EXP)
1638 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1639 * alpha / negative;
1640 for (c = 0; c < window_hidden_size; c++)
1641 neu2e[c] += dHardTanh(neu2[c], g) * g
1642 * syn_hidden_word_neg[c + l2];
1643 for (c = 0; c < window_hidden_size; c++)
1644 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1645 * g * neu2[c];
1646 }
1647 for (a = 0; a < window_hidden_size; a++)
1648 for (b = 0; b < window_layer_size; b++)
1649 neu1e[b] += neu2e[a]
1650 * syn_window_hidden[a * window_layer_size + b];
1651 for (a = 0; a < window_hidden_size; a++)
1652 for (b = 0; b < window_layer_size; b++)
1653 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1654 * neu1[b];
1655 // hidden -> in
1656 for (a = 0; a < window * 2 + 1; a++)
1657 if (a != window) {
1658 c = sentence_position - window + a;
1659 if (c < 0)
1660 continue;
1661 if (c >= sentence_length)
1662 continue;
1663 last_word = sen[c];
1664 if (last_word == -1)
1665 continue;
1666 window_offset = a * layer1_size;
1667 if (a > window)
1668 window_offset -= layer1_size;
1669 for (c = 0; c < layer1_size; c++)
1670 syn0[c + last_word * layer1_size] += neu1e[c
1671 + window_offset];
1672 }
1673 }
1674 } else {
1675 printf("unknown type %i", type);
1676 exit(0);
1677 }
1678 sentence_position++;
1679 if (sentence_position >= sentence_length) {
1680 sentence_length = 0;
1681 continue;
1682 }
1683 }
1684 fclose(fi);
1685 free(neu1);
1686 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001687 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001688 pthread_exit(NULL);
1689}
1690
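// ShowCollocations (run when cc > 0, presumably set from the command line)
// uses the trained structured skip-gram weights: for every vocabulary word
// from index cc on it prints the most strongly predicted context word at each
// window position, plus the top-N collocates overall with their preferred
// positions.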
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001691void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001692 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001693 real f, max_f, maxmax_f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001694 real *target_sums, bestf[MAX_CC], worstbest;
1695 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001696 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001697 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1698
1699 for (d = cc; d < vocab_size; d++) {
1700 for (b = 0; b < vocab_size; b++)
1701 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001702 for (b = 0; b < N; b++)
1703 bestf[b]=-1;
1704 worstbest = -1;
1705
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001706 maxmax_f = -1;
1707 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001708 for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001709 if (a != window) {
1710 max_f = -1;
1711 window_offset = a * layer1_size;
1712 if (a > window)
1713 window_offset -= layer1_size;
1714 for(target = 0; target < vocab_size; target ++) {
1715 if(target == d)
1716 continue;
1717 f = 0;
1718 for (c = 0; c < layer1_size; c++)
1719 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1720 if (f < -MAX_EXP)
1721 continue;
1722 else if (f > MAX_EXP)
1723 continue;
1724 else
1725 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1726 if(f > max_f) {
1727 max_f = f;
1728 max_target = target;
1729 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001730 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001731 if(f > worstbest) {
1732 for (b = 0; b < N; b++) {
1733 if (f > bestf[b]) {
1734 for (e = N - 1; e > b; e--) {
1735 bestf[e] = bestf[e - 1];
1736 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001737 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001738 }
1739 bestf[b] = f;
1740 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001741 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001742 break;
1743 }
1744 }
1745 worstbest = bestf[N-1];
1746 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001747 }
1748 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1749 if(max_f > maxmax_f) {
1750 maxmax_f = max_f;
1751 maxmax_target = max_target;
1752 }
1753 } else {
1754 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1755 }
1756 }
1757 max_f = -1;
1758 for (b = 0; b < vocab_size; b++) {
1759 if(target_sums[b] > max_f) {
1760 max_f = target_sums[b];
1761 max_target = b;
1762 }
1763 }
1764 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001765 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001766 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001767 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001768 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001769 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001770 }
1771}
1772
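// TrainModel drives the whole run: build or read the vocabulary, initialize
// the network, optionally print collocations, spawn the training threads and
// the monitor thread, then write word vectors (or K-means word classes) to
// output_file and optionally save the net.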
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001773void TrainModel() {
1774 long a, b, c, d;
1775 FILE *fo;
1776	pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // +1 slot for the monitor thread
Marc Kupietz202723e2016-07-14 09:12:00 +02001777 threadPos = malloc(num_threads * sizeof(long long));
1778 threadIters = malloc(num_threads * sizeof(int));
1779 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001780 printf("Starting training using file %s\n", train_file);
1781 starting_alpha = alpha;
1782 if (read_vocab_file[0] != 0)
1783 ReadVocab();
1784 else
1785 LearnVocabFromTrainFile();
1786 if (save_vocab_file[0] != 0)
1787 SaveVocab();
1788 if (output_file[0] == 0)
1789 return;
1790 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001791 if(cc > 0)
1792 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001793 if (negative > 0 || nce > 0)
1794 InitUnigramTable();
1795 if (negative_classes_file[0] != 0)
1796 InitClassUnigramTable();
1797 start = clock();
	for (a = 0; a < num_threads; a++)
		pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
	if (debug_mode > 1)
		pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
	for (a = 0; a < num_threads; a++)
		pthread_join(pt[a], NULL);
	if (debug_mode > 1) {
		pthread_join(pt[num_threads], NULL);
		clock_t now = clock();
		// clock() counts CPU time summed over all threads; dividing by
		// num_threads gives a rough estimate of the elapsed (real) time.
		printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now - start) / CLOCKS_PER_SEC, (now - start) / CLOCKS_PER_SEC / num_threads);
		printf("Saving vectors to %s ...", output_file);
		fflush(stdout);
	}
	fo = fopen(output_file, "wb");
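	// With -classes 0 (the default) the word vectors themselves are written:
	// a header line "vocab_size layer1_size", then one line per word, either
	// as text or, with -binary 1, as raw floats. Otherwise the vectors are
	// clustered first and only word / class-id pairs are written.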
	if (classes == 0) {
		// Save the word vectors
		fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
		for (a = 0; a < vocab_size; a++) {
			fprintf(fo, "%s ", vocab[a].word);
			if (binary)
				for (b = 0; b < layer1_size; b++)
					fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
			else
				for (b = 0; b < layer1_size; b++)
					fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
			fprintf(fo, "\n");
		}
		if (debug_mode > 1)
			fprintf(stderr, "\n");
	} else {
		// Run K-means on the word vectors
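		// Spherical k-means: words start out assigned round-robin to the
		// <classes> clusters; each iteration recomputes the centroids,
		// normalizes them to unit length and reassigns every word to the
		// centroid with the highest cosine similarity.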
		int clcn = classes, iter = 10, closeid;
		int *centcn = (int *) malloc(classes * sizeof(int));
		int *cl = (int *) calloc(vocab_size, sizeof(int));
		real closev, x;
		real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
		for (a = 0; a < vocab_size; a++)
			cl[a] = a % clcn;
		for (a = 0; a < iter; a++) {
			for (b = 0; b < clcn * layer1_size; b++)
				cent[b] = 0;
			for (b = 0; b < clcn; b++)
				centcn[b] = 1;
			for (c = 0; c < vocab_size; c++) {
				for (d = 0; d < layer1_size; d++)
					cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
				centcn[cl[c]]++;
			}
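			// Average the accumulated vectors and scale each centroid to unit
			// length, so the dot product below is a cosine similarity.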
			for (b = 0; b < clcn; b++) {
				closev = 0;
				for (c = 0; c < layer1_size; c++) {
					cent[layer1_size * b + c] /= centcn[b];
					closev += cent[layer1_size * b + c]
							* cent[layer1_size * b + c];
				}
				closev = sqrt(closev);
				for (c = 0; c < layer1_size; c++)
					cent[layer1_size * b + c] /= closev;
			}
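			// Reassign every word to its most similar centroid.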
			for (c = 0; c < vocab_size; c++) {
				closev = -10;
				closeid = 0;
				for (d = 0; d < clcn; d++) {
					x = 0;
					for (b = 0; b < layer1_size; b++)
						x += cent[layer1_size * d + b]
								* syn0[c * layer1_size + b];
					if (x > closev) {
						closev = x;
						closeid = d;
					}
				}
				cl[c] = closeid;
			}
		}
		// Save the K-means classes
		for (a = 0; a < vocab_size; a++)
			fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
		free(centcn);
		free(cent);
		free(cl);
	}
	fclose(fo);
	if (save_net_file[0] != 0)
		SaveNet();
}

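// Return the index of option str in argv, or -1 if it is not present;
// exits if the option is the last argument and thus has no value.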
int ArgPos(char *str, int argc, char **argv) {
	int a;
	for (a = 1; a < argc; a++)
		if (!strcmp(str, argv[a])) {
			if (a == argc - 1) {
				printf("Argument missing for %s\n", str);
				exit(1);
			}
			return a;
		}
	return -1;
}

void print_help() {
	printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
	printf("Options:\n");
	printf("Parameters for training:\n");
	printf("\t-train <file>\n");
	printf("\t\tUse text data from <file> to train the model\n");
	printf("\t-output <file>\n");
	printf(
			"\t\tUse <file> to save the resulting word vectors / word clusters\n");
	printf("\t-size <int>\n");
	printf("\t\tSet size of word vectors; default is 100\n");
	printf("\t-window <int>\n");
	printf("\t\tSet max skip length between words; default is 5\n");
	printf("\t-sample <float>\n");
	printf(
			"\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
	printf(
			"\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
	printf("\t-hs <int>\n");
	printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
	printf("\t-negative <int>\n");
	printf(
			"\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
	printf("\t-negative-classes <file>\n");
	printf("\t\tNegative classes to sample from\n");
	printf("\t-nce <int>\n");
	printf(
			"\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
	printf("\t-threads <int>\n");
	printf("\t\tUse <int> threads (default 12)\n");
	printf("\t-iter <int>\n");
	printf("\t\tRun more training iterations (default 5)\n");
	printf("\t-min-count <int>\n");
	printf(
			"\t\tThis will discard words that appear less than <int> times; default is 5\n");
	printf("\t-alpha <float>\n");
	printf(
			"\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
	printf("\t-classes <int>\n");
	printf(
			"\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
	printf("\t-debug <int>\n");
	printf(
			"\t\tSet the debug mode (default = 2 = more info during training)\n");
	printf("\t-binary <int>\n");
	printf(
1944 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
	printf("\t-save-vocab <file>\n");
	printf("\t\tThe vocabulary will be saved to <file>\n");
	printf("\t-read-vocab <file>\n");
	printf(
			"\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
	printf("\t-read-net <file>\n");
	printf(
			"\t\tThe net parameters will be read from <file>, not initialized randomly\n");
	printf("\t-save-net <file>\n");
	printf("\t\tThe net parameters will be saved to <file>\n");
	printf("\t-show-cc <int>\n");
	printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
	printf("\t-type <int>\n");
	printf(
1959 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
	printf("\t-cap <int>\n");
	printf(
			"\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
	printf("\nExamples:\n");
	printf(
			"./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
}

int main(int argc, char **argv) {
	int i;
	setlocale(LC_ALL, "");
	if (argc == 1) {
		print_help();
		return 0;
	}
	output_file[0] = 0;
	save_vocab_file[0] = 0;
	read_vocab_file[0] = 0;
	save_net_file[0] = 0;
	read_net_file[0] = 0;
	negative_classes_file[0] = 0;
	if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
		print_help();
		return 0;
	}
	if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
		print_help();
		return 0;
	}
	if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
		layer1_size = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
		strcpy(train_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
		strcpy(save_vocab_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
		strcpy(read_vocab_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
		strcpy(save_net_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
		strcpy(read_net_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
		debug_mode = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
		binary = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
		cc = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
		type = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
		strcpy(output_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
		window = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
		sample = atof(argv[i + 1]);
	if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
		hs = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
		negative = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
		strcpy(negative_classes_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
		nce = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
		num_threads = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
		iter = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
		min_count = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
		classes = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
		cap = atoi(argv[i + 1]);
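	// CBOW-style models (cbow, cwindow, senna) default to a higher starting
	// learning rate; an explicit -alpha still overrides this.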
	if (type == 0 || type == 2 || type == 4)
		alpha = 0.05;
	if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
		alpha = atof(argv[i + 1]);
	vocab = (struct vocab_word *) calloc(vocab_max_size,
			sizeof(struct vocab_word));
	vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
	expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
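	// The table holds EXP_TABLE_SIZE precomputed sigmoid values for inputs in
	// [-MAX_EXP, MAX_EXP]; lookups elsewhere use the index
	// (int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)).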
	for (i = 0; i < EXP_TABLE_SIZE; i++) {
		expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
		expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute the sigmoid f(x) = e^x / (e^x + 1)
	}
	SaveArgs(argc, argv);
	TrainModel();
	return 0;
}