// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
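// Build note (illustrative, not part of the original sources): this file typically compiles with
// something like: gcc word2vec_cngram.c -o word2vec_cngram -O2 -lm -pthread
// The file and binary names are placeholders, but -lm and -pthread are needed for math.h and pthreads.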
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>   // clock(), clock_t and CLOCKS_PER_SEC, used for progress reporting
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
27const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39struct vocab_word *vocab;
40int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
41int *vocab_hash;
42long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
43long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
44real alpha = 0.025, starting_alpha, sample = 1e-3;
45real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
46clock_t start;
47
48real *syn1_window, *syn1neg_window, *syn1nce_window;
49int w_offset, window_layer_size;
50
51int window_hidden_size = 500;
52real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg, *syn_hidden_word_nce;
53
54int hs = 0, negative = 5;
55const int table_size = 1e8;
56int *table;
57
// contrastive negative sampling
59char negative_classes_file[MAX_STRING];
60int *word_to_group;
61int *group_to_table; //group_size*table_size
62int class_number;
63
64//nce
65real* noise_distribution;
66int nce = 10;
67
68//param caps
69real CAP_VALUE = 50;
70int cap = 0;
71
72// char models
73char boundToken = 'Z';
74char *unkNgramToken = "ZZZ";
75int cngram_size = 6;
76real *syn0_cngram;
77long long cngram_vocab_size = 0;
78struct vocab_word *cngram_vocab;
79int *cngram_vocab_hash;
80long long cngram_vocab_max_size = 1000;
81char extra_vocab_file[MAX_STRING];
82long long maxNgramSize = 1000000;
83
84// Returns hash value of a word
85int GetWordHash(char *word) {
86 unsigned long long a, hash = 0;
87 for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
88 hash = hash % vocab_hash_size;
89 return hash;
90}
91
// Searches the character n-gram vocabulary; returns the n-gram's position, or -1 if it is not found
93int SearchCNgramVocab(char *ngram) {
94 unsigned int hash = GetWordHash(ngram);
95 while (1) {
96 if (cngram_vocab_hash[hash] == -1) return -1;
97 if (!strcmp(ngram, cngram_vocab[cngram_vocab_hash[hash]].word)) return cngram_vocab_hash[hash];
98 hash = (hash + 1) % vocab_hash_size;
99 }
100 return -1;
101}
102
103// char functions
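// Adds the embedding of the character n-gram 'ngram' to 'output'; unknown n-grams fall back to unkNgramToken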
104void ForwardCNgramWordNgram(real *output, char *ngram){
105 long long a;
106 int index = SearchCNgramVocab(ngram);
107 if (index == -1) {index = SearchCNgramVocab(unkNgramToken);}
108 long long startIndex = layer1_size * index;
109 for (a = 0; a < layer1_size; a++){
110 output[a] += syn0_cngram[startIndex + a];
111 }
112}
113
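// Composes a representation of 'word' into 'output' from its character n-grams: every internal
// n-gram of length cngram_size, plus one begin-of-word and one end-of-word n-gram padded with
// boundToken, and finally divides 'output' by the number of contributing n-grams.
// E.g. with cngram_size = 6, "learning" contributes "learni", "earnin", "arning",
// the begin-of-word n-gram "Zlearn" and the end-of-word n-gram "rningZ".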
114void ForwardCNgramWordRepresentation(real *output, char *word){
115 int length = strlen(word);
116 int start;
117 int cur_len;
118 char *ngram;
119 char tmp[cngram_size+1];
120 tmp[cngram_size] = '\0';
121 int ngrams = 0;
122 for(start = 0; start < length-cngram_size+1; start++){
123 ngram = word + start;
124 strncpy(tmp, ngram, cngram_size);
125 ForwardCNgramWordNgram(output, tmp);
126 ngrams++;
127 }
128 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cur_len] = boundToken;
129 strncpy(tmp+1, word, cur_len);
130 ForwardCNgramWordNgram(output, tmp);
131 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cngram_size-cur_len-1] = boundToken;
132 cur_len = cngram_size - 1;
133 if(length < cur_len){
134 cur_len = length;
135 }
136 ngram = word + length - cur_len;
137 strncpy(tmp, ngram, cur_len);
138 tmp[cur_len] = 'Z';
139 tmp[cur_len + 1] = '\0';
140 ForwardCNgramWordNgram(output, tmp);
141 for(start = 0; start < layer1_size; start++){
142 output[start] /= ngrams+2;
143 }
144}
145
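// Adds the error vector 'output_err' to the embedding of a single character n-gram (unknown n-grams fall back to unkNgramToken)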
146void BackwardCNgramWordNgram(real *output, char *ngram, real *output_err){
147 long long a;
148 int index = SearchCNgramVocab(ngram);
149 if (index == -1) index = SearchCNgramVocab(unkNgramToken);
150 long long startIndex = layer1_size * index;
151 for (a = 0; a < layer1_size; a++){
152 syn0_cngram[startIndex + a] += output_err[a];
153 }
154}
155
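// Adds the error vector 'output_err' to the embeddings of all character n-grams of 'word',
// using the same n-gram decomposition as ForwardCNgramWordRepresentation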
156void BackwardCNgramWordRepresentation(real *output, char *word, real *output_err){
157 int length = strlen(word);
158 int start;
159 int cur_len;
160 char *ngram;
161 char tmp[cngram_size+1];
162 tmp[cngram_size] = '\0';
163 for(start = 0; start < length-cngram_size+1; start++){
164 ngram = word + start;
165 strncpy(tmp, ngram, cngram_size);
166 BackwardCNgramWordNgram(output, tmp, output_err);
167 }
168 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cur_len] = boundToken;
169 strncpy(tmp+1, word, cur_len);
170 BackwardCNgramWordNgram(output, tmp, output_err);
171 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cngram_size-cur_len-1] = boundToken;
172 cur_len = cngram_size - 1;
173 if(length < cur_len){
174 cur_len = length;
175 }
176 ngram = word + length - cur_len;
177 strncpy(tmp, ngram, cur_len);
178 tmp[cur_len] = 'Z';
179 tmp[cur_len + 1] = '\0';
180 BackwardCNgramWordNgram(output, tmp, output_err);
181}
182
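// Adds a character n-gram to the n-gram vocabulary, or increments its count if it is already present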
183void AddWordNgramToVocab(char *ngram, int count){
184 int index = SearchCNgramVocab(ngram);
185 if(index != -1){
186 cngram_vocab[index].cn+=count;
187 return;
188 }
189 unsigned int hash, length = strlen(ngram) + 1;
190 if (length > MAX_STRING) length = MAX_STRING;
191 cngram_vocab[cngram_vocab_size].word = (char *)calloc(length, sizeof(char));
192 strcpy(cngram_vocab[cngram_vocab_size].word, ngram);
193 cngram_vocab[cngram_vocab_size].cn = count;
194 cngram_vocab_size++;
195 // Reallocate memory if needed
196 if (cngram_vocab_size + 2 >= cngram_vocab_max_size) {
197 cngram_vocab_max_size += 1000;
198 cngram_vocab = (struct vocab_word *)realloc(cngram_vocab, cngram_vocab_max_size * sizeof(struct vocab_word));
199 }
200 hash = GetWordHash(ngram);
201 while (cngram_vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
202 cngram_vocab_hash[hash] = cngram_vocab_size - 1;
203}
204
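// Registers all internal, begin-of-word and end-of-word character n-grams of 'word' in the n-gram vocabulary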
205void AddAllWordNgramToVocab(char *word, int count){
206 int length = strlen(word);
207 int start;
208 int cur_len;
209 char *ngram;
210 char tmp[cngram_size+1];
211 tmp[cngram_size] = '\0';
212 for(start = 0; start < length-cngram_size+1; start++){
213 ngram = word + start;
214 strncpy(tmp, ngram, cngram_size);
215 AddWordNgramToVocab(tmp, count);
216 }
217 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cur_len] = boundToken;
218 strncpy(tmp+1, word, cur_len);
219 AddWordNgramToVocab(tmp, count);
220 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cngram_size-cur_len-1] = boundToken;
221 cur_len = cngram_size - 1;
222 if(length < cur_len){
223 cur_len = length;
224 }
225 ngram = word + length - cur_len;
226 strncpy(tmp, ngram, cur_len);
227 tmp[cur_len] = 'Z';
228 tmp[cur_len + 1] = '\0';
229 AddWordNgramToVocab(tmp, count);
230}
231
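// Clips a single parameter to the range [-CAP_VALUE, CAP_VALUE]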
232void capParam(real* array, int index){
233 if(array[index] > CAP_VALUE)
234 array[index] = CAP_VALUE;
235 else if(array[index] < -CAP_VALUE)
236 array[index] = -CAP_VALUE;
237}
238
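// Hard tanh activation used by the senna-style architecture (type 4)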
239real hardTanh(real x){
240 if(x>=1){
241 return 1;
242 }
243 else if(x<=-1){
244 return -1;
245 }
246 else{
247 return x;
248 }
249}
250
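// Gradient gate for hardTanh: 0 when the unit is saturated in the direction of the gradient, 1 otherwise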
251real dHardTanh(real x, real g){
252 if(x > 1 && g > 0){
253 return 0;
254 }
255 if(x < -1 && g < 0){
256 return 0;
257 }
258 return 1;
259}
260
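// Builds the unigram table used for negative sampling (word counts raised to the power 0.75)
// and the noise distribution used for NCE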
261void InitUnigramTable() {
262 int a, i;
263 long long train_words_pow = 0;
264 real d1, power = 0.75;
265 table = (int *)malloc(table_size * sizeof(int));
266 for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
267 i = 0;
268 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
269 for (a = 0; a < table_size; a++) {
270 table[a] = i;
271 if (a / (real)table_size > d1) {
272 i++;
273 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
274 }
275 if (i >= vocab_size) i = vocab_size - 1;
276 }
277
278 noise_distribution = (real *)calloc(vocab_size, sizeof(real));
279 for (a = 0; a < vocab_size; a++) noise_distribution[a] = pow(vocab[a].cn, power)/(real)train_words_pow;
280}
281
282// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
283void ReadWord(char *word, FILE *fin) {
284 int a = 0, ch;
285 while (!feof(fin)) {
286 ch = fgetc(fin);
287 if (ch == 13) continue;
288 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
289 if (a > 0) {
290 if (ch == '\n') ungetc(ch, fin);
291 break;
292 }
293 if (ch == '\n') {
294 strcpy(word, (char *)"</s>");
295 return;
296 } else continue;
297 }
298 word[a] = ch;
299 a++;
300 if (a >= MAX_STRING - 1) a--; // Truncate too long words
301 }
302 word[a] = 0;
303}
304
305// Returns position of a word in the vocabulary; if the word is not found, returns -1
306int SearchVocab(char *word) {
307 unsigned int hash = GetWordHash(word);
308 while (1) {
309 if (vocab_hash[hash] == -1) return -1;
310 if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
311 hash = (hash + 1) % vocab_hash_size;
312 }
313 return -1;
314}
315
316// Reads a word and returns its index in the vocabulary
317int ReadWordIndex(FILE *fin) {
318 char word[MAX_STRING];
319 ReadWord(word, fin);
320 if (feof(fin)) return -1;
321 return SearchVocab(word);
322}
323
324// Adds a word to the vocabulary
325int AddWordToVocab(char *word) {
326 unsigned int hash, length = strlen(word) + 1;
327 if (length > MAX_STRING) length = MAX_STRING;
328 vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
329 strcpy(vocab[vocab_size].word, word);
330 vocab[vocab_size].cn = 0;
331 vocab_size++;
332 // Reallocate memory if needed
333 if (vocab_size + 2 >= vocab_max_size) {
334 vocab_max_size += 1000;
335 vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
336 }
337 hash = GetWordHash(word);
338 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
339 vocab_hash[hash] = vocab_size - 1;
340 return vocab_size - 1;
341}
342
343// Used later for sorting by word counts
344int VocabCompare(const void *a, const void *b) {
345 return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
346}
347
348// Sorts the vocabulary by frequency using word counts
349void SortVocab() {
350 int a, size;
351 unsigned int hash;
352 // Sort the vocabulary and keep </s> at the first position
353 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
354 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
355 size = vocab_size;
356 train_words = 0;
357 for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
359 if ((vocab[a].cn < min_count) && (a != 0)) {
360 vocab_size--;
361 free(vocab[a].word);
362 } else {
      // Hash will be re-computed, as it is no longer valid after the sorting
364 hash=GetWordHash(vocab[a].word);
365 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
366 vocab_hash[hash] = a;
367 train_words += vocab[a].cn;
368 }
369 }
370 vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
371 // Allocate memory for the binary tree construction
372 for (a = 0; a < vocab_size; a++) {
373 vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
374 vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
375 }
376}
377
378// Reduces the vocabulary by removing infrequent tokens
379void ReduceVocab() {
380 int a, b = 0;
381 unsigned int hash;
382 for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
383 vocab[b].cn = vocab[a].cn;
384 vocab[b].word = vocab[a].word;
385 b++;
386 } else free(vocab[a].word);
387 vocab_size = b;
388 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
389 for (a = 0; a < vocab_size; a++) {
    // Hash will be re-computed, as it is no longer valid
391 hash = GetWordHash(vocab[a].word);
392 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
393 vocab_hash[hash] = a;
394 }
395 fflush(stdout);
396 min_reduce++;
397}
398
399// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
401void CreateBinaryTree() {
402 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
403 char code[MAX_CODE_LENGTH];
404 long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
405 long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
406 long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
407 for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
408 for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
409 pos1 = vocab_size - 1;
410 pos2 = vocab_size;
411 // Following algorithm constructs the Huffman tree by adding one node at a time
412 for (a = 0; a < vocab_size - 1; a++) {
413 // First, find two smallest nodes 'min1, min2'
414 if (pos1 >= 0) {
415 if (count[pos1] < count[pos2]) {
416 min1i = pos1;
417 pos1--;
418 } else {
419 min1i = pos2;
420 pos2++;
421 }
422 } else {
423 min1i = pos2;
424 pos2++;
425 }
426 if (pos1 >= 0) {
427 if (count[pos1] < count[pos2]) {
428 min2i = pos1;
429 pos1--;
430 } else {
431 min2i = pos2;
432 pos2++;
433 }
434 } else {
435 min2i = pos2;
436 pos2++;
437 }
438 count[vocab_size + a] = count[min1i] + count[min2i];
439 parent_node[min1i] = vocab_size + a;
440 parent_node[min2i] = vocab_size + a;
441 binary[min2i] = 1;
442 }
443 // Now assign binary code to each vocabulary word
444 for (a = 0; a < vocab_size; a++) {
445 b = a;
446 i = 0;
447 while (1) {
448 code[i] = binary[b];
449 point[i] = b;
450 i++;
451 b = parent_node[b];
452 if (b == vocab_size * 2 - 2) break;
453 }
454 vocab[a].codelen = i;
455 vocab[a].point[0] = vocab_size - 2;
456 for (b = 0; b < i; b++) {
457 vocab[a].code[i - b - 1] = code[b];
458 vocab[a].point[i - b] = point[b] - vocab_size;
459 }
460 }
461 free(count);
462 free(binary);
463 free(parent_node);
464}
465
466void LearnVocabFromTrainFile() {
467 char word[MAX_STRING];
468 FILE *fin;
469 long long a, i;
470 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
471 for (a = 0; a < vocab_hash_size; a++) cngram_vocab_hash[a] = -1;
472 fin = fopen(train_file, "rb");
473 if (fin == NULL) {
474 printf("ERROR: training data file not found!\n");
475 exit(1);
476 }
477 vocab_size = 0;
478 AddWordToVocab((char *)"</s>");
479 AddWordNgramToVocab(unkNgramToken,1000000);
480 while (1) {
481 ReadWord(word, fin);
482 if (feof(fin)) break;
483 train_words++;
484 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
485 printf("%lldK%c", train_words / 1000, 13);
486 fflush(stdout);
487 }
488 i = SearchVocab(word);
489 if (i == -1) {
490 a = AddWordToVocab(word);
491 vocab[a].cn = 1;
492 } else vocab[i].cn++;
493 if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
494 }
495 SortVocab();
496 for (a = 0; a < vocab_size; a++){
497 AddAllWordNgramToVocab(vocab[a].word, vocab[a].cn);
498 }
499 if (debug_mode > 0) {
500 printf("Vocab size: %lld\n", vocab_size);
501 printf("Ngrams size: %lld\n", cngram_vocab_size);
502 printf("Words in train file: %lld\n", train_words);
503 }
504 file_size = ftell(fin);
505 fclose(fin);
506}
507
508void SaveVocab() {
509 long long i;
510 FILE *fo = fopen(save_vocab_file, "wb");
511 for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
512 fclose(fo);
513}
514
515void ReadVocab() {
516 long long a, i = 0;
517 char c;
518 char word[MAX_STRING];
519 FILE *fin = fopen(read_vocab_file, "rb");
520 if (fin == NULL) {
521 printf("Vocabulary file not found\n");
522 exit(1);
523 }
524 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
525 vocab_size = 0;
526 while (1) {
527 ReadWord(word, fin);
528 if (feof(fin)) break;
529 a = AddWordToVocab(word);
530 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
531 i++;
532 }
533 SortVocab();
534 if (debug_mode > 0) {
535 printf("Vocab size: %lld\n", vocab_size);
536 printf("Words in train file: %lld\n", train_words);
537 }
538 fin = fopen(train_file, "rb");
539 if (fin == NULL) {
540 printf("ERROR: training data file not found!\n");
541 exit(1);
542 }
543 fseek(fin, 0, SEEK_END);
544 file_size = ftell(fin);
545 fclose(fin);
546}
547
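// Reads the word -> class mapping from negative_classes_file and builds one unigram sampling table per class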
548void InitClassUnigramTable() {
549 long long a,c;
550 printf("loading class unigrams \n");
551 FILE *fin = fopen(negative_classes_file, "rb");
552 if (fin == NULL) {
553 printf("ERROR: class file not found!\n");
554 exit(1);
555 }
556 word_to_group = (int *)malloc(vocab_size * sizeof(int));
557 for(a = 0; a < vocab_size; a++) word_to_group[a] = -1;
558 char class[MAX_STRING];
559 char prev_class[MAX_STRING];
560 prev_class[0] = 0;
561 char word[MAX_STRING];
562 class_number = -1;
563 while (1) {
564 if (feof(fin)) break;
565 ReadWord(class, fin);
566 ReadWord(word, fin);
567 int word_index = SearchVocab(word);
568 if (word_index != -1){
569 if(strcmp(class, prev_class) != 0){
570 class_number++;
571 strcpy(prev_class, class);
572 }
573 word_to_group[word_index] = class_number;
574 }
575 ReadWord(word, fin);
576 }
577 class_number++;
578 fclose(fin);
579
580 group_to_table = (int *)malloc(table_size * class_number * sizeof(int));
581 long long train_words_pow = 0;
582 real d1, power = 0.75;
583
584 for(c = 0; c < class_number; c++){
585 long long offset = c * table_size;
586 train_words_pow = 0;
587 for (a = 0; a < vocab_size; a++) if(word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
588 int i = 0;
    while (i < vocab_size && word_to_group[i] != c) i++; // check the bound before reading word_to_group[i]
590 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
591 for (a = 0; a < table_size; a++) {
592 //printf("index %lld , word %d\n", a, i);
593 group_to_table[offset + a] = i;
594 if (a / (real)table_size > d1) {
595 i++;
        while (i < vocab_size && word_to_group[i] != c) i++;
597 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
598 }
      if (i >= vocab_size) { // clamp back to the last word of this class instead of reading past the array
        i = vocab_size - 1;
        while (i >= 0 && word_to_group[i] != c) i--;
      }
600 }
601 }
602}
603
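// Allocates the word and character n-gram embedding matrices and the output layers used by
// hierarchical softmax, negative sampling and NCE, initializes them, and builds the Huffman tree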
604void InitNet() {
605 long long a, b;
606 unsigned long long next_random = 1;
607 window_layer_size = layer1_size*window*2;
608 a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
609 if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
  a = posix_memalign((void **)&syn0_cngram, 128, (long long)cngram_vocab_size * layer1_size * sizeof(real)); // sized by the n-gram vocabulary, which this matrix is indexed by
611 if (syn0_cngram == NULL) {printf("Memory allocation failed\n"); exit(1);}
612
613 if (hs) {
614 a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
615 if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
616 a = posix_memalign((void **)&syn1_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
617 if (syn1_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
618 a = posix_memalign((void **)&syn_hidden_word, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
619 if (syn_hidden_word == NULL) {printf("Memory allocation failed\n"); exit(1);}
620
621 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
622 syn1[a * layer1_size + b] = 0;
623 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
624 syn1_window[a * window_layer_size + b] = 0;
625 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
626 syn_hidden_word[a * window_hidden_size + b] = 0;
627 }
628 if (negative>0) {
629 a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
630 if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
631 a = posix_memalign((void **)&syn1neg_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
632 if (syn1neg_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
633 a = posix_memalign((void **)&syn_hidden_word_neg, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
634 if (syn_hidden_word_neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
635
636 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
637 syn1neg[a * layer1_size + b] = 0;
638 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
639 syn1neg_window[a * window_layer_size + b] = 0;
640 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
641 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
642 }
643 if (nce>0) {
644 a = posix_memalign((void **)&syn1nce, 128, (long long)vocab_size * layer1_size * sizeof(real));
645 if (syn1nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
646 a = posix_memalign((void **)&syn1nce_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
647 if (syn1nce_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
648 a = posix_memalign((void **)&syn_hidden_word_nce, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
649 if (syn_hidden_word_nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
650
651 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
652 syn1nce[a * layer1_size + b] = 0;
653 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
654 syn1nce_window[a * window_layer_size + b] = 0;
655 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
656 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
657 }
658 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
659 next_random = next_random * (unsigned long long)25214903917 + 11;
660 syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
661 }
662
663 for (a = 0; a < cngram_vocab_size; a++) for (b = 0; b < layer1_size; b++){
664 next_random = next_random * (unsigned long long)25214903917 + 11;
665 syn0_cngram[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
666 }
667
668 a = posix_memalign((void **)&syn_window_hidden, 128, window_hidden_size * window_layer_size * sizeof(real));
669 if (syn_window_hidden == NULL) {printf("Memory allocation failed\n"); exit(1);}
670 for (a = 0; a < window_hidden_size * window_layer_size; a++){
671 next_random = next_random * (unsigned long long)25214903917 + 11;
672 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (window_hidden_size*window_layer_size);
673 }
674
675 CreateBinaryTree();
676}
677
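// One training thread: reads its slice of the training file, builds subsampled sentences and
// trains the architecture selected by 'type' (0 cbow, 1 skip-gram, 2 cwindow, 3 structured
// skip-gram, 4 senna); for types 0-3 context words are composed from their character n-grams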
678void *TrainModelThread(void *id) {
679 long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
680 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
681 long long l1, l2, c, target, label, local_iter = iter;
682 unsigned long long next_random = (long long)id;
683 real f, g;
684 clock_t now;
685 int input_len_1 = layer1_size;
686 int window_offset = -1;
687 if(type == 2 || type == 4){
688 input_len_1=window_layer_size;
689 }
690 real *neu1 = (real *)calloc(input_len_1, sizeof(real));
691 real *neu1e = (real *)calloc(input_len_1, sizeof(real));
692
693 int input_len_2 = 0;
694 if(type == 4){
695 input_len_2 = window_hidden_size;
696 }
697 real *neu2 = (real *)calloc(input_len_2, sizeof(real));
698 real *neu2e = (real *)calloc(input_len_2, sizeof(real));
699
700 FILE *fi = fopen(train_file, "rb");
701 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
702 while (1) {
703 if (word_count - last_word_count > 10000) {
704 word_count_actual += word_count - last_word_count;
705 last_word_count = word_count;
706 if ((debug_mode > 1)) {
707 now=clock();
708 printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
709 word_count_actual / (real)(iter * train_words + 1) * 100,
710 word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
711 fflush(stdout);
712 }
713 alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
714 if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
715 }
716 if (sentence_length == 0) {
717 while (1) {
718 word = ReadWordIndex(fi);
719 if (feof(fi)) break;
720 if (word == -1) continue;
721 word_count++;
722 if (word == 0) break;
723 // The subsampling randomly discards frequent words while keeping the ranking same
724 if (sample > 0) {
725 real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
726 next_random = next_random * (unsigned long long)25214903917 + 11;
727 if (ran < (next_random & 0xFFFF) / (real)65536) continue;
728 }
729 sen[sentence_length] = word;
730 sentence_length++;
731 if (sentence_length >= MAX_SENTENCE_LENGTH) break;
732 }
733 sentence_position = 0;
734 }
735 if (feof(fi) || (word_count > train_words / num_threads)) {
736 word_count_actual += word_count - last_word_count;
737 local_iter--;
738 if (local_iter == 0) break;
739 word_count = 0;
740 last_word_count = 0;
741 sentence_length = 0;
742 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
743 continue;
744 }
745 word = sen[sentence_position];
746 if (word == -1) continue;
747 for (c = 0; c < input_len_1; c++) neu1[c] = 0;
748 for (c = 0; c < input_len_1; c++) neu1e[c] = 0;
749 for (c = 0; c < input_len_2; c++) neu2[c] = 0;
750 for (c = 0; c < input_len_2; c++) neu2e[c] = 0;
751 next_random = next_random * (unsigned long long)25214903917 + 11;
752 b = next_random % window;
753 if (type == 0) { //train the cbow architecture
754 // in -> hidden
755 cw = 0;
756 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
757 c = sentence_position - window + a;
758 if (c < 0) continue;
759 if (c >= sentence_length) continue;
760 last_word = sen[c];
761 if (last_word == -1) continue;
762 ForwardCNgramWordRepresentation(neu1, vocab[last_word].word);
763 cw++;
764 }
765 if (cw) {
766 for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
767 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
768 f = 0;
769 l2 = vocab[word].point[d] * layer1_size;
770 // Propagate hidden -> output
771 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
772 if (f <= -MAX_EXP) continue;
773 else if (f >= MAX_EXP) continue;
774 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
775 // 'g' is the gradient multiplied by the learning rate
776 g = (1 - vocab[word].code[d] - f) * alpha;
777 // Propagate errors output -> hidden
778 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
779 // Learn weights hidden -> output
780 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
781 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
782 }
783 // NEGATIVE SAMPLING
784 if (negative > 0) for (d = 0; d < negative + 1; d++) {
785 if (d == 0) {
786 target = word;
787 label = 1;
788 } else {
789 next_random = next_random * (unsigned long long)25214903917 + 11;
790 if(word_to_group != NULL && word_to_group[word] != -1){
791 target = word;
792 while(target == word) {
793 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
794 next_random = next_random * (unsigned long long)25214903917 + 11;
795 }
796 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
797 }
798 else{
799 target = table[(next_random >> 16) % table_size];
800 }
801 if (target == 0) target = next_random % (vocab_size - 1) + 1;
802 if (target == word) continue;
803 label = 0;
804 }
805 l2 = target * layer1_size;
806 f = 0;
807 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
808 if (f > MAX_EXP) g = (label - 1) * alpha;
809 else if (f < -MAX_EXP) g = (label - 0) * alpha;
810 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
811 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
812 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
813 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
814 }
815 // Noise Contrastive Estimation
816 if (nce > 0) for (d = 0; d < nce + 1; d++) {
817 if (d == 0) {
818 target = word;
819 label = 1;
820 } else {
821 next_random = next_random * (unsigned long long)25214903917 + 11;
822 if(word_to_group != NULL && word_to_group[word] != -1){
823 target = word;
824 while(target == word) {
825 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
826 next_random = next_random * (unsigned long long)25214903917 + 11;
827 }
828 }
829 else{
830 target = table[(next_random >> 16) % table_size];
831 }
832 if (target == 0) target = next_random % (vocab_size - 1) + 1;
833 if (target == word) continue;
834 label = 0;
835 }
836 l2 = target * layer1_size;
837 f = 0;
838
839 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
840 if (f > MAX_EXP) g = (label - 1) * alpha;
841 else if (f < -MAX_EXP) g = (label - 0) * alpha;
842 else {
843 f = exp(f);
844 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
845 }
846 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
847 for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
848 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce,c + l2);
849 }
850 // hidden -> in
851 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
852 c = sentence_position - window + a;
853 if (c < 0) continue;
854 if (c >= sentence_length) continue;
855 last_word = sen[c];
856 if (last_word == -1) continue;
857 BackwardCNgramWordRepresentation(neu1, vocab[last_word].word, neu1e);
858 }
859 }
860 } else if(type==1) { //train skip-gram
861 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
862 c = sentence_position - window + a;
863 if (c < 0) continue;
864 if (c >= sentence_length) continue;
865 last_word = sen[c];
866 if (last_word == -1) continue;
867 l1 = last_word * layer1_size;
868 ForwardCNgramWordRepresentation(neu1, vocab[last_word].word);
869 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
870 // HIERARCHICAL SOFTMAX
871 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
872 f = 0;
873 l2 = vocab[word].point[d] * layer1_size;
874 // Propagate hidden -> output
875 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
876 if (f <= -MAX_EXP) continue;
877 else if (f >= MAX_EXP) continue;
878 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
879 // 'g' is the gradient multiplied by the learning rate
880 g = (1 - vocab[word].code[d] - f) * alpha;
881 // Propagate errors output -> hidden
882 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
883 // Learn weights hidden -> output
884 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
885 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
886 }
887 // NEGATIVE SAMPLING
888 if (negative > 0) for (d = 0; d < negative + 1; d++) {
889 if (d == 0) {
890 target = word;
891 label = 1;
892 } else {
893 next_random = next_random * (unsigned long long)25214903917 + 11;
894 if(word_to_group != NULL && word_to_group[word] != -1){
895 target = word;
896 while(target == word) {
897 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
898 next_random = next_random * (unsigned long long)25214903917 + 11;
899 }
900 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
901 }
902 else{
903 target = table[(next_random >> 16) % table_size];
904 }
905 if (target == 0) target = next_random % (vocab_size - 1) + 1;
906 if (target == word) continue;
907 label = 0;
908 }
909 l2 = target * layer1_size;
910 f = 0;
911 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
912 if (f > MAX_EXP) g = (label - 1) * alpha;
913 else if (f < -MAX_EXP) g = (label - 0) * alpha;
914 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
915 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
916 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
917 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
918 }
        // Noise Contrastive Estimation
920 if (nce > 0) for (d = 0; d < nce + 1; d++) {
921 if (d == 0) {
922 target = word;
923 label = 1;
924 } else {
925 next_random = next_random * (unsigned long long)25214903917 + 11;
926 if(word_to_group != NULL && word_to_group[word] != -1){
927 target = word;
928 while(target == word) {
929 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
930 next_random = next_random * (unsigned long long)25214903917 + 11;
931 }
932 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
933 }
934 else{
935 target = table[(next_random >> 16) % table_size];
936 }
937 if (target == 0) target = next_random % (vocab_size - 1) + 1;
938 if (target == word) continue;
939 label = 0;
940 }
941 l2 = target * layer1_size;
942 f = 0;
943 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
944 if (f > MAX_EXP) g = (label - 1) * alpha;
945 else if (f < -MAX_EXP) g = (label - 0) * alpha;
946 else {
947 f = exp(f);
948 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
949 }
950 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
951 for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
952 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
953 }
954 // Learn weights input -> hidden
955 BackwardCNgramWordRepresentation(neu1, vocab[last_word].word, neu1e);
956 }
957 }
958 else if(type == 2){ //train the cwindow architecture
959 // in -> hidden
960 cw = 0;
961 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
962 c = sentence_position - window + a;
963 if (c < 0) continue;
964 if (c >= sentence_length) continue;
965 last_word = sen[c];
966 if (last_word == -1) continue;
967 window_offset = a*layer1_size;
968 if (a > window) window_offset-=layer1_size;
969 ForwardCNgramWordRepresentation(&neu1[window_offset], vocab[last_word].word);
970 cw++;
971 }
972 if (cw) {
973 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
974 f = 0;
975 l2 = vocab[word].point[d] * window_layer_size;
976 // Propagate hidden -> output
977 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1_window[c + l2];
978 if (f <= -MAX_EXP) continue;
979 else if (f >= MAX_EXP) continue;
980 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
981 // 'g' is the gradient multiplied by the learning rate
982 g = (1 - vocab[word].code[d] - f) * alpha;
983 // Propagate errors output -> hidden
984 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1_window[c + l2];
985 // Learn weights hidden -> output
986 for (c = 0; c < window_layer_size; c++) syn1_window[c + l2] += g * neu1[c];
987 if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1_window, c + l2);
988 }
989 // NEGATIVE SAMPLING
990 if (negative > 0) for (d = 0; d < negative + 1; d++) {
991 if (d == 0) {
992 target = word;
993 label = 1;
994 } else {
995 next_random = next_random * (unsigned long long)25214903917 + 11;
996 if(word_to_group != NULL && word_to_group[word] != -1){
997 target = word;
998 while(target == word) {
999 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1000 next_random = next_random * (unsigned long long)25214903917 + 11;
1001 }
1002 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1003 }
1004 else{
1005 target = table[(next_random >> 16) % table_size];
1006 }
1007 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1008 if (target == word) continue;
1009 label = 0;
1010 }
1011 l2 = target * window_layer_size;
1012 f = 0;
1013 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1neg_window[c + l2];
1014 if (f > MAX_EXP) g = (label - 1) * alpha;
1015 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1016 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
1017 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1neg_window[c + l2];
1018 for (c = 0; c < window_layer_size; c++) syn1neg_window[c + l2] += g * neu1[c];
1019 if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1neg_window, c + l2);
1020 }
1021 // Noise Contrastive Estimation
1022 if (nce > 0) for (d = 0; d < nce + 1; d++) {
1023 if (d == 0) {
1024 target = word;
1025 label = 1;
1026 } else {
1027 next_random = next_random * (unsigned long long)25214903917 + 11;
1028 if(word_to_group != NULL && word_to_group[word] != -1){
1029 target = word;
1030 while(target == word) {
1031 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1032 next_random = next_random * (unsigned long long)25214903917 + 11;
1033 }
1034 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1035 }
1036 else{
1037 target = table[(next_random >> 16) % table_size];
1038 }
1039 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1040 if (target == word) continue;
1041 label = 0;
1042 }
1043 l2 = target * window_layer_size;
1044 f = 0;
1045 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1nce_window[c + l2];
1046 if (f > MAX_EXP) g = (label - 1) * alpha;
1047 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1048 else {
1049 f = exp(f);
1050 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
1051 }
1052 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1nce_window[c + l2];
1053 for (c = 0; c < window_layer_size; c++) syn1nce_window[c + l2] += g * neu1[c];
1054 if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1nce_window, c + l2);
1055 }
1056 // hidden -> in
1057 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1058 c = sentence_position - window + a;
1059 if (c < 0) continue;
1060 if (c >= sentence_length) continue;
1061 last_word = sen[c];
1062 if (last_word == -1) continue;
1063 window_offset = a * layer1_size;
1064 if(a > window) window_offset -= layer1_size;
1065 BackwardCNgramWordRepresentation(&neu1[window_offset], vocab[last_word].word, &neu1e[window_offset]);
1066 }
1067 }
1068 }
1069 else if (type == 3){ //train structured skip-gram
1070 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1071 c = sentence_position - window + a;
1072 if (c < 0) continue;
1073 if (c >= sentence_length) continue;
1074 last_word = sen[c];
1075 if (last_word == -1) continue;
1076 l1 = last_word * layer1_size;
1077 window_offset = a * layer1_size;
1078 if(a > window) window_offset -= layer1_size;
1079 ForwardCNgramWordRepresentation(neu1, vocab[last_word].word);
1080 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
1081 // HIERARCHICAL SOFTMAX
1082 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1083 f = 0;
1084 l2 = vocab[word].point[d] * window_layer_size;
1085 // Propagate hidden -> output
1086 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1_window[c + l2 + window_offset];
1087 if (f <= -MAX_EXP) continue;
1088 else if (f >= MAX_EXP) continue;
1089 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1090 // 'g' is the gradient multiplied by the learning rate
1091 g = (1 - vocab[word].code[d] - f) * alpha;
1092 // Propagate errors output -> hidden
1093 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1_window[c + l2 + window_offset];
1094 // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1_window[c + l2 + window_offset] += g * neu1[c]; // update syn1_window (not syn1), matching the forward pass above
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1_window, c + l2 + window_offset);
1097 }
1098 // NEGATIVE SAMPLING
1099 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1100 if (d == 0) {
1101 target = word;
1102 label = 1;
1103 } else {
1104 next_random = next_random * (unsigned long long)25214903917 + 11;
1105 if(word_to_group != NULL && word_to_group[word] != -1){
1106 target = word;
1107 while(target == word) {
1108 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1109 next_random = next_random * (unsigned long long)25214903917 + 11;
1110 }
1111 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1112 }
1113 else{
1114 target = table[(next_random >> 16) % table_size];
1115 }
1116 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1117 if (target == word) continue;
1118 label = 0;
1119 }
1120 l2 = target * window_layer_size;
1121 f = 0;
1122 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg_window[c + l2 + window_offset];
1123 if (f > MAX_EXP) g = (label - 1) * alpha;
1124 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1125 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
1126 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg_window[c + l2 + window_offset];
1127 for (c = 0; c < layer1_size; c++) syn1neg_window[c + l2 + window_offset] += g * neu1[c];
1128 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg_window, c + l2 + window_offset);
1129 }
        // Noise Contrastive Estimation
1131 if (nce > 0) for (d = 0; d < nce + 1; d++) {
1132 if (d == 0) {
1133 target = word;
1134 label = 1;
1135 } else {
1136 next_random = next_random * (unsigned long long)25214903917 + 11;
1137 if(word_to_group != NULL && word_to_group[word] != -1){
1138 target = word;
1139 while(target == word) {
1140 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1141 next_random = next_random * (unsigned long long)25214903917 + 11;
1142 }
1143 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1144 }
1145 else{
1146 target = table[(next_random >> 16) % table_size];
1147 }
1148 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1149 if (target == word) continue;
1150 label = 0;
1151 }
1152 l2 = target * window_layer_size;
1153 f = 0;
1154 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce_window[c + l2 + window_offset];
1155 if (f > MAX_EXP) g = (label - 1) * alpha;
1156 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1157 else {
1158 f = exp(f);
1159 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
1160 }
1161 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce_window[c + l2 + window_offset];
1162 for (c = 0; c < layer1_size; c++) syn1nce_window[c + l2 + window_offset] += g * neu1[c];
1163 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce_window, c + l2 + window_offset);
1164 }
1165 // Learn weights input -> hidden
1166 BackwardCNgramWordRepresentation(neu1, vocab[last_word].word, neu1e);
1167 }
1168 }
1169 else if(type == 4){ //training senna
1170 // in -> hidden
1171 cw = 0;
1172 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1173 c = sentence_position - window + a;
1174 if (c < 0) continue;
1175 if (c >= sentence_length) continue;
1176 last_word = sen[c];
1177 if (last_word == -1) continue;
1178 window_offset = a*layer1_size;
1179 if (a > window) window_offset-=layer1_size;
1180 for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
1181 cw++;
1182 }
1183 if (cw) {
1184 for (a = 0; a < window_hidden_size; a++){
1185 c = a*window_layer_size;
1186 for(b = 0; b < window_layer_size; b++){
1187 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1188 }
1189 }
1190 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1191 f = 0;
1192 l2 = vocab[word].point[d] * window_hidden_size;
1193 // Propagate hidden -> output
1194 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1195 if (f <= -MAX_EXP) continue;
1196 else if (f >= MAX_EXP) continue;
1197 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1198 // 'g' is the gradient multiplied by the learning rate
1199 g = (1 - vocab[word].code[d] - f) * alpha;
1200 // Propagate errors output -> hidden
1201 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word[c + l2];
1202 // Learn weights hidden -> output
1203 for (c = 0; c < window_hidden_size; c++) syn_hidden_word[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1204 }
1205 // NEGATIVE SAMPLING
1206 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1207 if (d == 0) {
1208 target = word;
1209 label = 1;
1210 } else {
1211 next_random = next_random * (unsigned long long)25214903917 + 11;
1212 if(word_to_group != NULL && word_to_group[word] != -1){
1213 target = word;
1214 while(target == word) {
1215 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1216 next_random = next_random * (unsigned long long)25214903917 + 11;
1217 }
1218 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1219 }
1220 else{
1221 target = table[(next_random >> 16) % table_size];
1222 }
1223 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1224 if (target == word) continue;
1225 label = 0;
1226 }
1227 l2 = target * window_hidden_size;
1228 f = 0;
1229 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
1230 if (f > MAX_EXP) g = (label - 1) * alpha / negative;
1231 else if (f < -MAX_EXP) g = (label - 0) * alpha / negative;
1232 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha / negative;
1233 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word_neg[c + l2];
1234 for (c = 0; c < window_hidden_size; c++) syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1235 }
1236 for (a = 0; a < window_hidden_size; a++)
1237 for(b = 0; b < window_layer_size; b++)
1238 neu1e[b] += neu2e[a] * syn_window_hidden[a*window_layer_size + b];
1239 for (a = 0; a < window_hidden_size; a++)
1240 for(b = 0; b < window_layer_size; b++)
1241 syn_window_hidden[a*window_layer_size + b] += neu2e[a] * neu1[b];
1242 // hidden -> in
1243 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1244 c = sentence_position - window + a;
1245 if (c < 0) continue;
1246 if (c >= sentence_length) continue;
1247 last_word = sen[c];
1248 if (last_word == -1) continue;
1249 window_offset = a * layer1_size;
1250 if(a > window) window_offset -= layer1_size;
1251 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
1252 }
1253 }
1254 }
1255 else{
      printf("unknown type %i\n", type);
      exit(1);
1258 }
1259 sentence_position++;
1260 if (sentence_position >= sentence_length) {
1261 sentence_length = 0;
1262 continue;
1263 }
1264 }
1265 fclose(fi);
  free(neu1);
  free(neu1e);
  free(neu2);
  free(neu2e);
1268 pthread_exit(NULL);
1269}
1270
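// Builds the vocabularies, runs the training threads, and writes the output vectors; each word
// vector (including the words listed in extra_vocab_file) is composed from its character n-grams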
1271void TrainModel() {
1272 long a, b;
1273 long extra_words;
1274 FILE *fo;
1275 FILE *fi;
1276 pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
1277 printf("Starting training using file %s\n", train_file);
1278 starting_alpha = alpha;
1279 if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
1280 if (save_vocab_file[0] != 0) SaveVocab();
1281 if (output_file[0] == 0) return;
1282 InitNet();
1283 if (negative > 0 || nce > 0) InitUnigramTable();
1284 if (negative_classes_file[0] != 0) InitClassUnigramTable();
1285 start = clock();
1286 for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
1287 for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
1288 fo = fopen(output_file, "wb");
1289 if (classes == 0) {
1290 // Save the word vectors
1291 real neu1[layer1_size];
1292
1293 // count extra words
1294 extra_words = 0;
1295 fi = fopen(extra_vocab_file, "rb");
1296 if (fi != NULL) {
1297 char word[MAX_STRING];
1298 while(1){
1299 ReadWord(word, fi);
1300 if(feof(fi)) break;
1301 extra_words++;
1302 ReadWord(word, fi);
1303 }
      fclose(fi); // only close the file if it was actually opened
    }
1306 fprintf(fo, "%lld %lld\n", vocab_size + extra_words, layer1_size);
1307 for (a = 0; a < vocab_size; a++) {
1308 fprintf(fo, "%s ", vocab[a].word);
1309 for (b = 0; b < layer1_size; b++) neu1[b] = 0;
1310 ForwardCNgramWordRepresentation(neu1, vocab[a].word);
1311
1312 if (binary) for (b = 0; b < layer1_size; b++) fwrite(&neu1[b], sizeof(real), 1, fo);
1313 else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", neu1[b]);
1314 fprintf(fo, "\n");
1315 }
1316 fi = fopen(extra_vocab_file, "rb");
1317 if (fi != NULL) {
1318 char word[MAX_STRING];
1319 while(1){
1320 ReadWord(word, fi);
1321 if(feof(fi)) break;
1322 for (b = 0; b < layer1_size; b++) neu1[b] = 0;
1323 fprintf(fo, "%s ", word);
1324 ForwardCNgramWordRepresentation(neu1, word);
1325 if (binary) for (b = 0; b < layer1_size; b++) fwrite(&neu1[b], sizeof(real), 1, fo);
1326 else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", neu1[b]);
1327 fprintf(fo, "\n");
1328 ReadWord(word, fi);
1329 }
      fclose(fi); // only close the file if it was actually opened
    }
1332 }
1333 fclose(fo);
1334}
1335
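// Returns the position of the command-line flag 'str' in argv, or -1 if it is not present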
1336int ArgPos(char *str, int argc, char **argv) {
1337 int a;
1338 for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
1339 if (a == argc - 1) {
1340 printf("Argument missing for %s\n", str);
1341 exit(1);
1342 }
1343 return a;
1344 }
1345 return -1;
1346}
1347
1348int main(int argc, char **argv) {
1349 int i;
1350 if (argc == 1) {
1351 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1352 printf("Options:\n");
1353 printf("Parameters for training:\n");
1354 printf("\t-train <file>\n");
1355 printf("\t\tUse text data from <file> to train the model\n");
1356 printf("\t-output <file>\n");
1357 printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
1358 printf("\t-size <int>\n");
1359 printf("\t\tSet size of word vectors; default is 100\n");
1360 printf("\t-window <int>\n");
1361 printf("\t\tSet max skip length between words; default is 5\n");
1362 printf("\t-sample <float>\n");
1363 printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1364 printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1365 printf("\t-hs <int>\n");
1366 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1367 printf("\t-negative <int>\n");
1368 printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1369 printf("\t-negative-classes <file>\n");
1370 printf("\t\tNegative classes to sample from\n");
1371 printf("\t-nce <int>\n");
    printf("\t\tNumber of negative examples for NCE; default is 10, common values are 3 - 10 (0 = not used)\n");
1373 printf("\t-threads <int>\n");
1374 printf("\t\tUse <int> threads (default 12)\n");
1375 printf("\t-iter <int>\n");
1376 printf("\t\tRun more training iterations (default 5)\n");
1377 printf("\t-cngram-size <int>\n");
    printf("\t\tSet the size of the character n-grams; default is 6\n");
1379 printf("\t-extra_vocab_file <file>\n");
    printf("\t\tAlso write output vectors for the extra words listed in <file> (one word per line)\n");
1381 printf("\t-min-count <int>\n");
1382 printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
1383 printf("\t-alpha <float>\n");
1384 printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1385 printf("\t-classes <int>\n");
1386 printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1387 printf("\t-debug <int>\n");
1388 printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
1389 printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1391 printf("\t-save-vocab <file>\n");
1392 printf("\t\tThe vocabulary will be saved to <file>\n");
1393 printf("\t-read-vocab <file>\n");
1394 printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1395 printf("\t-type <int>\n");
1396 printf("\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1397 printf("\t-cap <int>\n");
1398 printf("\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1399 printf("\nExamples:\n");
1400 printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3 -cngram-size 4 -extra_vocab_file extra.txt \n\n");
1401 return 0;
1402 }
1403 output_file[0] = 0;
1404 save_vocab_file[0] = 0;
1405 read_vocab_file[0] = 0;
1406 negative_classes_file[0] = 0;
1407 if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
1408 if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
1409 if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
1410 if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
1411 if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
1412 if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
1413 if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
1414 if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
1415 if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
1416 if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
1417 if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
1418 if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
1419 if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
1420 if ((i = ArgPos((char *)"-nce", argc, argv)) > 0) nce = atoi(argv[i + 1]);
1421 if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
1422 if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
1423 if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
1424 if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
1425 if ((i = ArgPos((char *)"-cap", argc, argv)) > 0) cap = atoi(argv[i + 1]);
1426 if ((i = ArgPos((char *)"-cngram-size", argc, argv)) > 0) cngram_size = atoi(argv[i + 1]);
1427 if ((i = ArgPos((char *)"-extra_vocab_file", argc, argv)) > 0) strcpy(extra_vocab_file, argv[i + 1]);
1428 if (type==0 || type==2 || type==4) alpha = 0.05;
1429 if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
1430 vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
1431 vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
1432 cngram_vocab = (struct vocab_word *)calloc(cngram_vocab_max_size, sizeof(struct vocab_word));
1433 cngram_vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
1434 expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1435 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1436 expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1437 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1438 }
1439 TrainModel();
1440 return 0;
1441}
1442