// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>   // for clock(), clock_t and CLOCKS_PER_SEC used below

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

const int vocab_hash_size = 30000000;  // Maximum 30M * 0.7 = 21M words in the vocabulary

typedef float real;                    // Precision of float numbers

struct vocab_word {
  long long cn;                        // word count
  int *point;                          // Huffman tree path (indices of inner nodes)
  char *word, *code, codelen;          // word string, Huffman code and its length
};

char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
struct vocab_word *vocab;
int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
int *vocab_hash;
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
clock_t start;

real *syn1_window, *syn1neg_window, *syn1nce_window;
int w_offset, window_layer_size;

int window_hidden_size = 500;
real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg, *syn_hidden_word_nce;

int hs = 0, negative = 5;
const int table_size = 1e8;
int *table;

// contrastive negative sampling
char negative_classes_file[MAX_STRING];
int *word_to_group;
int *group_to_table; // group_size * table_size
int class_number;

// nce
real *noise_distribution;
int nce = 0;

// param caps
real CAP_VALUE = 50;
int cap = 0;

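// Parameter capping: when the -cap flag is set (cap == 1), capParam() clamps a
// single weight to the range [-CAP_VALUE, CAP_VALUE] after each update, so one
// large gradient step cannot blow up an output-layer parameter.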
void capParam(real *array, int index) {
  if (array[index] > CAP_VALUE)
    array[index] = CAP_VALUE;
  else if (array[index] < -CAP_VALUE)
    array[index] = -CAP_VALUE;
}

real hardTanh(real x) {
  if (x >= 1) {
    return 1;
  } else if (x <= -1) {
    return -1;
  } else {
    return x;
  }
}

real dHardTanh(real x, real g) {
  if (x > 1 && g > 0) {
    return 0;
  }
  if (x < -1 && g < 0) {
    return 0;
  }
  return 1;
}

int isEndOfSentence(char *word) {
  return strcmp("</s>", word) == 0;
}

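// The negative-sampling table below is filled so that word w occupies a share of
// the table proportional to count(w)^0.75, the usual word2vec smoothing of the
// unigram distribution; the same powered distribution is also stored in
// noise_distribution for use by NCE.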
void InitUnigramTable() {
  int a, i;
  long long train_words_pow = 0;
  real d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (real)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }

  noise_distribution = (real *)calloc(vocab_size, sizeof(real));
  for (a = 0; a < vocab_size; a++) noise_distribution[a] = pow(vocab[a].cn, power) / (real)train_words_pow;
}

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>");
        return;
      } else continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
  }
  word[a] = 0;
}

// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1;
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}

// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}

// Adds a word to the vocabulary
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING) length = MAX_STRING;
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word);
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;
}

// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}

// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Hash has to be re-computed, as it is no longer valid after the sorting
      hash = GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}

// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
    vocab[b].cn = vocab[a].cn;
    vocab[b].word = vocab[a].word;
    b++;
  } else free(vocab[a].word);
  vocab_size = b;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // Hash has to be re-computed, as it is no longer valid
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}

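// Hierarchical softmax uses the Huffman tree built below: every vocabulary word
// gets a binary code (its path of left/right decisions) and a list of inner-node
// indices in 'point'. Frequent words end up with short codes, so the average
// number of output-layer updates per training word stays small.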
// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    count[vocab_size + a] = count[min1i] + count[min2i];
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1;
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size; a++) {
    b = a;
    i = 0;
    while (1) {
      code[i] = binary[b];
      point[i] = b;
      i++;
      b = parent_node[b];
      if (b == vocab_size * 2 - 2) break;
    }
    vocab[a].codelen = i;
    vocab[a].point[0] = vocab_size - 2;
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];
      vocab[a].point[i - b] = point[b] - vocab_size;
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}

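// Training-file format (as this code reads it): every line is one sentence whose
// FIRST token is a numeric weight for that sentence, followed by the words, e.g.
//
//   1.5 this sentence counts one and a half times
//   0.2 this one is down-weighted
//
// LearnVocabFromTrainFile() skips the leading weight token when counting words,
// and TrainModelThread() parses it with atof() into currentWeight, which scales
// the learning rate for every update made from that sentence. (The example
// weights above are only illustrative.)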
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  int startOfLine = 1;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    if (startOfLine) {
      // The first token of each line is the sentence weight; read past it
      ReadWord(word, fin);
      startOfLine = 0;
    }
    if (isEndOfSentence(word)) {
      startOfLine = 1;
    }
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}

void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}

void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}

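// The -negative-classes file is read as repeating triples of whitespace-separated
// tokens: a class label, a word, and one further token that is read but ignored
// here (presumably a count). Entries with the same class label are expected to be
// grouped together, since a new class id is only assigned when the label changes.
// For every class, a separate unigram^0.75 sampling table is then built, so
// negative samples for a word are drawn from that word's own class.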
void InitClassUnigramTable() {
  long long a, c;
  printf("loading class unigrams \n");
  FILE *fin = fopen(negative_classes_file, "rb");
  if (fin == NULL) {
    printf("ERROR: class file not found!\n");
    exit(1);
  }
  word_to_group = (int *)malloc(vocab_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) word_to_group[a] = -1;
  char class[MAX_STRING];
  char prev_class[MAX_STRING];
  prev_class[0] = 0;
  char word[MAX_STRING];
  class_number = -1;
  while (1) {
    if (feof(fin)) break;
    ReadWord(class, fin);
    ReadWord(word, fin);
    int word_index = SearchVocab(word);
    if (word_index != -1) {
      if (strcmp(class, prev_class) != 0) {
        class_number++;
        strcpy(prev_class, class);
      }
      word_to_group[word_index] = class_number;
    }
    ReadWord(word, fin);
  }
  class_number++;
  fclose(fin);

  group_to_table = (int *)malloc(table_size * class_number * sizeof(int));
  long long train_words_pow = 0;
  real d1, power = 0.75;

  for (c = 0; c < class_number; c++) {
    long long offset = c * table_size;
    train_words_pow = 0;
    for (a = 0; a < vocab_size; a++) if (word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
    int i = 0;
    // Check the index bound before reading word_to_group[i]
    while (i < vocab_size && word_to_group[i] != c) i++;
    d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
    for (a = 0; a < table_size; a++) {
      //printf("index %lld , word %d\n", a, i);
      group_to_table[offset + a] = i;
      if (a / (real)table_size > d1) {
        i++;
        while (i < vocab_size && word_to_group[i] != c) i++;
        d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
      }
      if (i >= vocab_size) {
        // Fall back to the last word of this class instead of reading past the array
        i = vocab_size - 1;
        while (i >= 0 && word_to_group[i] != c) i--;
      }
    }
  }
}

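// InitNet() allocates the network: syn0 (the input word embeddings) is initialised
// with small random values, while the output-side matrices are zero-initialised.
// Which output matrices exist depends on the objective (syn1* for hierarchical
// softmax, syn1neg* for negative sampling, syn1nce* for NCE) and on the
// architecture (the *_window variants hold one block of layer1_size weights per
// context position; the senna-type model additionally uses a window->hidden layer).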
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;
  window_layer_size = layer1_size * window * 2;
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}

  if (hs) {
    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn1_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
    if (syn1_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn_hidden_word, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word == NULL) {printf("Memory allocation failed\n"); exit(1);}

    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
      syn1_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
      syn_hidden_word[a * window_hidden_size + b] = 0;
  }
  if (negative > 0) {
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn1neg_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
    if (syn1neg_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn_hidden_word_neg, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word_neg == NULL) {printf("Memory allocation failed\n"); exit(1);}

    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1neg[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
      syn1neg_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
      syn_hidden_word_neg[a * window_hidden_size + b] = 0;
  }
  if (nce > 0) {
    a = posix_memalign((void **)&syn1nce, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn1nce_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
    if (syn1nce_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn_hidden_word_nce, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word_nce == NULL) {printf("Memory allocation failed\n"); exit(1);}

    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1nce[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
      syn1nce_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
      syn_hidden_word_nce[a * window_hidden_size + b] = 0;
  }
  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
  }

  a = posix_memalign((void **)&syn_window_hidden, 128, window_hidden_size * window_layer_size * sizeof(real));
  if (syn_window_hidden == NULL) {printf("Memory allocation failed\n"); exit(1);}
  for (a = 0; a < window_hidden_size * window_layer_size; a++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    syn_window_hidden[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (window_hidden_size * window_layer_size);
  }

  CreateBinaryTree();
}

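// Each training thread starts reading at roughly (file_size / num_threads) * id;
// findStartOfLine() walks backwards from that byte offset until reading a word
// there yields "</s>" (i.e. it has hit the newline ending the previous sentence)
// and returns the position just after it, so every thread begins at the start of
// a weighted sentence line.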
long long findStartOfLine(char *file, long long start) {
  char word[MAX_STRING];
  if (start == 0) return 0;
  while (start != 0) {
    FILE *fi = fopen(file, "rb");
    fseek(fi, start, SEEK_SET);
    ReadWord(word, fi);
    if (isEndOfSentence(word)) {
      fclose(fi);
      return start + 1;
    }
    fclose(fi);
    start--;
  }
  return 0;
}

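// TrainModelThread() implements all five architectures selected by -type:
//   0 = CBOW, 1 = skip-gram, 2 = cwindow (one layer1_size block per context position),
//   3 = structured skip-gram (position-dependent output blocks),
//   4 = senna-type model with an extra hard-tanh hidden layer.
// Every sentence read from the training file carries a per-sentence weight
// (currentWeight); all gradient steps below are scaled by alpha * currentWeight.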
void *TrainModelThread(void *id) {
  char word_str[MAX_STRING];
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  int input_len_1 = layer1_size;
  int window_offset = -1;
  float currentWeight = 0;
  if (type == 2 || type == 4) {
    input_len_1 = window_layer_size;
  }
  real *neu1 = (real *)calloc(input_len_1, sizeof(real));
  real *neu1e = (real *)calloc(input_len_1, sizeof(real));

  int input_len_2 = 0;
  if (type == 4) {
    input_len_2 = window_hidden_size;
  }
  real *neu2 = (real *)calloc(input_len_2, sizeof(real));
  real *neu2e = (real *)calloc(input_len_2, sizeof(real));

  long long start_pos = findStartOfLine(train_file, file_size / (long long)num_threads * (long long)id);
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, start_pos, SEEK_SET);
  int startOfSentence = 1;
  int startEndOfLineIndex = SearchVocab("</s>");
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f Weight: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, currentWeight,
          word_count_actual / (real)(iter * train_words + 1) * 100,
          word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        if (startOfSentence) {
          // First token of the line: parse the per-sentence weight
          ReadWord(word_str, fi);
          currentWeight = atof(word_str);
          startOfSentence = 0;
          continue;
        }
        word = ReadWordIndex(fi);
        if (word == startEndOfLineIndex) {
          startOfSentence = 1;
        }
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      startOfSentence = 1;   // start_pos points at the beginning of a line again
      fseek(fi, start_pos, SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < input_len_1; c++) neu1[c] = 0;
    for (c = 0; c < input_len_1; c++) neu1e[c] = 0;
    for (c = 0; c < input_len_2; c++) neu2[c] = 0;
    for (c = 0; c < input_len_2; c++) neu2e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (type == 0) {  // train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;

          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha * currentWeight;
          }
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
          for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
        }
        // hidden -> in
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else if (type == 1) {  // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha * currentWeight;
          }
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
          for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    else if (type == 2) {  // train the cwindow architecture
      // in -> hidden
      cw = 0;
      for (a = 0; a < window * 2 + 1; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        window_offset = a * layer1_size;
        if (a > window) window_offset -= layer1_size;
        for (c = 0; c < layer1_size; c++) neu1[c + window_offset] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * window_layer_size;
          // Propagate hidden -> output
          for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1_window[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1_window[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < window_layer_size; c++) syn1_window[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1_window, c + l2);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1neg_window[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight;
          for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1neg_window[c + l2];
          for (c = 0; c < window_layer_size; c++) syn1neg_window[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1neg_window, c + l2);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1nce_window[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha * currentWeight;
          }
          for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1nce_window[c + l2];
          for (c = 0; c < window_layer_size; c++) syn1nce_window[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1nce_window, c + l2);
        }
        // hidden -> in
        for (a = 0; a < window * 2 + 1; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          window_offset = a * layer1_size;
          if (a > window) window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
        }
      }
    }
    else if (type == 3) {  // train structured skip-gram
      for (a = 0; a < window * 2 + 1; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        window_offset = a * layer1_size;
        if (a > window) window_offset -= layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * window_layer_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1_window[c + l2 + window_offset];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1_window[c + l2 + window_offset];
          // Learn weights hidden -> output (updates syn1_window; the forward pass above
          // also reads syn1_window, and l2 is scaled by window_layer_size, so updating
          // syn1 here instead would index out of bounds)
          for (c = 0; c < layer1_size; c++) syn1_window[c + l2 + window_offset] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1_window, c + l2 + window_offset);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg_window[c + l2 + window_offset];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg_window[c + l2 + window_offset];
          for (c = 0; c < layer1_size; c++) syn1neg_window[c + l2 + window_offset] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg_window, c + l2 + window_offset);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce_window[c + l2 + window_offset];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha * currentWeight;
          }
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce_window[c + l2 + window_offset];
          for (c = 0; c < layer1_size; c++) syn1nce_window[c + l2 + window_offset] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce_window, c + l2 + window_offset);
        }
        // Learn weights input -> hidden, clamping each entry to [-50, 50]
        for (c = 0; c < layer1_size; c++) {syn0[c + l1] += neu1e[c]; if (syn0[c + l1] > 50) syn0[c + l1] = 50; if (syn0[c + l1] < -50) syn0[c + l1] = -50;}
      }
    }
    else if (type == 4) {  // train the senna-type model
      // in -> hidden
      cw = 0;
      for (a = 0; a < window * 2 + 1; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        window_offset = a * layer1_size;
        if (a > window) window_offset -= layer1_size;
        for (c = 0; c < layer1_size; c++) neu1[c + window_offset] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        for (a = 0; a < window_hidden_size; a++) {
          c = a * window_layer_size;
          for (b = 0; b < window_layer_size; b++) {
            neu2[a] += syn_window_hidden[c + b] * neu1[b];
          }
        }
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * window_hidden_size;
          // Propagate hidden -> output
          for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c], g) * g * syn_hidden_word[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < window_hidden_size; c++) syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g * neu2[c];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_hidden_size;
          f = 0;
          for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight / negative;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight / negative;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight / negative;
          for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c], g) * g * syn_hidden_word_neg[c + l2];
          for (c = 0; c < window_hidden_size; c++) syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g) * g * neu2[c];
        }
        for (a = 0; a < window_hidden_size; a++)
          for (b = 0; b < window_layer_size; b++)
            neu1e[b] += neu2e[a] * syn_window_hidden[a * window_layer_size + b];
        for (a = 0; a < window_hidden_size; a++)
          for (b = 0; b < window_layer_size; b++)
            syn_window_hidden[a * window_layer_size + b] += neu2e[a] * neu1[b];
        // hidden -> in
        for (a = 0; a < window * 2 + 1; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          window_offset = a * layer1_size;
          if (a > window) window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
        }
      }
    }
    else {
      printf("unknown type %d\n", type);
      exit(0);
    }
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  free(neu2);
  free(neu2e);
  pthread_exit(NULL);
}

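// TrainModel() drives the whole run: build or load the vocabulary, allocate the
// network, build the sampling tables, train with num_threads pthreads, and then
// either write the word vectors (text or binary, depending on -binary) or, when
// -classes > 0, run K-means on the vectors and write word classes instead.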
void TrainModel() {
  long a, b, c, d;
  FILE *fo;
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;
  if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
  if (save_vocab_file[0] != 0) SaveVocab();
  if (output_file[0] == 0) return;
  InitNet();
  if (negative > 0 || nce > 0) InitUnigramTable();
  if (negative_classes_file[0] != 0) InitClassUnigramTable();
  start = clock();
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
  fo = fopen(output_file, "wb");
  if (classes == 0) {
    // Save the word vectors
    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    for (a = 0; a < vocab_size; a++) {
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  } else {
    // Run K-means on the word vectors
    int clcn = classes, iter = 10, closeid;
    int *centcn = (int *)malloc(classes * sizeof(int));
    int *cl = (int *)calloc(vocab_size, sizeof(int));
    real closev, x;
    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
    for (a = 0; a < iter; a++) {
      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
      for (b = 0; b < clcn; b++) centcn[b] = 1;
      for (c = 0; c < vocab_size; c++) {
        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
        centcn[cl[c]]++;
      }
      for (b = 0; b < clcn; b++) {
        closev = 0;
        for (c = 0; c < layer1_size; c++) {
          cent[layer1_size * b + c] /= centcn[b];
          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
        }
        closev = sqrt(closev);
        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
      }
      for (c = 0; c < vocab_size; c++) {
        closev = -10;
        closeid = 0;
        for (d = 0; d < clcn; d++) {
          x = 0;
          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
          if (x > closev) {
            closev = x;
            closeid = d;
          }
        }
        cl[c] = closeid;
      }
    }
    // Save the K-means classes
    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
    free(centcn);
    free(cent);
    free(cl);
  }
  fclose(fo);
}

int ArgPos(char *str, int argc, char **argv) {
  int a;
  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
    if (a == argc - 1) {
      printf("Argument missing for %s\n", str);
      exit(1);
    }
    return a;
  }
  return -1;
}

int main(int argc, char **argv) {
  int i;
  if (argc == 1) {
    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-hs <int>\n");
    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-negative-classes <file>\n");
    printf("\t\tNegative classes to sample from\n");
    printf("\t-nce <int>\n");
    printf("\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
    printf("\t-classes <int>\n");
    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\t-type <int>\n");
    printf("\t\tType of embeddings (0 for cbow, 1 for skipgram, 2 for cwindow, 3 for structured skipgram, 4 for senna type)\n");
    printf("\t-cap <int>\n");
    printf("\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
    printf("\nExamples:\n");
    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
    return 0;
  }
  output_file[0] = 0;
  save_vocab_file[0] = 0;
  read_vocab_file[0] = 0;
  negative_classes_file[0] = 0;
  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-nce", argc, argv)) > 0) nce = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-cap", argc, argv)) > 0) cap = atoi(argv[i + 1]);
  if (type == 0 || type == 2 || type == 4) alpha = 0.05;
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);   // Precompute the exp() table
    expTable[i] = expTable[i] / (expTable[i] + 1);                     // Precompute f(x) = x / (x + 1)
  }
  TrainModel();
  return 0;
}