// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <time.h>
#include <pthread.h>

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary

typedef float real; // Precision of float numbers

struct vocab_word {
  long long cn;
  int *point;
  char *word, *code, codelen;
};

char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
struct vocab_word *vocab;
int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
int *vocab_hash;
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
clock_t start;

real *syn1_window, *syn1neg_window, *syn1nce_window;
int w_offset, window_layer_size;

int window_hidden_size = 500;
real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg, *syn_hidden_word_nce;

int hs = 0, negative = 5;
const int table_size = 1e8;
int *table;

// contrastive negative sampling
char negative_classes_file[MAX_STRING];
int *word_to_group;
int *group_to_table; // group_size * table_size
int class_number;

// nce (noise-contrastive estimation)
real *noise_distribution;
int nce = 0;

// param caps
real CAP_VALUE = 50;
int cap = 0;

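// Clamps a single weight to [-CAP_VALUE, CAP_VALUE]; used when -cap 1 is set
// to keep parameters from growing without bound during training.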
void capParam(real *array, int index) {
  if (array[index] > CAP_VALUE)
    array[index] = CAP_VALUE;
  else if (array[index] < -CAP_VALUE)
    array[index] = -CAP_VALUE;
}

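// hardTanh and its derivative gate dHardTanh are used by the senna-style
// architecture (type 4) as the non-linearity of the hidden window layer.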
real hardTanh(real x) {
  if (x >= 1) {
    return 1;
  } else if (x <= -1) {
    return -1;
  } else {
    return x;
  }
}

real dHardTanh(real x, real g) {
  if (x > 1 && g > 0) {
    return 0;
  }
  if (x < -1 && g < 0) {
    return 0;
  }
  return 1;
}

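// Builds the table used to draw negative samples: each word occupies a share
// of the table proportional to count(word)^0.75, so frequent words are drawn
// more often, but sub-linearly. The same distribution is also stored in
// noise_distribution for NCE.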
void InitUnigramTable() {
  int a, i;
  long long train_words_pow = 0;
  real d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (real)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }

  noise_distribution = (real *)calloc(vocab_size, sizeof(real));
  for (a = 0; a < vocab_size; a++) noise_distribution[a] = pow(vocab[a].cn, power) / (real)train_words_pow;
}

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>");
        return;
      } else continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;   // Truncate words that are too long
  }
  word[a] = 0;
}

// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1;
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}

// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}

// Adds a word to the vocabulary
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING) length = MAX_STRING;
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word);
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;
}

// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}

// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Hash will be re-computed, as it is no longer valid after the sorting
      hash = GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}

// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
    vocab[b].cn = vocab[a].cn;
    vocab[b].word = vocab[a].word;
    b++;
  } else free(vocab[a].word);
  vocab_size = b;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // Hash will be re-computed, as it is no longer valid
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}

// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    count[vocab_size + a] = count[min1i] + count[min2i];
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1;
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size; a++) {
    b = a;
    i = 0;
    while (1) {
      code[i] = binary[b];
      point[i] = b;
      i++;
      b = parent_node[b];
      if (b == vocab_size * 2 - 2) break;
    }
    vocab[a].codelen = i;
    vocab[a].point[0] = vocab_size - 2;
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];
      vocab[a].point[i - b] = point[b] - vocab_size;
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}

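// Builds the vocabulary by scanning the training file once, counting every
// token. If the vocabulary grows past 70% of the hash table capacity,
// ReduceVocab() prunes the rarest words so the open-addressing hash stays sparse.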
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}

void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}

void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}

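// Loads the -negative-classes file and prepares one unigram table per class so
// that negatives can be drawn from the same class as the current word. Each
// line of the file appears to hold a class label followed by a word; the third
// ReadWord() per iteration consumes the end-of-line token ("</s>").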
void InitClassUnigramTable() {
  long long a, c;
  printf("loading class unigrams \n");
  FILE *fin = fopen(negative_classes_file, "rb");
  if (fin == NULL) {
    printf("ERROR: class file not found!\n");
    exit(1);
  }
  word_to_group = (int *)malloc(vocab_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) word_to_group[a] = -1;
  char class[MAX_STRING];
  char prev_class[MAX_STRING];
  prev_class[0] = 0;
  char word[MAX_STRING];
  class_number = -1;
  while (1) {
    if (feof(fin)) break;
    ReadWord(class, fin);
    ReadWord(word, fin);
    int word_index = SearchVocab(word);
    if (word_index != -1) {
      if (strcmp(class, prev_class) != 0) {
        class_number++;
        strcpy(prev_class, class);
      }
      word_to_group[word_index] = class_number;
    }
    ReadWord(word, fin);
  }
  class_number++;
  fclose(fin);

  group_to_table = (int *)malloc((long long)table_size * class_number * sizeof(int));
  long long train_words_pow = 0;
  real d1, power = 0.75;

  for (c = 0; c < class_number; c++) {
    long long offset = c * table_size;
    train_words_pow = 0;
    for (a = 0; a < vocab_size; a++) if (word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
    int i = 0;
    while (i < vocab_size && word_to_group[i] != c) i++;
    d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
    for (a = 0; a < table_size; a++) {
      //printf("index %lld , word %d\n", a, i);
      group_to_table[offset + a] = i;
      if (a / (real)table_size > d1) {
        i++;
        while (i < vocab_size && word_to_group[i] != c) i++;
        d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
      }
      if (i >= vocab_size) {
        i = vocab_size - 1;
        while (i >= 0 && word_to_group[i] != c) i--;
      }
    }
  }
}

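// Allocates and initializes all weight matrices. Input vectors (syn0) start at
// small random values in [-0.5, 0.5] / layer1_size; the output-side weights
// (hierarchical softmax, negative sampling and NCE, plus their window-sized
// variants) start at zero. Finally the Huffman tree for hierarchical softmax
// is built.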
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;
  window_layer_size = layer1_size * window * 2;
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}

  if (hs) {
    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn1_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
    if (syn1_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn_hidden_word, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word == NULL) {printf("Memory allocation failed\n"); exit(1);}

    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
      syn1_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
      syn_hidden_word[a * window_hidden_size + b] = 0;
  }
  if (negative > 0) {
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn1neg_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
    if (syn1neg_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn_hidden_word_neg, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word_neg == NULL) {printf("Memory allocation failed\n"); exit(1);}

    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1neg[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
      syn1neg_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
      syn_hidden_word_neg[a * window_hidden_size + b] = 0;
  }
  if (nce > 0) {
    a = posix_memalign((void **)&syn1nce, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn1nce_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
    if (syn1nce_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn_hidden_word_nce, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word_nce == NULL) {printf("Memory allocation failed\n"); exit(1);}

    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1nce[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
      syn1nce_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
      syn_hidden_word_nce[a * window_hidden_size + b] = 0;
  }
  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
  }

  a = posix_memalign((void **)&syn_window_hidden, 128, window_hidden_size * window_layer_size * sizeof(real));
  if (syn_window_hidden == NULL) {printf("Memory allocation failed\n"); exit(1);}
  for (a = 0; a < window_hidden_size * window_layer_size; a++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    syn_window_hidden[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (window_hidden_size * window_layer_size);
  }

  CreateBinaryTree();
}

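// One training thread. Each thread works on its own slice of the training
// file, keeps its own sentence buffer and RNG state, and updates the shared
// weight matrices without locking, as in the original word2vec. The -type flag
// selects the architecture: 0 = cbow, 1 = skip-gram, 2 = cwindow,
// 3 = structured skip-gram, 4 = senna-style window network.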
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  int input_len_1 = layer1_size;
  int window_offset = -1;
  if (type == 2 || type == 4) {
    input_len_1 = window_layer_size;
  }
  real *neu1 = (real *)calloc(input_len_1, sizeof(real));
  real *neu1e = (real *)calloc(input_len_1, sizeof(real));

  int input_len_2 = 0;
  if (type == 4) {
    input_len_2 = window_hidden_size;
  }
  real *neu2 = (real *)calloc(input_len_2, sizeof(real));
  real *neu2e = (real *)calloc(input_len_2, sizeof(real));

  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if (debug_mode > 1) {
        now = clock();
        printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
               word_count_actual / (real)(iter * train_words + 1) * 100,
               word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking the same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < input_len_1; c++) neu1[c] = 0;
    for (c = 0; c < input_len_1; c++) neu1e[c] = 0;
    for (c = 0; c < input_len_2; c++) neu2[c] = 0;
    for (c = 0; c < input_len_2; c++) neu2e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (type == 0) {  // train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;

          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha;
          }
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
          for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
        }
        // hidden -> in
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else if (type == 1) {  // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha;
          }
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
          for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    else if (type == 2) {  // train the cwindow architecture
      // in -> hidden
      cw = 0;
      for (a = 0; a < window * 2 + 1; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        window_offset = a * layer1_size;
        if (a > window) window_offset -= layer1_size;
        for (c = 0; c < layer1_size; c++) neu1[c + window_offset] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * window_layer_size;
          // Propagate hidden -> output
          for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1_window[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1_window[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < window_layer_size; c++) syn1_window[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1_window, c + l2);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1neg_window[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1neg_window[c + l2];
          for (c = 0; c < window_layer_size; c++) syn1neg_window[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1neg_window, c + l2);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1nce_window[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha;
          }
          for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1nce_window[c + l2];
          for (c = 0; c < window_layer_size; c++) syn1nce_window[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1nce_window, c + l2);
        }
        // hidden -> in
        for (a = 0; a < window * 2 + 1; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          window_offset = a * layer1_size;
          if (a > window) window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
        }
      }
    }
    else if (type == 3) {  // train structured skip-gram
      for (a = 0; a < window * 2 + 1; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        window_offset = a * layer1_size;
        if (a > window) window_offset -= layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * window_layer_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1_window[c + l2 + window_offset];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1_window[c + l2 + window_offset];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1_window[c + l2 + window_offset] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1_window, c + l2 + window_offset);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg_window[c + l2 + window_offset];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg_window[c + l2 + window_offset];
          for (c = 0; c < layer1_size; c++) syn1neg_window[c + l2 + window_offset] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg_window, c + l2 + window_offset);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce_window[c + l2 + window_offset];
          if (f > MAX_EXP) g = (label - 1) * alpha;
          else if (f < -MAX_EXP) g = (label - 0) * alpha;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha;
          }
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce_window[c + l2 + window_offset];
          for (c = 0; c < layer1_size; c++) syn1nce_window[c + l2 + window_offset] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce_window, c + l2 + window_offset);
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) {syn0[c + l1] += neu1e[c]; if (syn0[c + l1] > 50) syn0[c + l1] = 50; if (syn0[c + l1] < -50) syn0[c + l1] = -50;}
      }
    }
    else if (type == 4) {  // training senna
      // in -> hidden
      cw = 0;
      for (a = 0; a < window * 2 + 1; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        window_offset = a * layer1_size;
        if (a > window) window_offset -= layer1_size;
        for (c = 0; c < layer1_size; c++) neu1[c + window_offset] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        for (a = 0; a < window_hidden_size; a++) {
          c = a * window_layer_size;
          for (b = 0; b < window_layer_size; b++) {
            neu2[a] += syn_window_hidden[c + b] * neu1[b];
          }
        }
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * window_hidden_size;
          // Propagate hidden -> output
          for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha;
          // Propagate errors output -> hidden
          for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c], g) * g * syn_hidden_word[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < window_hidden_size; c++) syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g * neu2[c];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_hidden_size;
          f = 0;
          for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha / negative;
          else if (f < -MAX_EXP) g = (label - 0) * alpha / negative;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha / negative;
          for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c], g) * g * syn_hidden_word_neg[c + l2];
          for (c = 0; c < window_hidden_size; c++) syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g) * g * neu2[c];
        }
        for (a = 0; a < window_hidden_size; a++)
          for (b = 0; b < window_layer_size; b++)
            neu1e[b] += neu2e[a] * syn_window_hidden[a * window_layer_size + b];
        for (a = 0; a < window_hidden_size; a++)
          for (b = 0; b < window_layer_size; b++)
            syn_window_hidden[a * window_layer_size + b] += neu2e[a] * neu1[b];
        // hidden -> in
        for (a = 0; a < window * 2 + 1; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          window_offset = a * layer1_size;
          if (a > window) window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
        }
      }
    }
    else {
      printf("unknown type %i", type);
      exit(0);
    }
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  free(neu2);
  free(neu2e);
  pthread_exit(NULL);
}

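// Driver: builds the vocabulary (from the training file or -read-vocab),
// initializes the network, prepares the negative-sampling/NCE tables if
// needed, runs the training threads, and finally writes either the word
// vectors or K-means word classes to the output file.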
void TrainModel() {
  long a, b, c, d;
  FILE *fo;
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;
  if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
  if (save_vocab_file[0] != 0) SaveVocab();
  if (output_file[0] == 0) return;
  InitNet();
  if (negative > 0 || nce > 0) InitUnigramTable();
  if (negative_classes_file[0] != 0) InitClassUnigramTable();
  start = clock();
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
  fo = fopen(output_file, "wb");
  if (classes == 0) {
    // Save the word vectors
    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    for (a = 0; a < vocab_size; a++) {
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  } else {
    // Run K-means on the word vectors
    int clcn = classes, iter = 10, closeid;
    int *centcn = (int *)malloc(classes * sizeof(int));
    int *cl = (int *)calloc(vocab_size, sizeof(int));
    real closev, x;
    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
    for (a = 0; a < iter; a++) {
      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
      for (b = 0; b < clcn; b++) centcn[b] = 1;
      for (c = 0; c < vocab_size; c++) {
        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
        centcn[cl[c]]++;
      }
      for (b = 0; b < clcn; b++) {
        closev = 0;
        for (c = 0; c < layer1_size; c++) {
          cent[layer1_size * b + c] /= centcn[b];
          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
        }
        closev = sqrt(closev);
        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
      }
      for (c = 0; c < vocab_size; c++) {
        closev = -10;
        closeid = 0;
        for (d = 0; d < clcn; d++) {
          x = 0;
          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
          if (x > closev) {
            closev = x;
            closeid = d;
          }
        }
        cl[c] = closeid;
      }
    }
    // Save the K-means classes
    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
    free(centcn);
    free(cent);
    free(cl);
  }
  fclose(fo);
}

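// Returns the position of a command-line flag in argv, or -1 if it is absent;
// exits if the flag is present but its value is missing.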
int ArgPos(char *str, int argc, char **argv) {
  int a;
  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
    if (a == argc - 1) {
      printf("Argument missing for %s\n", str);
      exit(1);
    }
    return a;
  }
  return -1;
}

int main(int argc, char **argv) {
  int i;
  if (argc == 1) {
    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-hs <int>\n");
    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-negative-classes <file>\n");
    printf("\t\tNegative classes to sample from\n");
    printf("\t-nce <int>\n");
    printf("\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
    printf("\t-classes <int>\n");
    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\t-type <int>\n");
    printf("\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
    printf("\t-cap <int>\n");
    printf("\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
    printf("\nExamples:\n");
    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
    return 0;
  }
  output_file[0] = 0;
  save_vocab_file[0] = 0;
  read_vocab_file[0] = 0;
  negative_classes_file[0] = 0;
  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-nce", argc, argv)) > 0) nce = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-cap", argc, argv)) > 0) cap = atoi(argv[i + 1]);
  if (type == 0 || type == 2 || type == 4) alpha = 0.05;
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
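  // expTable[i] holds sigmoid(x) = exp(x) / (exp(x) + 1) for x spread evenly
  // over [-MAX_EXP, MAX_EXP]; during training the sigmoid is looked up here
  // instead of calling exp() for every output.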
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
    expTable[i] = expTable[i] / (expTable[i] + 1);                   // Precompute f(x) = x / (x + 1)
  }
  TrainModel();
  return 0;
}