1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20#include "collocatordb.h"
21
22#define MAX_STRING 100
23#define EXP_TABLE_SIZE 1000
24#define MAX_EXP 6
25#define MAX_SENTENCE_LENGTH 1000
26#define MAX_CODE_LENGTH 40
27
28const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
29
30typedef float real; // Precision of float numbers
31
32struct vocab_word {
33 long long cn;
34 int *point;
35 char *word, *code, codelen;
36};
37
38char train_file[MAX_STRING], output_file[MAX_STRING];
39char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
40struct vocab_word *vocab;
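// type selects the training architecture: 0 = cbow, 1 = skip-gram, 2 = cwindow,
// 3 = structured skip-gram, 4 = senna-style, 5 = store positional bigrams
// (collocation counting); see the -type option in main().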
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
42int *vocab_hash;
43long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
44long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
45real alpha = 0.025, starting_alpha, sample = 1e-3;
46real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
47clock_t start;
48
49real *syn1_window, *syn1neg_window, *syn1nce_window;
50int w_offset, window_layer_size;
51
52int window_hidden_size = 500;
53real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg, *syn_hidden_word_nce;
54
55int hs = 0, negative = 5;
56const int table_size = 1e8;
57int *table;
58
59//contrastive negative sampling
60char negative_classes_file[MAX_STRING];
61int *word_to_group;
62int *group_to_table; //group_size*table_size
63int class_number;
64
65//nce
66real* noise_distribution;
67int nce = 0;
68
69//param caps
70real CAP_VALUE = 50;
71int cap = 0;
72
73COLLOCATORS *cdb = NULL;
74
75void capParam(real* array, int index){
76 if(array[index] > CAP_VALUE)
77 array[index] = CAP_VALUE;
78 else if(array[index] < -CAP_VALUE)
79 array[index] = -CAP_VALUE;
80}
81
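// Hard tanh activation (clipped to [-1, 1]) and its gradient gate, used on the
// hidden layer of the senna-style architecture (type 4).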
82real hardTanh(real x){
83 if(x>=1){
84 return 1;
85 }
86 else if(x<=-1){
87 return -1;
88 }
89 else{
90 return x;
91 }
92}
93
94real dHardTanh(real x, real g){
95 if(x > 1 && g > 0){
96 return 0;
97 }
98 if(x < -1 && g < 0){
99 return 0;
100 }
101 return 1;
102}
103
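// Build the table used for negative sampling: each word gets a share of the table
// proportional to its count raised to the power 0.75, so frequent words are drawn
// more often, but not in direct proportion to their frequency. The same distribution
// is stored in noise_distribution for NCE.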
104void InitUnigramTable() {
105 int a, i;
106 long long train_words_pow = 0;
107 real d1, power = 0.75;
108 table = (int *)malloc(table_size * sizeof(int));
109 for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
110 i = 0;
111 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
112 for (a = 0; a < table_size; a++) {
113 table[a] = i;
114 if (a / (real)table_size > d1) {
115 i++;
116 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
117 }
118 if (i >= vocab_size) i = vocab_size - 1;
119 }
120
121 noise_distribution = (real *)calloc(vocab_size, sizeof(real));
122 for (a = 0; a < vocab_size; a++) noise_distribution[a] = pow(vocab[a].cn, power)/(real)train_words_pow;
123}
124
125// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
126void ReadWord(char *word, FILE *fin) {
127 int a = 0, ch;
128 while (!feof(fin)) {
129 ch = fgetc(fin);
130 if (ch == 13) continue;
131 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
132 if (a > 0) {
133 if (ch == '\n') ungetc(ch, fin);
134 break;
135 }
136 if (ch == '\n') {
137 strcpy(word, (char *)"</s>");
138 return;
139 } else continue;
140 }
141 word[a] = ch;
142 a++;
143 if (a >= MAX_STRING - 1) a--; // Truncate words that are too long
144 }
145 word[a] = 0;
146}
147
148// Returns hash value of a word
149int GetWordHash(char *word) {
150 unsigned long long a, hash = 0;
151 for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
152 hash = hash % vocab_hash_size;
153 return hash;
154}
155
156// Returns position of a word in the vocabulary; if the word is not found, returns -1
157int SearchVocab(char *word) {
158 unsigned int hash = GetWordHash(word);
159 while (1) {
160 if (vocab_hash[hash] == -1) return -1;
161 if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
162 hash = (hash + 1) % vocab_hash_size;
163 }
164 return -1;
165}
166
167// Reads a word and returns its index in the vocabulary
168int ReadWordIndex(FILE *fin) {
169 char word[MAX_STRING];
170 ReadWord(word, fin);
171 if (feof(fin)) return -1;
172 return SearchVocab(word);
173}
174
175// Adds a word to the vocabulary
176int AddWordToVocab(char *word) {
177 unsigned int hash, length = strlen(word) + 1;
178 if (length > MAX_STRING) length = MAX_STRING;
179 vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
180 strcpy(vocab[vocab_size].word, word);
181 vocab[vocab_size].cn = 0;
182 vocab_size++;
183 // Reallocate memory if needed
184 if (vocab_size + 2 >= vocab_max_size) {
185 vocab_max_size += 1000;
186 vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
187 }
188 hash = GetWordHash(word);
189 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
190 vocab_hash[hash] = vocab_size - 1;
191 return vocab_size - 1;
192}
193
194// Used later for sorting by word counts
195int VocabCompare(const void *a, const void *b) {
196 return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
197}
198
199// Sorts the vocabulary by frequency using word counts
200void SortVocab() {
201 int a, size;
202 unsigned int hash;
203 // Sort the vocabulary and keep </s> at the first position
204 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
205 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
206 size = vocab_size;
207 train_words = 0;
208 for (a = 0; a < size; a++) {
209 // Words occurring less than min_count times will be discarded from the vocab
210 if ((vocab[a].cn < min_count) && (a != 0)) {
211 vocab_size--;
212 free(vocab[a].word);
213 } else {
214 // Hash will be re-computed, as it is no longer valid after sorting
215 hash=GetWordHash(vocab[a].word);
216 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
217 vocab_hash[hash] = a;
218 train_words += vocab[a].cn;
219 }
220 }
221 vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
222 // Allocate memory for the binary tree construction
223 for (a = 0; a < vocab_size; a++) {
224 vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
225 vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
226 }
227}
228
229// Reduces the vocabulary by removing infrequent tokens
230void ReduceVocab() {
231 int a, b = 0;
232 unsigned int hash;
233 for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
234 vocab[b].cn = vocab[a].cn;
235 vocab[b].word = vocab[a].word;
236 b++;
237 } else free(vocab[a].word);
238 vocab_size = b;
239 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
240 for (a = 0; a < vocab_size; a++) {
241 // Hash will be re-computed, as it is no longer valid
242 hash = GetWordHash(vocab[a].word);
243 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
244 vocab_hash[hash] = a;
245 }
246 fflush(stdout);
247 min_reduce++;
248}
249
250// Create binary Huffman tree using the word counts
251// Frequent words will have short unique binary codes
252void CreateBinaryTree() {
253 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
254 char code[MAX_CODE_LENGTH];
255 long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
256 long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
257 long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
258 for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
259 for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
260 pos1 = vocab_size - 1;
261 pos2 = vocab_size;
262 // The following algorithm constructs the Huffman tree by adding one node at a time
263 for (a = 0; a < vocab_size - 1; a++) {
264 // First, find two smallest nodes 'min1, min2'
265 if (pos1 >= 0) {
266 if (count[pos1] < count[pos2]) {
267 min1i = pos1;
268 pos1--;
269 } else {
270 min1i = pos2;
271 pos2++;
272 }
273 } else {
274 min1i = pos2;
275 pos2++;
276 }
277 if (pos1 >= 0) {
278 if (count[pos1] < count[pos2]) {
279 min2i = pos1;
280 pos1--;
281 } else {
282 min2i = pos2;
283 pos2++;
284 }
285 } else {
286 min2i = pos2;
287 pos2++;
288 }
289 count[vocab_size + a] = count[min1i] + count[min2i];
290 parent_node[min1i] = vocab_size + a;
291 parent_node[min2i] = vocab_size + a;
292 binary[min2i] = 1;
293 }
294 // Now assign binary code to each vocabulary word
295 for (a = 0; a < vocab_size; a++) {
296 b = a;
297 i = 0;
298 while (1) {
299 code[i] = binary[b];
300 point[i] = b;
301 i++;
302 b = parent_node[b];
303 if (b == vocab_size * 2 - 2) break;
304 }
305 vocab[a].codelen = i;
306 vocab[a].point[0] = vocab_size - 2;
307 for (b = 0; b < i; b++) {
308 vocab[a].code[i - b - 1] = code[b];
309 vocab[a].point[i - b] = point[b] - vocab_size;
310 }
311 }
312 free(count);
313 free(binary);
314 free(parent_node);
315}
316
317void LearnVocabFromTrainFile() {
318 char word[MAX_STRING];
319 FILE *fin;
320 long long a, i;
321 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
322 fin = fopen(train_file, "rb");
323 if (fin == NULL) {
324 printf("ERROR: training data file not found!\n");
325 exit(1);
326 }
327 vocab_size = 0;
328 AddWordToVocab((char *)"</s>");
329 while (1) {
330 ReadWord(word, fin);
331 if (feof(fin)) break;
332 train_words++;
333 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
334 printf("%lldK%c", train_words / 1000, 13);
335 fflush(stdout);
336 }
337 i = SearchVocab(word);
338 if (i == -1) {
339 a = AddWordToVocab(word);
340 vocab[a].cn = 1;
341 } else vocab[i].cn++;
342 if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
343 }
344 SortVocab();
345 if (debug_mode > 0) {
346 printf("Vocab size: %lld\n", vocab_size);
347 printf("Words in train file: %lld\n", train_words);
348 }
349 file_size = ftell(fin);
350 fclose(fin);
351}
352
353void SaveVocab() {
354 long long i;
355 FILE *fo = fopen(save_vocab_file, "wb");
356 for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
357 fclose(fo);
358}
359
360void ReadVocab() {
361 long long a, i = 0;
362 char c;
363 char word[MAX_STRING];
364 FILE *fin = fopen(read_vocab_file, "rb");
365 if (fin == NULL) {
366 printf("Vocabulary file not found\n");
367 exit(1);
368 }
369 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
370 vocab_size = 0;
371 while (1) {
372 ReadWord(word, fin);
373 if (feof(fin)) break;
374 a = AddWordToVocab(word);
375 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
376 i++;
377 }
378 SortVocab();
379 if (debug_mode > 0) {
380 printf("Vocab size: %lld\n", vocab_size);
381 printf("Words in train file: %lld\n", train_words);
382 }
383 fin = fopen(train_file, "rb");
384 if (fin == NULL) {
385 printf("ERROR: training data file not found!\n");
386 exit(1);
387 }
388 fseek(fin, 0, SEEK_END);
389 file_size = ftell(fin);
390 fclose(fin);
391}
392
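// Read word -> class assignments from negative_classes_file and build one unigram
// table per class, so that negative samples can be drawn from the same class as the
// current target word.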
393void InitClassUnigramTable() {
394 long long a,c;
395 printf("loading class unigrams \n");
396 FILE *fin = fopen(negative_classes_file, "rb");
397 if (fin == NULL) {
398 printf("ERROR: class file not found!\n");
399 exit(1);
400 }
401 word_to_group = (int *)malloc(vocab_size * sizeof(int));
402 for(a = 0; a < vocab_size; a++) word_to_group[a] = -1;
403 char class[MAX_STRING];
404 char prev_class[MAX_STRING];
405 prev_class[0] = 0;
406 char word[MAX_STRING];
407 class_number = -1;
408 while (1) {
409 if (feof(fin)) break;
410 ReadWord(class, fin);
411 ReadWord(word, fin);
412 int word_index = SearchVocab(word);
413 if (word_index != -1){
414 if(strcmp(class, prev_class) != 0){
415 class_number++;
416 strcpy(prev_class, class);
417 }
418 word_to_group[word_index] = class_number;
419 }
420 ReadWord(word, fin);
421 }
422 class_number++;
423 fclose(fin);
424
425 group_to_table = (int *)malloc(table_size * class_number * sizeof(int));
426 long long train_words_pow = 0;
427 real d1, power = 0.75;
428
429 for(c = 0; c < class_number; c++){
430 long long offset = c * table_size;
431 train_words_pow = 0;
432 for (a = 0; a < vocab_size; a++) if(word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
433 int i = 0;
434 while(word_to_group[i]!=c && i < vocab_size) i++;
435 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
436 for (a = 0; a < table_size; a++) {
437 //printf("index %lld , word %d\n", a, i);
438 group_to_table[offset + a] = i;
439 if (a / (real)table_size > d1) {
440 i++;
441 while(word_to_group[i]!=c && i < vocab_size) i++;
442 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
443 }
444 if (i >= vocab_size) while(word_to_group[i]!=c && i >= 0) i--;
445 }
446 }
447}
448
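// Allocate the network: the input word vectors (syn0) are initialized to small random
// values, while all output weight matrices (hs, negative-sampling and NCE variants)
// start at zero.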
449void InitNet() {
450 long long a, b;
451 unsigned long long next_random = 1;
452 window_layer_size = layer1_size*window*2;
453 a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
454 if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
455
456 if (hs) {
457 a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
458 if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
459 a = posix_memalign((void **)&syn1_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
460 if (syn1_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
461 a = posix_memalign((void **)&syn_hidden_word, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
462 if (syn_hidden_word == NULL) {printf("Memory allocation failed\n"); exit(1);}
463
464 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
465 syn1[a * layer1_size + b] = 0;
466 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
467 syn1_window[a * window_layer_size + b] = 0;
468 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
469 syn_hidden_word[a * window_hidden_size + b] = 0;
470 }
471 if (negative>0) {
472 a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
473 if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
474 a = posix_memalign((void **)&syn1neg_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
475 if (syn1neg_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
476 a = posix_memalign((void **)&syn_hidden_word_neg, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
477 if (syn_hidden_word_neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
478
479 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
480 syn1neg[a * layer1_size + b] = 0;
481 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
482 syn1neg_window[a * window_layer_size + b] = 0;
483 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
484 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
485 }
486 if (nce>0) {
487 a = posix_memalign((void **)&syn1nce, 128, (long long)vocab_size * layer1_size * sizeof(real));
488 if (syn1nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
489 a = posix_memalign((void **)&syn1nce_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
490 if (syn1nce_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
491 a = posix_memalign((void **)&syn_hidden_word_nce, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
492 if (syn_hidden_word_nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
493
494 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
495 syn1nce[a * layer1_size + b] = 0;
496 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
497 syn1nce_window[a * window_layer_size + b] = 0;
498 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
499 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
500 }
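// Randomly initialize the input word vectors in (-0.5, 0.5) / layer1_size using a
// simple linear congruential generator (the same constants are reused for sampling
// during training).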
501 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
502 next_random = next_random * (unsigned long long)25214903917 + 11;
503 syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
504 }
505
506 a = posix_memalign((void **)&syn_window_hidden, 128, window_hidden_size * window_layer_size * sizeof(real));
507 if (syn_window_hidden == NULL) {printf("Memory allocation failed\n"); exit(1);}
508 for (a = 0; a < window_hidden_size * window_layer_size; a++){
509 next_random = next_random * (unsigned long long)25214903917 + 11;
510 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (window_hidden_size*window_layer_size);
511 }
512
513 CreateBinaryTree();
514}
515
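// One training thread: each thread starts reading at its own offset into the training
// file, processes roughly train_words / num_threads words per iteration, and decays the
// shared learning rate alpha according to the global progress counter word_count_actual.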
516void *TrainModelThread(void *id) {
517 long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
518 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
519 long long l1, l2, c, target, label, local_iter = iter;
520 unsigned long long next_random = (long long)id;
521 real f, g;
522 clock_t now;
523 int input_len_1 = layer1_size;
524 int window_offset = -1;
525 if(type == 2 || type == 4){
526 input_len_1=window_layer_size;
527 }
528 real *neu1 = (real *)calloc(input_len_1, sizeof(real));
529 real *neu1e = (real *)calloc(input_len_1, sizeof(real));
530
531 int input_len_2 = 0;
532 if(type == 4){
533 input_len_2 = window_hidden_size;
534 }
535 real *neu2 = (real *)calloc(input_len_2, sizeof(real));
536 real *neu2e = (real *)calloc(input_len_2, sizeof(real));
537
538 FILE *fi = fopen(train_file, "rb");
539 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
540 while (1) {
541 if (word_count - last_word_count > 10000) {
542 word_count_actual += word_count - last_word_count;
543 last_word_count = word_count;
544 if ((debug_mode > 1)) {
545 now=clock();
546 printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
547 word_count_actual / (real)(iter * train_words + 1) * 100,
548 word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
549 fflush(stdout);
550 }
551 alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
552 if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
553 }
554 if (sentence_length == 0) {
555 while (1) {
556 word = ReadWordIndex(fi);
557 if (feof(fi)) break;
558 if (word == -1) continue;
559 word_count++;
560 if (word == 0) break;
561 // The subsampling randomly discards frequent words while keeping the ranking the same
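// A word is kept with probability (sqrt(f/t) + 1) * t/f, where f = cn / train_words and t = sample (capped at 1).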
562 if (sample > 0) {
563 real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
564 next_random = next_random * (unsigned long long)25214903917 + 11;
565 if (ran < (next_random & 0xFFFF) / (real)65536) continue;
566 }
567 sen[sentence_length] = word;
568 sentence_length++;
569 if (sentence_length >= MAX_SENTENCE_LENGTH) break;
570 }
571 sentence_position = 0;
572 }
573 if (feof(fi) || (word_count > train_words / num_threads)) {
574 word_count_actual += word_count - last_word_count;
575 local_iter--;
576 if (local_iter == 0) break;
577 word_count = 0;
578 last_word_count = 0;
579 sentence_length = 0;
580 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
581 continue;
582 }
583 word = sen[sentence_position];
584 if (word == -1) continue;
585 for (c = 0; c < input_len_1; c++) neu1[c] = 0;
586 for (c = 0; c < input_len_1; c++) neu1e[c] = 0;
587 for (c = 0; c < input_len_2; c++) neu2[c] = 0;
588 for (c = 0; c < input_len_2; c++) neu2e[c] = 0;
589 next_random = next_random * (unsigned long long)25214903917 + 11;
590 b = next_random % window;
591 if (type == 0) { //train the cbow architecture
592 // in -> hidden
593 cw = 0;
594 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
595 c = sentence_position - window + a;
596 if (c < 0) continue;
597 if (c >= sentence_length) continue;
598 last_word = sen[c];
599 if (last_word == -1) continue;
600 for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
601 cw++;
602 }
603 if (cw) {
604 for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
605 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
606 f = 0;
607 l2 = vocab[word].point[d] * layer1_size;
608 // Propagate hidden -> output
609 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
610 if (f <= -MAX_EXP) continue;
611 else if (f >= MAX_EXP) continue;
612 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
613 // 'g' is the gradient multiplied by the learning rate
614 g = (1 - vocab[word].code[d] - f) * alpha;
615 // Propagate errors output -> hidden
616 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
617 // Learn weights hidden -> output
618 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
619 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
620 }
621 // NEGATIVE SAMPLING
622 if (negative > 0) for (d = 0; d < negative + 1; d++) {
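// d == 0 is the positive example (the target word itself); the remaining draws are
// negatives, taken from the class-restricted table when negative classes are loaded
// and from the global unigram table otherwise, never sampling the target word itself.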
623 if (d == 0) {
624 target = word;
625 label = 1;
626 } else {
627 next_random = next_random * (unsigned long long)25214903917 + 11;
628 if(word_to_group != NULL && word_to_group[word] != -1){
629 target = word;
630 while(target == word) {
631 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
632 next_random = next_random * (unsigned long long)25214903917 + 11;
633 }
634 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
635 }
636 else{
637 target = table[(next_random >> 16) % table_size];
638 }
639 if (target == 0) target = next_random % (vocab_size - 1) + 1;
640 if (target == word) continue;
641 label = 0;
642 }
643 l2 = target * layer1_size;
644 f = 0;
645 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
646 if (f > MAX_EXP) g = (label - 1) * alpha;
647 else if (f < -MAX_EXP) g = (label - 0) * alpha;
648 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
649 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
650 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
651 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
652 }
653 // Noise Contrastive Estimation
654 if (nce > 0) for (d = 0; d < nce + 1; d++) {
655 if (d == 0) {
656 target = word;
657 label = 1;
658 } else {
659 next_random = next_random * (unsigned long long)25214903917 + 11;
660 if(word_to_group != NULL && word_to_group[word] != -1){
661 target = word;
662 while(target == word) {
663 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
664 next_random = next_random * (unsigned long long)25214903917 + 11;
665 }
666 }
667 else{
668 target = table[(next_random >> 16) % table_size];
669 }
670 if (target == 0) target = next_random % (vocab_size - 1) + 1;
671 if (target == word) continue;
672 label = 0;
673 }
674 l2 = target * layer1_size;
675 f = 0;
676
677 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
678 if (f > MAX_EXP) g = (label - 1) * alpha;
679 else if (f < -MAX_EXP) g = (label - 0) * alpha;
680 else {
681 f = exp(f);
682 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
683 }
684 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
685 for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
686 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce,c + l2);
687 }
688 // hidden -> in
689 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
690 c = sentence_position - window + a;
691 if (c < 0) continue;
692 if (c >= sentence_length) continue;
693 last_word = sen[c];
694 if (last_word == -1) continue;
695 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
696 }
697 }
698 } else if(type==1) { //train skip-gram
699 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
700 c = sentence_position - window + a;
701 if (c < 0) continue;
702 if (c >= sentence_length) continue;
703 last_word = sen[c];
704 if (last_word == -1) continue;
705 l1 = last_word * layer1_size;
706 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
707 // HIERARCHICAL SOFTMAX
708 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
709 f = 0;
710 l2 = vocab[word].point[d] * layer1_size;
711 // Propagate hidden -> output
712 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
713 if (f <= -MAX_EXP) continue;
714 else if (f >= MAX_EXP) continue;
715 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
716 // 'g' is the gradient multiplied by the learning rate
717 g = (1 - vocab[word].code[d] - f) * alpha;
718 // Propagate errors output -> hidden
719 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
720 // Learn weights hidden -> output
721 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
722 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
723 }
724 // NEGATIVE SAMPLING
725 if (negative > 0) for (d = 0; d < negative + 1; d++) {
726 if (d == 0) {
727 target = word;
728 label = 1;
729 } else {
730 next_random = next_random * (unsigned long long)25214903917 + 11;
731 if(word_to_group != NULL && word_to_group[word] != -1){
732 target = word;
733 while(target == word) {
734 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
735 next_random = next_random * (unsigned long long)25214903917 + 11;
736 }
737 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
738 }
739 else{
740 target = table[(next_random >> 16) % table_size];
741 }
742 if (target == 0) target = next_random % (vocab_size - 1) + 1;
743 if (target == word) continue;
744 label = 0;
745 }
746 l2 = target * layer1_size;
747 f = 0;
748 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
749 if (f > MAX_EXP) g = (label - 1) * alpha;
750 else if (f < -MAX_EXP) g = (label - 0) * alpha;
751 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
752 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
753 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
754 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
755 }
756 //Noise Contrastive Estimation
757 if (nce > 0) for (d = 0; d < nce + 1; d++) {
758 if (d == 0) {
759 target = word;
760 label = 1;
761 } else {
762 next_random = next_random * (unsigned long long)25214903917 + 11;
763 if(word_to_group != NULL && word_to_group[word] != -1){
764 target = word;
765 while(target == word) {
766 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
767 next_random = next_random * (unsigned long long)25214903917 + 11;
768 }
769 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
770 }
771 else{
772 target = table[(next_random >> 16) % table_size];
773 }
774 if (target == 0) target = next_random % (vocab_size - 1) + 1;
775 if (target == word) continue;
776 label = 0;
777 }
778 l2 = target * layer1_size;
779 f = 0;
780 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce[c + l2];
781 if (f > MAX_EXP) g = (label - 1) * alpha;
782 else if (f < -MAX_EXP) g = (label - 0) * alpha;
783 else {
784 f = exp(f);
785 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
786 }
787 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
788 for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * syn0[c + l1];
789 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
790 }
791 // Learn weights input -> hidden
792 for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
793 }
794 }
795 else if(type == 2){ //train the cwindow architecture
796 // in -> hidden
797 cw = 0;
798 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
799 c = sentence_position - window + a;
800 if (c < 0) continue;
801 if (c >= sentence_length) continue;
802 last_word = sen[c];
803 if (last_word == -1) continue;
804 window_offset = a*layer1_size;
805 if (a > window) window_offset-=layer1_size;
806 for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
807 cw++;
808 }
809 if (cw) {
810 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
811 f = 0;
812 l2 = vocab[word].point[d] * window_layer_size;
813 // Propagate hidden -> output
814 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1_window[c + l2];
815 if (f <= -MAX_EXP) continue;
816 else if (f >= MAX_EXP) continue;
817 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
818 // 'g' is the gradient multiplied by the learning rate
819 g = (1 - vocab[word].code[d] - f) * alpha;
820 // Propagate errors output -> hidden
821 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1_window[c + l2];
822 // Learn weights hidden -> output
823 for (c = 0; c < window_layer_size; c++) syn1_window[c + l2] += g * neu1[c];
824 if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1_window, c + l2);
825 }
826 // NEGATIVE SAMPLING
827 if (negative > 0) for (d = 0; d < negative + 1; d++) {
828 if (d == 0) {
829 target = word;
830 label = 1;
831 } else {
832 next_random = next_random * (unsigned long long)25214903917 + 11;
833 if(word_to_group != NULL && word_to_group[word] != -1){
834 target = word;
835 while(target == word) {
836 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
837 next_random = next_random * (unsigned long long)25214903917 + 11;
838 }
839 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
840 }
841 else{
842 target = table[(next_random >> 16) % table_size];
843 }
844 if (target == 0) target = next_random % (vocab_size - 1) + 1;
845 if (target == word) continue;
846 label = 0;
847 }
848 l2 = target * window_layer_size;
849 f = 0;
850 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1neg_window[c + l2];
851 if (f > MAX_EXP) g = (label - 1) * alpha;
852 else if (f < -MAX_EXP) g = (label - 0) * alpha;
853 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
854 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1neg_window[c + l2];
855 for (c = 0; c < window_layer_size; c++) syn1neg_window[c + l2] += g * neu1[c];
856 if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1neg_window, c + l2);
857 }
858 // Noise Contrastive Estimation
859 if (nce > 0) for (d = 0; d < nce + 1; d++) {
860 if (d == 0) {
861 target = word;
862 label = 1;
863 } else {
864 next_random = next_random * (unsigned long long)25214903917 + 11;
865 if(word_to_group != NULL && word_to_group[word] != -1){
866 target = word;
867 while(target == word) {
868 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
869 next_random = next_random * (unsigned long long)25214903917 + 11;
870 }
871 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
872 }
873 else{
874 target = table[(next_random >> 16) % table_size];
875 }
876 if (target == 0) target = next_random % (vocab_size - 1) + 1;
877 if (target == word) continue;
878 label = 0;
879 }
880 l2 = target * window_layer_size;
881 f = 0;
882 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1nce_window[c + l2];
883 if (f > MAX_EXP) g = (label - 1) * alpha;
884 else if (f < -MAX_EXP) g = (label - 0) * alpha;
885 else {
886 f = exp(f);
887 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
888 }
889 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1nce_window[c + l2];
890 for (c = 0; c < window_layer_size; c++) syn1nce_window[c + l2] += g * neu1[c];
891 if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1nce_window, c + l2);
892 }
893 // hidden -> in
894 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
895 c = sentence_position - window + a;
896 if (c < 0) continue;
897 if (c >= sentence_length) continue;
898 last_word = sen[c];
899 if (last_word == -1) continue;
900 window_offset = a * layer1_size;
901 if(a > window) window_offset -= layer1_size;
902 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
903 }
904 }
905 }
906 else if (type == 3){ //train structured skip-gram
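// In the structured variant every relative context position owns its own layer1_size
// block inside the window_layer_size output vectors; window_offset selects that block
// (positions after the target are shifted down by one because a == window is skipped).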
907 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
908 c = sentence_position - window + a;
909 if (c < 0) continue;
910 if (c >= sentence_length) continue;
911 last_word = sen[c];
912 if (last_word == -1) continue;
913 l1 = last_word * layer1_size;
914 window_offset = a * layer1_size;
915 if(a > window) window_offset -= layer1_size;
916 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
917 // HIERARCHICAL SOFTMAX
918 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
919 f = 0;
920 l2 = vocab[word].point[d] * window_layer_size;
921 // Propagate hidden -> output
922 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1_window[c + l2 + window_offset];
923 if (f <= -MAX_EXP) continue;
924 else if (f >= MAX_EXP) continue;
925 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
926 // 'g' is the gradient multiplied by the learning rate
927 g = (1 - vocab[word].code[d] - f) * alpha;
928 // Propagate errors output -> hidden
929 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1_window[c + l2 + window_offset];
930 // Learn weights hidden -> output
931 for (c = 0; c < layer1_size; c++) syn1_window[c + l2 + window_offset] += g * syn0[c + l1];
932 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1_window, c + l2 + window_offset);
933 }
934 // NEGATIVE SAMPLING
935 if (negative > 0) for (d = 0; d < negative + 1; d++) {
936 if (d == 0) {
937 target = word;
938 label = 1;
939 } else {
940 next_random = next_random * (unsigned long long)25214903917 + 11;
941 if(word_to_group != NULL && word_to_group[word] != -1){
942 target = word;
943 while(target == word) {
944 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
945 next_random = next_random * (unsigned long long)25214903917 + 11;
946 }
947 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
948 }
949 else{
950 target = table[(next_random >> 16) % table_size];
951 }
952 if (target == 0) target = next_random % (vocab_size - 1) + 1;
953 if (target == word) continue;
954 label = 0;
955 }
956 l2 = target * window_layer_size;
957 f = 0;
958 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg_window[c + l2 + window_offset];
959 if (f > MAX_EXP) g = (label - 1) * alpha;
960 else if (f < -MAX_EXP) g = (label - 0) * alpha;
961 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
962 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg_window[c + l2 + window_offset];
963 for (c = 0; c < layer1_size; c++) syn1neg_window[c + l2 + window_offset] += g * syn0[c + l1];
964 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg_window, c + l2 + window_offset);
965 }
966 // Noise Contrastive Estimation
967 if (nce > 0) for (d = 0; d < nce + 1; d++) {
968 if (d == 0) {
969 target = word;
970 label = 1;
971 } else {
972 next_random = next_random * (unsigned long long)25214903917 + 11;
973 if(word_to_group != NULL && word_to_group[word] != -1){
974 target = word;
975 while(target == word) {
976 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
977 next_random = next_random * (unsigned long long)25214903917 + 11;
978 }
979 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
980 }
981 else{
982 target = table[(next_random >> 16) % table_size];
983 }
984 if (target == 0) target = next_random % (vocab_size - 1) + 1;
985 if (target == word) continue;
986 label = 0;
987 }
988 l2 = target * window_layer_size;
989 f = 0;
990 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce_window[c + l2 + window_offset];
991 if (f > MAX_EXP) g = (label - 1) * alpha;
992 else if (f < -MAX_EXP) g = (label - 0) * alpha;
993 else {
994 f = exp(f);
995 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
996 }
997 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce_window[c + l2 + window_offset];
998 for (c = 0; c < layer1_size; c++) syn1nce_window[c + l2 + window_offset] += g * syn0[c + l1];
999 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce_window, c + l2 + window_offset);
1000 }
1001 // Learn weights input -> hidden
1002 for (c = 0; c < layer1_size; c++) {syn0[c + l1] += neu1e[c]; if(syn0[c + l1] > 50) syn0[c + l1] = 50; if(syn0[c + l1] < -50) syn0[c + l1] = -50;}
1003 }
1004 }
1005 else if(type == 4){ //training senna
1006 // in -> hidden
1007 cw = 0;
1008 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1009 c = sentence_position - window + a;
1010 if (c < 0) continue;
1011 if (c >= sentence_length) continue;
1012 last_word = sen[c];
1013 if (last_word == -1) continue;
1014 window_offset = a*layer1_size;
1015 if (a > window) window_offset-=layer1_size;
1016 for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
1017 cw++;
1018 }
1019 if (cw) {
1020 for (a = 0; a < window_hidden_size; a++){
1021 c = a*window_layer_size;
1022 for(b = 0; b < window_layer_size; b++){
1023 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1024 }
1025 }
1026 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1027 f = 0;
1028 l2 = vocab[word].point[d] * window_hidden_size;
1029 // Propagate hidden -> output
1030 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1031 if (f <= -MAX_EXP) continue;
1032 else if (f >= MAX_EXP) continue;
1033 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1034 // 'g' is the gradient multiplied by the learning rate
1035 g = (1 - vocab[word].code[d] - f) * alpha;
1036 // Propagate errors output -> hidden
1037 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word[c + l2];
1038 // Learn weights hidden -> output
1039 for (c = 0; c < window_hidden_size; c++) syn_hidden_word[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1040 }
1041 // NEGATIVE SAMPLING
1042 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1043 if (d == 0) {
1044 target = word;
1045 label = 1;
1046 } else {
1047 next_random = next_random * (unsigned long long)25214903917 + 11;
1048 if(word_to_group != NULL && word_to_group[word] != -1){
1049 target = word;
1050 while(target == word) {
1051 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1052 next_random = next_random * (unsigned long long)25214903917 + 11;
1053 }
1054 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1055 }
1056 else{
1057 target = table[(next_random >> 16) % table_size];
1058 }
1059 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1060 if (target == word) continue;
1061 label = 0;
1062 }
1063 l2 = target * window_hidden_size;
1064 f = 0;
1065 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
1066 if (f > MAX_EXP) g = (label - 1) * alpha / negative;
1067 else if (f < -MAX_EXP) g = (label - 0) * alpha / negative;
1068 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha / negative;
1069 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word_neg[c + l2];
1070 for (c = 0; c < window_hidden_size; c++) syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1071 }
1072 for (a = 0; a < window_hidden_size; a++)
1073 for(b = 0; b < window_layer_size; b++)
1074 neu1e[b] += neu2e[a] * syn_window_hidden[a*window_layer_size + b];
1075 for (a = 0; a < window_hidden_size; a++)
1076 for(b = 0; b < window_layer_size; b++)
1077 syn_window_hidden[a*window_layer_size + b] += neu2e[a] * neu1[b];
1078 // hidden -> in
1079 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1080 c = sentence_position - window + a;
1081 if (c < 0) continue;
1082 if (c >= sentence_length) continue;
1083 last_word = sen[c];
1084 if (last_word == -1) continue;
1085 window_offset = a * layer1_size;
1086 if(a > window) window_offset -= layer1_size;
1087 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
1088 }
1089 }
1090 } else if(type == 5) {
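// type 5: no training; for every context word, record its co-occurrence with the
// target word at the given relative position. In this version the pair is only
// printed; the collocator database handle cdb is opened in main() when -type 5 is set.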
1091 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1092 c = sentence_position - window + a;
1093 if (c < 0) continue;
1094 if (c >= sentence_length) continue;
1095 last_word = sen[c];
1096 if (last_word == -1) continue;
1097 printf("storing %s %s - %d\n", vocab[word].word, vocab[last_word].word, a - window);
1098 cw++;
1099 }
1100 }
1101 else{
1102 printf("unknown type %i", type);
1103 exit(0);
1104 }
1105 sentence_position++;
1106 if (sentence_position >= sentence_length) {
1107 sentence_length = 0;
1108 continue;
1109 }
1110 }
1111 fclose(fi);
1112 free(neu1);
1113 free(neu1e);
1114 pthread_exit(NULL);
1115}
1116
1117void TrainModel() {
1118 long a, b, c, d;
1119 FILE *fo;
1120 pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
1121 printf("Starting training using file %s\n", train_file);
1122 starting_alpha = alpha;
1123 if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
1124 if (save_vocab_file[0] != 0) SaveVocab();
1125 if (output_file[0] == 0) return;
1126 InitNet();
1127 if (negative > 0 || nce > 0) InitUnigramTable();
1128 if (negative_classes_file[0] != 0) InitClassUnigramTable();
1129 start = clock();
1130 for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
1131 for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
1132 fo = fopen(output_file, "wb");
1133 if (classes == 0) {
1134 // Save the word vectors
1135 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1136 for (a = 0; a < vocab_size; a++) {
1137 fprintf(fo, "%s ", vocab[a].word);
1138 if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1139 else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1140 fprintf(fo, "\n");
1141 }
1142 } else {
1143 // Run K-means on the word vectors
1144 int clcn = classes, iter = 10, closeid;
1145 int *centcn = (int *)malloc(classes * sizeof(int));
1146 int *cl = (int *)calloc(vocab_size, sizeof(int));
1147 real closev, x;
1148 real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
1149 for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
1150 for (a = 0; a < iter; a++) {
1151 for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
1152 for (b = 0; b < clcn; b++) centcn[b] = 1;
1153 for (c = 0; c < vocab_size; c++) {
1154 for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1155 centcn[cl[c]]++;
1156 }
1157 for (b = 0; b < clcn; b++) {
1158 closev = 0;
1159 for (c = 0; c < layer1_size; c++) {
1160 cent[layer1_size * b + c] /= centcn[b];
1161 closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
1162 }
1163 closev = sqrt(closev);
1164 for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
1165 }
1166 for (c = 0; c < vocab_size; c++) {
1167 closev = -10;
1168 closeid = 0;
1169 for (d = 0; d < clcn; d++) {
1170 x = 0;
1171 for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
1172 if (x > closev) {
1173 closev = x;
1174 closeid = d;
1175 }
1176 }
1177 cl[c] = closeid;
1178 }
1179 }
1180 // Save the K-means classes
1181 for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1182 free(centcn);
1183 free(cent);
1184 free(cl);
1185 }
1186 fclose(fo);
1187}
1188
1189int ArgPos(char *str, int argc, char **argv) {
1190 int a;
1191 for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
1192 if (a == argc - 1) {
1193 printf("Argument missing for %s\n", str);
1194 exit(1);
1195 }
1196 return a;
1197 }
1198 return -1;
1199}
1200
1201int main(int argc, char **argv) {
1202 int i;
1203 if (argc == 1) {
1204 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1205 printf("Options:\n");
1206 printf("Parameters for training:\n");
1207 printf("\t-train <file>\n");
1208 printf("\t\tUse text data from <file> to train the model\n");
1209 printf("\t-output <file>\n");
1210 printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
1211 printf("\t-size <int>\n");
1212 printf("\t\tSet size of word vectors; default is 100\n");
1213 printf("\t-window <int>\n");
1214 printf("\t\tSet max skip length between words; default is 5\n");
1215 printf("\t-sample <float>\n");
1216 printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1217 printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1218 printf("\t-hs <int>\n");
1219 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1220 printf("\t-negative <int>\n");
1221 printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1222 printf("\t-negative-classes <file>\n");
1223 printf("\t\tNegative classes to sample from\n");
1224 printf("\t-nce <int>\n");
1225 printf("\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1226 printf("\t-threads <int>\n");
1227 printf("\t\tUse <int> threads (default 12)\n");
1228 printf("\t-iter <int>\n");
1229 printf("\t\tRun more training iterations (default 5)\n");
1230 printf("\t-min-count <int>\n");
1231 printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
1232 printf("\t-alpha <float>\n");
1233 printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1234 printf("\t-classes <int>\n");
1235 printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1236 printf("\t-debug <int>\n");
1237 printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
1238 printf("\t-binary <int>\n");
1239 printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1240 printf("\t-save-vocab <file>\n");
1241 printf("\t\tThe vocabulary will be saved to <file>\n");
1242 printf("\t-read-vocab <file>\n");
1243 printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1244 printf("\t-type <int>\n");
1245 printf("\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type, 5 to store positional bigrams)\n");
1246 printf("\t-cap <int>\n");
1247 printf("\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1248 printf("\nExamples:\n");
1249 printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1250 return 0;
1251 }
1252 output_file[0] = 0;
1253 save_vocab_file[0] = 0;
1254 read_vocab_file[0] = 0;
1255 negative_classes_file[0] = 0;
1256 if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
1257 if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
1258 if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
1259 if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
1260 if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
1261 if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
1262 if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
1263 if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
1264 if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
1265 if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
1266 if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
1267 if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
1268 if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
1269 if ((i = ArgPos((char *)"-nce", argc, argv)) > 0) nce = atoi(argv[i + 1]);
1270 if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
1271 if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
1272 if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
1273 if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
1274 if ((i = ArgPos((char *)"-cap", argc, argv)) > 0) cap = atoi(argv[i + 1]);
1275 if (type==0 || type==2 || type==4) alpha = 0.05;
1276 if (type==5) {
1277 sample = 0;
1278 cdb = open_collocators(output_file);
1279 }
1280 if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
1281 vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
1282 vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
1283 expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1284 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1285 expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1286 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1287 }
1288 TrainModel();
1289 return 0;
1290}
1291