Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
21#include <pthread.h>
Marc Kupietz613edbf2018-01-11 21:38:03 +010022#include <collocatordb.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010023
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010028#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010029#define MAX_CODE_LENGTH 40
Marc Kupietz178a3c92023-12-22 15:12:27 +010030#define MAX_METADATA_CATEGORIES 4
Marc Kupietzd6f9c712016-03-16 11:50:56 +010031
Marc Kupietz178a3c92023-12-22 15:12:27 +010032#define METADATA_MARKER ' '
Marc Kupietzd6f9c712016-03-16 11:50:56 +010033const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
34
35typedef float real; // Precision of float numbers
36
37struct vocab_word {
38 long long cn;
39 int *point;
40 char *word, *code, codelen;
41};
42
43char train_file[MAX_STRING], output_file[MAX_STRING];
44char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
45char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
Marc Kupietze423f732017-12-22 17:57:03 +010046char magic_stop_file[MAX_STRING];
47
Marc Kupietzd6f9c712016-03-16 11:50:56 +010048struct vocab_word *vocab;
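// The type parameter selects the model architecture handled in TrainModelThread():
// 0 = CBOW, 1 = skip-gram (default), 2 = continuous window (cwindow),
// 3 = structured skip-gram, 4 = SENNA-style window network,
// 5 = only count classic collocators into the collocator database (no vectors saved).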
49int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietz879333c2023-12-20 11:41:09 +010050 num_threads = 12, min_reduce = 1, metadata_categories = 0, expected_metadata_categories = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010051int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020052long long *threadPos;
53int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010054long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
55long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
56 classes = 0;
57real alpha = 0.025, starting_alpha, sample = 1e-3;
58real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020059real avgWordLength=0;
Marc Kupietzb366bcd2018-01-11 21:29:41 +010060clock_t start, start_clock;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010061
62real *syn1_window, *syn1neg_window, *syn1nce_window;
63int w_offset, window_layer_size;
64
65int window_hidden_size = 500;
66real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
67 *syn_hidden_word_nce;
68
69int hs = 0, negative = 5;
70const int table_size = 1e8;
71int *table;
72
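// If cc > 0, ShowCollocations() is called after InitNet() and dumps collocation
// profiles for all vocabulary entries starting at index cc.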
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010073long cc = 0;
74
Marc Kupietzd6f9c712016-03-16 11:50:56 +010075//contrastive negative sampling
76char negative_classes_file[MAX_STRING];
77int *word_to_group;
78int *group_to_table; //group_size*table_size
79int class_number;
80
81//nce
82real* noise_distribution;
83int nce = 0;
84
85//param caps
86real CAP_VALUE = 50;
87int cap = 0;
88
Marc Kupietz613edbf2018-01-11 21:38:03 +010089COLLOCATORDB *cdb = NULL;
90
Marc Kupietzd6f9c712016-03-16 11:50:56 +010091void capParam(real* array, int index) {
92 if (array[index] > CAP_VALUE)
93 array[index] = CAP_VALUE;
94 else if (array[index] < -CAP_VALUE)
95 array[index] = -CAP_VALUE;
96}
97
98real hardTanh(real x) {
99 if (x >= 1) {
100 return 1;
101 } else if (x <= -1) {
102 return -1;
103 } else {
104 return x;
105 }
106}
107
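// Gradient gate for hardTanh (used by the type-4 SENNA-style net): returns 0 only
// when the unit is saturated and the gradient would push it further outside [-1, 1];
// otherwise the slope is taken as 1.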
108real dHardTanh(real x, real g) {
109 if (x > 1 && g > 0) {
110 return 0;
111 }
112 if (x < -1 && g < 0) {
113 return 0;
114 }
115 return 1;
116}
117
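// Builds the table used for negative sampling: each word gets a share of the table
// proportional to cn^0.75, i.e. P(w) = cn(w)^0.75 / sum_v cn(v)^0.75. The same
// distribution is also stored in noise_distribution for NCE.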
118void InitUnigramTable() {
119 int a, i;
120 long long train_words_pow = 0;
121 real d1, power = 0.75;
122 table = (int *) malloc(table_size * sizeof(int));
123 for (a = 0; a < vocab_size; a++)
124 train_words_pow += pow(vocab[a].cn, power);
125 i = 0;
126 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
127 for (a = 0; a < table_size; a++) {
128 table[a] = i;
129 if (a / (real) table_size > d1) {
130 i++;
131 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
132 }
133 if (i >= vocab_size)
134 i = vocab_size - 1;
135 }
136
137 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
138 for (a = 0; a < vocab_size; a++)
139 noise_distribution[a] = pow(vocab[a].cn, power)
140 / (real) train_words_pow;
141}
142
143// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
144void ReadWord(char *word, FILE *fin) {
145 int a = 0, ch;
146 while (!feof(fin)) {
147 ch = fgetc(fin);
148 if (ch == 13)
149 continue;
150 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
Marc Kupietz879333c2023-12-20 11:41:09 +0100151 if (ch == '\t' && expected_metadata_categories > 0) {
Marc Kupietz178a3c92023-12-22 15:12:27 +0100152 word[a] = 0;
153 a = 0;
154 expected_metadata_categories--;
155 if (debug_mode > 2)
156 printf("Metadata: %s\n", word);
157 memmove(word + 1, word, strlen(word) + 1); // shift right to prepend the marker; the ranges overlap, so strcpy would be undefined behavior
158 *word = METADATA_MARKER;
159 return;
Marc Kupietz879333c2023-12-20 11:41:09 +0100160 } else {
161 if (a > 0) {
162 if (ch == '\n') {
163 expected_metadata_categories = metadata_categories;
164 ungetc(ch, fin);
165 }
166 break;
167 }
168 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100169 if (ch == '\n') {
170 strcpy(word, (char *) "</s>");
Marc Kupietz879333c2023-12-20 11:41:09 +0100171 expected_metadata_categories = metadata_categories;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100172 return;
173 } else
174 continue;
175 }
176 word[a] = ch;
177 a++;
178 if (a >= MAX_STRING - 1)
179 a--; // Truncate too long words
180 }
181 word[a] = 0;
182}
183
184// Returns hash value of a word
185int GetWordHash(char *word) {
186 unsigned long long a, hash = 0;
187 for (a = 0; a < strlen(word); a++)
188 hash = hash * 257 + word[a];
189 hash = hash % vocab_hash_size;
190 return hash;
191}
192
193// Returns position of a word in the vocabulary; if the word is not found, returns -1
194int SearchVocab(char *word) {
195 unsigned int hash = GetWordHash(word);
196 while (1) {
197 if (vocab_hash[hash] == -1)
198 return -1;
199 if (!strcmp(word, vocab[vocab_hash[hash]].word))
200 return vocab_hash[hash];
201 hash = (hash + 1) % vocab_hash_size;
202 }
203 return -1;
204}
205
206// Reads a word and returns its index in the vocabulary
207int ReadWordIndex(FILE *fin) {
208 char word[MAX_STRING];
209 ReadWord(word, fin);
210 if (feof(fin))
211 return -1;
212 return SearchVocab(word);
213}
214
215// Adds a word to the vocabulary
216int AddWordToVocab(char *word) {
217 unsigned int hash, length = strlen(word) + 1;
218 if (length > MAX_STRING)
219 length = MAX_STRING;
220 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
221 strcpy(vocab[vocab_size].word, word);
222 vocab[vocab_size].cn = 0;
223 vocab_size++;
224 // Reallocate memory if needed
225 if (vocab_size + 2 >= vocab_max_size) {
226 vocab_max_size += 1000;
227 vocab = (struct vocab_word *) realloc(vocab,
228 vocab_max_size * sizeof(struct vocab_word));
229 }
230 hash = GetWordHash(word);
231 while (vocab_hash[hash] != -1)
232 hash = (hash + 1) % vocab_hash_size;
233 vocab_hash[hash] = vocab_size - 1;
234 return vocab_size - 1;
235}
236
237// Used later for sorting by word counts
238int VocabCompare(const void *a, const void *b) {
239 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
240}
241
242// Sorts the vocabulary by frequency using word counts
243void SortVocab() {
244 int a, size;
245 unsigned int hash;
246 // Sort the vocabulary and keep </s> at the first position
247 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
248 for (a = 0; a < vocab_hash_size; a++)
249 vocab_hash[a] = -1;
250 size = vocab_size;
251 train_words = 0;
252 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200253 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100254 // Words occurring less than min_count times will be discarded from the vocab
255 if ((vocab[a].cn < min_count) && (a != 0)) {
256 vocab_size--;
257 free(vocab[a].word);
258 } else {
259 // Hash will be re-computed, as after the sorting it is not actual
260 hash = GetWordHash(vocab[a].word);
261 while (vocab_hash[hash] != -1)
262 hash = (hash + 1) % vocab_hash_size;
263 vocab_hash[hash] = a;
264 train_words += vocab[a].cn;
265 }
266 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200267 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100268 vocab = (struct vocab_word *) realloc(vocab,
269 (vocab_size + 1) * sizeof(struct vocab_word));
270 // Allocate memory for the binary tree construction
271 for (a = 0; a < vocab_size; a++) {
272 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
273 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
274 }
275}
276
277// Reduces the vocabulary by removing infrequent tokens
278void ReduceVocab() {
279 int a, b = 0;
280 unsigned int hash;
281 for (a = 0; a < vocab_size; a++)
282 if (vocab[a].cn > min_reduce) {
283 vocab[b].cn = vocab[a].cn;
284 vocab[b].word = vocab[a].word;
285 b++;
286 } else
287 free(vocab[a].word);
288 vocab_size = b;
289 for (a = 0; a < vocab_hash_size; a++)
290 vocab_hash[a] = -1;
291 for (a = 0; a < vocab_size; a++) {
292 // Hash will be re-computed, as it is not actual
293 hash = GetWordHash(vocab[a].word);
294 while (vocab_hash[hash] != -1)
295 hash = (hash + 1) % vocab_hash_size;
296 vocab_hash[hash] = a;
297 }
298 fflush(stdout);
299 min_reduce++;
300}
301
302// Create binary Huffman tree using the word counts
303// Frequent words will have short unique binary codes
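// The construction relies on the vocabulary being sorted by descending count:
// pos1 walks left over the leaves while pos2 walks right over the internal nodes,
// which are created with non-decreasing counts, so the two smallest remaining
// nodes are always found in O(1) per merge step.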
304void CreateBinaryTree() {
305 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
306 char code[MAX_CODE_LENGTH];
307 long long *count = (long long *) calloc(vocab_size * 2 + 1,
308 sizeof(long long));
309 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
310 sizeof(long long));
311 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
312 sizeof(long long));
313 for (a = 0; a < vocab_size; a++)
314 count[a] = vocab[a].cn;
315 for (a = vocab_size; a < vocab_size * 2; a++)
316 count[a] = 1e15;
317 pos1 = vocab_size - 1;
318 pos2 = vocab_size;
319 // Following algorithm constructs the Huffman tree by adding one node at a time
320 for (a = 0; a < vocab_size - 1; a++) {
321 // First, find two smallest nodes 'min1, min2'
322 if (pos1 >= 0) {
323 if (count[pos1] < count[pos2]) {
324 min1i = pos1;
325 pos1--;
326 } else {
327 min1i = pos2;
328 pos2++;
329 }
330 } else {
331 min1i = pos2;
332 pos2++;
333 }
334 if (pos1 >= 0) {
335 if (count[pos1] < count[pos2]) {
336 min2i = pos1;
337 pos1--;
338 } else {
339 min2i = pos2;
340 pos2++;
341 }
342 } else {
343 min2i = pos2;
344 pos2++;
345 }
346 count[vocab_size + a] = count[min1i] + count[min2i];
347 parent_node[min1i] = vocab_size + a;
348 parent_node[min2i] = vocab_size + a;
349 binary[min2i] = 1;
350 }
351 // Now assign binary code to each vocabulary word
352 for (a = 0; a < vocab_size; a++) {
353 b = a;
354 i = 0;
355 while (1) {
356 code[i] = binary[b];
357 point[i] = b;
358 i++;
359 b = parent_node[b];
360 if (b == vocab_size * 2 - 2)
361 break;
362 }
363 vocab[a].codelen = i;
364 vocab[a].point[0] = vocab_size - 2;
365 for (b = 0; b < i; b++) {
366 vocab[a].code[i - b - 1] = code[b];
367 vocab[a].point[i - b] = point[b] - vocab_size;
368 }
369 }
370 free(count);
371 free(binary);
372 free(parent_node);
373}
374
375void LearnVocabFromTrainFile() {
376 char word[MAX_STRING];
377 FILE *fin;
378 long long a, i;
379 for (a = 0; a < vocab_hash_size; a++)
380 vocab_hash[a] = -1;
381 fin = fopen(train_file, "rb");
382 if (fin == NULL) {
383 printf("ERROR: training data file not found!\n");
384 exit(1);
385 }
386 vocab_size = 0;
387 AddWordToVocab((char *) "</s>");
Marc Kupietz879333c2023-12-20 11:41:09 +0100388 for (int j=0; j < metadata_categories; j++) {
389 ReadWord(word, fin);
390 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100391 while (1) {
392 ReadWord(word, fin);
393 if (feof(fin))
394 break;
395 train_words++;
396 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
397 printf("%lldK%c", train_words / 1000, 13);
398 fflush(stdout);
399 }
400 i = SearchVocab(word);
401 if (i == -1) {
402 a = AddWordToVocab(word);
403 vocab[a].cn = 1;
404 } else
405 vocab[i].cn++;
406 if (vocab_size > vocab_hash_size * 0.7)
407 ReduceVocab();
408 }
409 SortVocab();
410 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200411 printf("Vocab size: %'lld\n", vocab_size);
412 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100413 }
414 file_size = ftell(fin);
415 fclose(fin);
416}
417
418void SaveVocab() {
419 long long i;
420 FILE *fo = fopen(save_vocab_file, "wb");
421 for (i = 0; i < vocab_size; i++)
422 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
423 fclose(fo);
424}
425
426void ReadVocab() {
427 long long a, i = 0;
428 char c;
429 char word[MAX_STRING];
430 FILE *fin = fopen(read_vocab_file, "rb");
431 if (fin == NULL) {
432 printf("Vocabulary file not found\n");
433 exit(1);
434 }
435 for (a = 0; a < vocab_hash_size; a++)
436 vocab_hash[a] = -1;
437 vocab_size = 0;
438 while (1) {
439 ReadWord(word, fin);
440 if (feof(fin))
441 break;
442 a = AddWordToVocab(word);
443 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
444 i++;
445 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200446 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100447 fin = fopen(train_file, "rb");
448 if (fin == NULL) {
449 printf("ERROR: training data file not found!\n");
450 exit(1);
451 }
452 fseek(fin, 0, SEEK_END);
453 file_size = ftell(fin);
454 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200455 SortVocab();
456 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200457 printf("Vocab size: %'lld\n", vocab_size);
458 printf("Words in vocab's train file: %'lld\n", train_words);
459 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200460 }
Marc Kupietze23c5402016-07-14 11:10:09 +0200461 train_words = file_size / avgWordLength;
462 if(debug_mode > 0)
463 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100464}
465
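// Reads negative_classes_file as whitespace-separated records of the form
// <class> <word> <extra token> (the third ReadWord apparently just consumes a
// trailing count or end-of-line marker) and builds one negative-sampling table per
// class, analogous to InitUnigramTable but restricted to that class's words.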
466void InitClassUnigramTable() {
467 long long a, c;
468 printf("loading class unigrams \n");
469 FILE *fin = fopen(negative_classes_file, "rb");
470 if (fin == NULL) {
471 printf("ERROR: class file not found!\n");
472 exit(1);
473 }
474 word_to_group = (int *) malloc(vocab_size * sizeof(int));
475 for (a = 0; a < vocab_size; a++)
476 word_to_group[a] = -1;
477 char class[MAX_STRING];
478 char prev_class[MAX_STRING];
479 prev_class[0] = 0;
480 char word[MAX_STRING];
481 class_number = -1;
482 while (1) {
483 if (feof(fin))
484 break;
485 ReadWord(class, fin);
486 ReadWord(word, fin);
487 int word_index = SearchVocab(word);
488 if (word_index != -1) {
489 if (strcmp(class, prev_class) != 0) {
490 class_number++;
491 strcpy(prev_class, class);
492 }
493 word_to_group[word_index] = class_number;
494 }
495 ReadWord(word, fin);
496 }
497 class_number++;
498 fclose(fin);
499
500 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
501 long long train_words_pow = 0;
502 real d1, power = 0.75;
503
504 for (c = 0; c < class_number; c++) {
505 long long offset = c * table_size;
506 train_words_pow = 0;
507 for (a = 0; a < vocab_size; a++)
508 if (word_to_group[a] == c)
509 train_words_pow += pow(vocab[a].cn, power);
510 int i = 0;
511 while (word_to_group[i] != c && i < vocab_size)
512 i++;
513 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
514 for (a = 0; a < table_size; a++) {
515 //printf("index %lld , word %d\n", a, i);
516 group_to_table[offset + a] = i;
517 if (a / (real) table_size > d1) {
518 i++;
519 while (word_to_group[i] != c && i < vocab_size)
520 i++;
521 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
522 }
523 if (i >= vocab_size)
524 while (word_to_group[i] != c && i >= 0)
525 i--;
526 }
527 }
528}
529
Marc Kupietz61485ad2023-12-22 16:16:59 +0100530void SaveArgs(unsigned int argc, char **argv) {
Marc Kupietz210b9d52016-04-02 21:48:13 +0200531 unsigned int i;
Marc Kupietz44136742017-12-22 17:52:56 +0100532 char args_file[MAX_STRING];
533 strcpy(args_file, output_file);
Marc Kupietz210b9d52016-04-02 21:48:13 +0200534 strcat(args_file, ".args");
535 FILE *fargs = fopen(args_file, "w");
536 if (fargs == NULL) {
537 printf("Cannot save args to %s.\n", args_file);
538 return;
539 }
540
Marc Kupietz44136742017-12-22 17:52:56 +0100541 for(i=1; i<argc; i++)
542 fprintf(fargs, "%s ", argv[i]);
543
544 fprintf(fargs, "\n");
Marc Kupietz210b9d52016-04-02 21:48:13 +0200545 fclose(fargs);
Marc Kupietz44136742017-12-22 17:52:56 +0100546
Marc Kupietz210b9d52016-04-02 21:48:13 +0200547 return;
548}
549
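// Net snapshot layout (only type 3 with negative sampling is supported): the raw
// syn0 matrix (vocab_size * layer1_size reals) followed by the raw syn1neg_window
// matrix (vocab_size * window_layer_size reals). InitNet() expects exactly this
// layout when a net file is read back via read_net_file.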
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100550void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100551 if(type != 3 || negative <= 0) {
552 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
553 return;
554 }
555
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100556 FILE *fnet = fopen(save_net_file, "wb");
557 if (fnet == NULL) {
558 printf("Net parameter file not found\n");
559 exit(1);
560 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100561 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100562 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100563 fclose(fnet);
564}
565
566void InitNet() {
567 long long a, b;
568 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100569 long long read;
570
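// For the position-sensitive architectures (types 2-4) each output vector consists
// of 2*window slices of layer1_size values, one per relative context position;
// window_offset later selects the slice belonging to the current position.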
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100571 window_layer_size = layer1_size * window * 2;
572 a = posix_memalign((void **) &syn0, 128,
573 (long long) vocab_size * layer1_size * sizeof(real));
574 if (syn0 == NULL) {
575 printf("Memory allocation failed\n");
576 exit(1);
577 }
578
579 if (hs) {
580 a = posix_memalign((void **) &syn1, 128,
581 (long long) vocab_size * layer1_size * sizeof(real));
582 if (syn1 == NULL) {
583 printf("Memory allocation failed\n");
584 exit(1);
585 }
586 a = posix_memalign((void **) &syn1_window, 128,
587 (long long) vocab_size * window_layer_size * sizeof(real));
588 if (syn1_window == NULL) {
589 printf("Memory allocation failed\n");
590 exit(1);
591 }
592 a = posix_memalign((void **) &syn_hidden_word, 128,
593 (long long) vocab_size * window_hidden_size * sizeof(real));
594 if (syn_hidden_word == NULL) {
595 printf("Memory allocation failed\n");
596 exit(1);
597 }
598
599 for (a = 0; a < vocab_size; a++)
600 for (b = 0; b < layer1_size; b++)
601 syn1[a * layer1_size + b] = 0;
602 for (a = 0; a < vocab_size; a++)
603 for (b = 0; b < window_layer_size; b++)
604 syn1_window[a * window_layer_size + b] = 0;
605 for (a = 0; a < vocab_size; a++)
606 for (b = 0; b < window_hidden_size; b++)
607 syn_hidden_word[a * window_hidden_size + b] = 0;
608 }
609 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100610 if(type == 0) {
611 a = posix_memalign((void **) &syn1neg, 128,
612 (long long) vocab_size * layer1_size * sizeof(real));
613 if (syn1neg == NULL) {
614 printf("Memory allocation failed\n");
615 exit(1);
616 }
617 for (a = 0; a < vocab_size; a++)
618 for (b = 0; b < layer1_size; b++)
619 syn1neg[a * layer1_size + b] = 0;
620 } else if (type == 3) {
621 a = posix_memalign((void **) &syn1neg_window, 128,
622 (long long) vocab_size * window_layer_size * sizeof(real));
623 if (syn1neg_window == NULL) {
624 printf("Memory allocation failed\n");
625 exit(1);
626 }
627 for (a = 0; a < vocab_size; a++)
628 for (b = 0; b < window_layer_size; b++)
629 syn1neg_window[a * window_layer_size + b] = 0;
630 } else if (type == 4) {
631 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
632 (long long) vocab_size * window_hidden_size * sizeof(real));
633 if (syn_hidden_word_neg == NULL) {
634 printf("Memory allocation failed\n");
635 exit(1);
636 }
637 for (a = 0; a < vocab_size; a++)
638 for (b = 0; b < window_hidden_size; b++)
639 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100640 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100641 }
642 if (nce > 0) {
643 a = posix_memalign((void **) &syn1nce, 128,
644 (long long) vocab_size * layer1_size * sizeof(real));
645 if (syn1nce == NULL) {
646 printf("Memory allocation failed\n");
647 exit(1);
648 }
649 a = posix_memalign((void **) &syn1nce_window, 128,
650 (long long) vocab_size * window_layer_size * sizeof(real));
651 if (syn1nce_window == NULL) {
652 printf("Memory allocation failed\n");
653 exit(1);
654 }
655 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
656 (long long) vocab_size * window_hidden_size * sizeof(real));
657 if (syn_hidden_word_nce == NULL) {
658 printf("Memory allocation failed\n");
659 exit(1);
660 }
661
662 for (a = 0; a < vocab_size; a++)
663 for (b = 0; b < layer1_size; b++)
664 syn1nce[a * layer1_size + b] = 0;
665 for (a = 0; a < vocab_size; a++)
666 for (b = 0; b < window_layer_size; b++)
667 syn1nce_window[a * window_layer_size + b] = 0;
668 for (a = 0; a < vocab_size; a++)
669 for (b = 0; b < window_hidden_size; b++)
670 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
671 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100672
Marc Kupietz1006a272016-03-16 15:50:20 +0100673 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100674 a = posix_memalign((void **) &syn_window_hidden, 128,
675 window_hidden_size * window_layer_size * sizeof(real));
676 if (syn_window_hidden == NULL) {
677 printf("Memory allocation failed\n");
678 exit(1);
679 }
680 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
681 next_random = next_random * (unsigned long long) 25214903917 + 11;
682 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
683 - 0.5) / (window_hidden_size * window_layer_size);
684 }
685 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100686
687 if (read_net_file[0] == 0) {
688 for (a = 0; a < vocab_size; a++)
689 for (b = 0; b < layer1_size; b++) {
690 next_random = next_random * (unsigned long long) 25214903917
691 + 11;
692 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
693 / (real) 65536) - 0.5) / layer1_size;
694 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100695 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100696 FILE *fnet = fopen(read_net_file, "rb");
697 if (fnet == NULL) {
698 printf("Net parameter file not found\n");
699 exit(1);
700 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100701 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
702 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
703 if(read != vocab_size * layer1_size) {
704 fprintf(stderr, "read-net failed %lld\n", read);
705 exit(-1);
706 }
707 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
708 if(read != (long long) vocab_size * window_layer_size) {
709 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
710 (long long) sizeof(real) * vocab_size * window_layer_size);
711 exit(-1);
712 }
713 fgetc(fnet);
714 if(!feof(fnet)) {
715 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
716 exit(-1);
717 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100718 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100719 } else {
720 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
721 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100722 }
723
724 CreateBinaryTree();
725}
726
Marc Kupietz202723e2016-07-14 09:12:00 +0200727char *currentDateTime(char *buf, real offset) {
728 time_t t;
729 time(&t);
730 t += (long) offset;
731 struct tm tstruct;
732 tstruct = *localtime(&t);
733 strftime(buf, 80, "%c", &tstruct);
734 return buf;
735}
736
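// Progress monitor: once per second it sums, over all threads, the bytes consumed
// in the current iteration plus whole file shares for completed iterations, and
// derives the completion ratio, throughput and ETA relative to file_size * iter.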
737void *MonitorThread(void *id) {
738 char *timebuf = malloc(80);
739 int i, n=num_threads;
740 long long sum;
741 sleep(1);
742 while(n > 0) {
743 sleep(1);
744 sum = n = 0;
745 for(i=0; i < num_threads; i++) {
746 if(threadPos[i] >= 0) {
747 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
748 n++;
749 } else {
750 sum += iter * file_size / num_threads;
751 }
752 }
753 if(n == 0)
754 break;
755 real finished_portion = (real) sum / (float) (file_size * iter);
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100756 long long now = time(NULL);
757 long long elapsed = (now - start);
758 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
Marc Kupietz202723e2016-07-14 09:12:00 +0200759
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100760 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
Marc Kupietz202723e2016-07-14 09:12:00 +0200761 alpha,
762 finished_portion * 100,
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100763 (float) sum / elapsed / 1000,
Marc Kupietz202723e2016-07-14 09:12:00 +0200764 elapsed,
765 ttg,
766 currentDateTime(timebuf, ttg)
767 );
768 fflush(stdout);
769 }
770 pthread_exit(NULL);
771}
772
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100773void *TrainModelThread(void *id) {
774 long long a, b, d, cw, word, last_word, sentence_length = 0,
775 sentence_position = 0;
776 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
777 long long l1, l2, c, target, label, local_iter = iter;
778 unsigned long long next_random = (long long) id;
779 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100780 int input_len_1 = layer1_size;
781 int window_offset = -1;
782 if (type == 2 || type == 4) {
783 input_len_1 = window_layer_size;
784 }
785 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
786 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200787 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100788
789 int input_len_2 = 0;
790 if (type == 4) {
791 input_len_2 = window_hidden_size;
792 }
793 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
794 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
795
796 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200797 long long start_pos = file_size / (long long) num_threads * (long long) id;
798 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
799 long long current_pos = start_pos;
800 long long last_pos = start_pos;
801 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100802 while (1) {
Marc Kupietz202723e2016-07-14 09:12:00 +0200803 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100804 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200805 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100806 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100807 alpha = starting_alpha
808 * (1 - word_count_actual / (real) (iter * train_words + 1));
809 if (alpha < starting_alpha * 0.0001)
810 alpha = starting_alpha * 0.0001;
811 }
812 if (sentence_length == 0) {
813 while (1) {
814 word = ReadWordIndex(fi);
815 if (feof(fi))
816 break;
817 if (word == -1)
818 continue;
819 word_count++;
820 if (word == 0)
821 break;
822 // The subsampling randomly discards frequent words while keeping the ranking same
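				// A word with relative frequency f = cn/train_words is kept with
				// probability ran = sqrt(sample/f) + sample/f (the value computed below).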
823 if (sample > 0) {
824 real ran = (sqrt(vocab[word].cn / (sample * train_words))
825 + 1) * (sample * train_words) / vocab[word].cn;
826 next_random = next_random * (unsigned long long) 25214903917
827 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100828 if (ran < (next_random & 0xFFFF) / (real) 65536) {
829 if(type == 3) // in structured skipgrams
830 word = -2; // keep the window position correct
831 else
832 continue;
833 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100834 }
835 sen[sentence_length] = word;
836 sentence_length++;
837 if (sentence_length >= MAX_SENTENCE_LENGTH)
838 break;
839 }
840 sentence_position = 0;
841 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200842 current_pos = threadPos[(long) id] = ftell(fi);
843 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100844 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200845 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100846 local_iter--;
847 if (local_iter == 0)
848 break;
Marc Kupietze423f732017-12-22 17:57:03 +0100849 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
850 printf("Magic stop file %s found. Stopping training ...\n", magic_stop_file);
851 break;
852 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100853 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200854 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100855 last_word_count = 0;
856 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200857 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100858 continue;
859 }
860 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200861 while (word == -2 && sentence_position<sentence_length)
862 word = sen[++sentence_position];
863 if (sentence_position>=sentence_length) {
864 sentence_length=0;
865 continue;
866 }
867 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100868 continue;
869 for (c = 0; c < input_len_1; c++)
870 neu1[c] = 0;
871 for (c = 0; c < input_len_1; c++)
872 neu1e[c] = 0;
873 for (c = 0; c < input_len_2; c++)
874 neu2[c] = 0;
875 for (c = 0; c < input_len_2; c++)
876 neu2e[c] = 0;
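		// next_random is the same linear congruential generator as in the original
		// word2vec (multiplier 25214903917, increment 11); b randomly narrows the
		// effective context window for this position (the loops below that start at a = b).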
877 next_random = next_random * (unsigned long long) 25214903917 + 11;
878 b = next_random % window;
879 if (type == 0) { //train the cbow architecture
880 // in -> hidden
881 cw = 0;
882 for (a = b; a < window * 2 + 1 - b; a++)
883 if (a != window) {
884 c = sentence_position - window + a;
885 if (c < 0)
886 continue;
887 if (c >= sentence_length)
888 continue;
889 last_word = sen[c];
890 if (last_word == -1)
891 continue;
892 for (c = 0; c < layer1_size; c++)
893 neu1[c] += syn0[c + last_word * layer1_size];
894 cw++;
895 }
896 if (cw) {
897 for (c = 0; c < layer1_size; c++)
898 neu1[c] /= cw;
899 if (hs)
900 for (d = 0; d < vocab[word].codelen; d++) {
901 f = 0;
902 l2 = vocab[word].point[d] * layer1_size;
903 // Propagate hidden -> output
904 for (c = 0; c < layer1_size; c++)
905 f += neu1[c] * syn1[c + l2];
906 if (f <= -MAX_EXP)
907 continue;
908 else if (f >= MAX_EXP)
909 continue;
910 else
911 f = expTable[(int) ((f + MAX_EXP)
912 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
913 // 'g' is the gradient multiplied by the learning rate
914 g = (1 - vocab[word].code[d] - f) * alpha;
915 // Propagate errors output -> hidden
916 for (c = 0; c < layer1_size; c++)
917 neu1e[c] += g * syn1[c + l2];
918 // Learn weights hidden -> output
919 for (c = 0; c < layer1_size; c++)
920 syn1[c + l2] += g * neu1[c];
921 if (cap == 1)
922 for (c = 0; c < layer1_size; c++)
923 capParam(syn1, c + l2);
924 }
925 // NEGATIVE SAMPLING
926 if (negative > 0)
927 for (d = 0; d < negative + 1; d++) {
928 if (d == 0) {
929 target = word;
930 label = 1;
931 } else {
932 next_random = next_random
933 * (unsigned long long) 25214903917 + 11;
934 if (word_to_group != NULL
935 && word_to_group[word] != -1) {
936 target = word;
937 while (target == word) {
938 target = group_to_table[word_to_group[word]
939 * table_size
940 + (next_random >> 16) % table_size];
941 next_random = next_random
942 * (unsigned long long) 25214903917
943 + 11;
944 }
945 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
946 } else {
947 target =
948 table[(next_random >> 16) % table_size];
949 }
950 if (target == 0)
951 target = next_random % (vocab_size - 1) + 1;
952 if (target == word)
953 continue;
954 label = 0;
955 }
956 l2 = target * layer1_size;
957 f = 0;
958 for (c = 0; c < layer1_size; c++)
959 f += neu1[c] * syn1neg[c + l2];
960 if (f > MAX_EXP)
961 g = (label - 1) * alpha;
962 else if (f < -MAX_EXP)
963 g = (label - 0) * alpha;
964 else
965 g = (label
966 - expTable[(int) ((f + MAX_EXP)
967 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
968 * alpha;
969 for (c = 0; c < layer1_size; c++)
970 neu1e[c] += g * syn1neg[c + l2];
971 for (c = 0; c < layer1_size; c++)
972 syn1neg[c + l2] += g * neu1[c];
973 if (cap == 1)
974 for (c = 0; c < layer1_size; c++)
975 capParam(syn1neg, c + l2);
976 }
977 // Noise Contrastive Estimation
978 if (nce > 0)
979 for (d = 0; d < nce + 1; d++) {
980 if (d == 0) {
981 target = word;
982 label = 1;
983 } else {
984 next_random = next_random
985 * (unsigned long long) 25214903917 + 11;
986 if (word_to_group != NULL
987 && word_to_group[word] != -1) {
988 target = word;
989 while (target == word) {
990 target = group_to_table[word_to_group[word]
991 * table_size
992 + (next_random >> 16) % table_size];
993 next_random = next_random
994 * (unsigned long long) 25214903917
995 + 11;
996 }
997 } else {
998 target =
999 table[(next_random >> 16) % table_size];
1000 }
1001 if (target == 0)
1002 target = next_random % (vocab_size - 1) + 1;
1003 if (target == word)
1004 continue;
1005 label = 0;
1006 }
1007 l2 = target * layer1_size;
1008 f = 0;
1009
1010 for (c = 0; c < layer1_size; c++)
1011 f += neu1[c] * syn1nce[c + l2];
1012 if (f > MAX_EXP)
1013 g = (label - 1) * alpha;
1014 else if (f < -MAX_EXP)
1015 g = (label - 0) * alpha;
1016 else {
1017 f = exp(f);
1018 g =
1019 (label
1020 - f
1021 / (noise_distribution[target]
1022 * nce + f)) * alpha;
1023 }
1024 for (c = 0; c < layer1_size; c++)
1025 neu1e[c] += g * syn1nce[c + l2];
1026 for (c = 0; c < layer1_size; c++)
1027 syn1nce[c + l2] += g * neu1[c];
1028 if (cap == 1)
1029 for (c = 0; c < layer1_size; c++)
1030 capParam(syn1nce, c + l2);
1031 }
1032 // hidden -> in
1033 for (a = b; a < window * 2 + 1 - b; a++)
1034 if (a != window) {
1035 c = sentence_position - window + a;
1036 if (c < 0)
1037 continue;
1038 if (c >= sentence_length)
1039 continue;
1040 last_word = sen[c];
1041 if (last_word == -1)
1042 continue;
1043 for (c = 0; c < layer1_size; c++)
1044 syn0[c + last_word * layer1_size] += neu1e[c];
1045 }
1046 }
1047 } else if (type == 1) { //train skip-gram
1048 for (a = b; a < window * 2 + 1 - b; a++)
1049 if (a != window) {
1050 c = sentence_position - window + a;
1051 if (c < 0)
1052 continue;
1053 if (c >= sentence_length)
1054 continue;
1055 last_word = sen[c];
1056 if (last_word == -1)
1057 continue;
1058 l1 = last_word * layer1_size;
1059 for (c = 0; c < layer1_size; c++)
1060 neu1e[c] = 0;
1061 // HIERARCHICAL SOFTMAX
1062 if (hs)
1063 for (d = 0; d < vocab[word].codelen; d++) {
1064 f = 0;
1065 l2 = vocab[word].point[d] * layer1_size;
1066 // Propagate hidden -> output
1067 for (c = 0; c < layer1_size; c++)
1068 f += syn0[c + l1] * syn1[c + l2];
1069 if (f <= -MAX_EXP)
1070 continue;
1071 else if (f >= MAX_EXP)
1072 continue;
1073 else
1074 f = expTable[(int) ((f + MAX_EXP)
1075 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1076 // 'g' is the gradient multiplied by the learning rate
1077 g = (1 - vocab[word].code[d] - f) * alpha;
1078 // Propagate errors output -> hidden
1079 for (c = 0; c < layer1_size; c++)
1080 neu1e[c] += g * syn1[c + l2];
1081 // Learn weights hidden -> output
1082 for (c = 0; c < layer1_size; c++)
1083 syn1[c + l2] += g * syn0[c + l1];
1084 if (cap == 1)
1085 for (c = 0; c < layer1_size; c++)
1086 capParam(syn1, c + l2);
1087 }
1088 // NEGATIVE SAMPLING
1089 if (negative > 0)
1090 for (d = 0; d < negative + 1; d++) {
1091 if (d == 0) {
1092 target = word;
1093 label = 1;
1094 } else {
1095 next_random = next_random
1096 * (unsigned long long) 25214903917 + 11;
1097 if (word_to_group != NULL
1098 && word_to_group[word] != -1) {
1099 target = word;
1100 while (target == word) {
1101 target =
1102 group_to_table[word_to_group[word]
1103 * table_size
1104 + (next_random >> 16)
1105 % table_size];
1106 next_random =
1107 next_random
1108 * (unsigned long long) 25214903917
1109 + 11;
1110 }
1111 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1112 } else {
1113 target = table[(next_random >> 16)
1114 % table_size];
1115 }
1116 if (target == 0)
1117 target = next_random % (vocab_size - 1) + 1;
1118 if (target == word)
1119 continue;
1120 label = 0;
1121 }
1122 l2 = target * layer1_size;
1123 f = 0;
1124 for (c = 0; c < layer1_size; c++)
1125 f += syn0[c + l1] * syn1neg[c + l2];
1126 if (f > MAX_EXP)
1127 g = (label - 1) * alpha;
1128 else if (f < -MAX_EXP)
1129 g = (label - 0) * alpha;
1130 else
1131 g =
1132 (label
1133 - expTable[(int) ((f + MAX_EXP)
1134 * (EXP_TABLE_SIZE
1135 / MAX_EXP / 2))])
1136 * alpha;
1137 for (c = 0; c < layer1_size; c++)
1138 neu1e[c] += g * syn1neg[c + l2];
1139 for (c = 0; c < layer1_size; c++)
1140 syn1neg[c + l2] += g * syn0[c + l1];
1141 if (cap == 1)
1142 for (c = 0; c < layer1_size; c++)
1143 capParam(syn1neg, c + l2);
1144 }
1145 //Noise Contrastive Estimation
1146 if (nce > 0)
1147 for (d = 0; d < nce + 1; d++) {
1148 if (d == 0) {
1149 target = word;
1150 label = 1;
1151 } else {
1152 next_random = next_random
1153 * (unsigned long long) 25214903917 + 11;
1154 if (word_to_group != NULL
1155 && word_to_group[word] != -1) {
1156 target = word;
1157 while (target == word) {
1158 target =
1159 group_to_table[word_to_group[word]
1160 * table_size
1161 + (next_random >> 16)
1162 % table_size];
1163 next_random =
1164 next_random
1165 * (unsigned long long) 25214903917
1166 + 11;
1167 }
1168 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1169 } else {
1170 target = table[(next_random >> 16)
1171 % table_size];
1172 }
1173 if (target == 0)
1174 target = next_random % (vocab_size - 1) + 1;
1175 if (target == word)
1176 continue;
1177 label = 0;
1178 }
1179 l2 = target * layer1_size;
1180 f = 0;
1181 for (c = 0; c < layer1_size; c++)
1182 f += syn0[c + l1] * syn1nce[c + l2];
1183 if (f > MAX_EXP)
1184 g = (label - 1) * alpha;
1185 else if (f < -MAX_EXP)
1186 g = (label - 0) * alpha;
1187 else {
1188 f = exp(f);
1189 g = (label
1190 - f
1191 / (noise_distribution[target]
1192 * nce + f)) * alpha;
1193 }
1194 for (c = 0; c < layer1_size; c++)
1195 neu1e[c] += g * syn1nce[c + l2];
1196 for (c = 0; c < layer1_size; c++)
1197 syn1nce[c + l2] += g * syn0[c + l1];
1198 if (cap == 1)
1199 for (c = 0; c < layer1_size; c++)
1200 capParam(syn1nce, c + l2);
1201 }
1202 // Learn weights input -> hidden
1203 for (c = 0; c < layer1_size; c++)
1204 syn0[c + l1] += neu1e[c];
1205 }
1206 } else if (type == 2) { //train the cwindow architecture
1207 // in -> hidden
1208 cw = 0;
1209 for (a = 0; a < window * 2 + 1; a++)
1210 if (a != window) {
1211 c = sentence_position - window + a;
1212 if (c < 0)
1213 continue;
1214 if (c >= sentence_length)
1215 continue;
1216 last_word = sen[c];
1217 if (last_word == -1)
1218 continue;
1219 window_offset = a * layer1_size;
1220 if (a > window)
1221 window_offset -= layer1_size;
1222 for (c = 0; c < layer1_size; c++)
1223 neu1[c + window_offset] += syn0[c
1224 + last_word * layer1_size];
1225 cw++;
1226 }
1227 if (cw) {
1228 if (hs)
1229 for (d = 0; d < vocab[word].codelen; d++) {
1230 f = 0;
1231 l2 = vocab[word].point[d] * window_layer_size;
1232 // Propagate hidden -> output
1233 for (c = 0; c < window_layer_size; c++)
1234 f += neu1[c] * syn1_window[c + l2];
1235 if (f <= -MAX_EXP)
1236 continue;
1237 else if (f >= MAX_EXP)
1238 continue;
1239 else
1240 f = expTable[(int) ((f + MAX_EXP)
1241 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1242 // 'g' is the gradient multiplied by the learning rate
1243 g = (1 - vocab[word].code[d] - f) * alpha;
1244 // Propagate errors output -> hidden
1245 for (c = 0; c < window_layer_size; c++)
1246 neu1e[c] += g * syn1_window[c + l2];
1247 // Learn weights hidden -> output
1248 for (c = 0; c < window_layer_size; c++)
1249 syn1_window[c + l2] += g * neu1[c];
1250 if (cap == 1)
1251 for (c = 0; c < window_layer_size; c++)
1252 capParam(syn1_window, c + l2);
1253 }
1254 // NEGATIVE SAMPLING
1255 if (negative > 0)
1256 for (d = 0; d < negative + 1; d++) {
1257 if (d == 0) {
1258 target = word;
1259 label = 1;
1260 } else {
1261 next_random = next_random
1262 * (unsigned long long) 25214903917 + 11;
1263 if (word_to_group != NULL
1264 && word_to_group[word] != -1) {
1265 target = word;
1266 while (target == word) {
1267 target = group_to_table[word_to_group[word]
1268 * table_size
1269 + (next_random >> 16) % table_size];
1270 next_random = next_random
1271 * (unsigned long long) 25214903917
1272 + 11;
1273 }
1274 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1275 } else {
1276 target =
1277 table[(next_random >> 16) % table_size];
1278 }
1279 if (target == 0)
1280 target = next_random % (vocab_size - 1) + 1;
1281 if (target == word)
1282 continue;
1283 label = 0;
1284 }
1285 l2 = target * window_layer_size;
1286 f = 0;
1287 for (c = 0; c < window_layer_size; c++)
1288 f += neu1[c] * syn1neg_window[c + l2];
1289 if (f > MAX_EXP)
1290 g = (label - 1) * alpha;
1291 else if (f < -MAX_EXP)
1292 g = (label - 0) * alpha;
1293 else
1294 g = (label
1295 - expTable[(int) ((f + MAX_EXP)
1296 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1297 * alpha;
1298 for (c = 0; c < window_layer_size; c++)
1299 neu1e[c] += g * syn1neg_window[c + l2];
1300 for (c = 0; c < window_layer_size; c++)
1301 syn1neg_window[c + l2] += g * neu1[c];
1302 if (cap == 1)
1303 for (c = 0; c < window_layer_size; c++)
1304 capParam(syn1neg_window, c + l2);
1305 }
1306 // Noise Contrastive Estimation
1307 if (nce > 0)
1308 for (d = 0; d < nce + 1; d++) {
1309 if (d == 0) {
1310 target = word;
1311 label = 1;
1312 } else {
1313 next_random = next_random
1314 * (unsigned long long) 25214903917 + 11;
1315 if (word_to_group != NULL
1316 && word_to_group[word] != -1) {
1317 target = word;
1318 while (target == word) {
1319 target = group_to_table[word_to_group[word]
1320 * table_size
1321 + (next_random >> 16) % table_size];
1322 next_random = next_random
1323 * (unsigned long long) 25214903917
1324 + 11;
1325 }
1326 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1327 } else {
1328 target =
1329 table[(next_random >> 16) % table_size];
1330 }
1331 if (target == 0)
1332 target = next_random % (vocab_size - 1) + 1;
1333 if (target == word)
1334 continue;
1335 label = 0;
1336 }
1337 l2 = target * window_layer_size;
1338 f = 0;
1339 for (c = 0; c < window_layer_size; c++)
1340 f += neu1[c] * syn1nce_window[c + l2];
1341 if (f > MAX_EXP)
1342 g = (label - 1) * alpha;
1343 else if (f < -MAX_EXP)
1344 g = (label - 0) * alpha;
1345 else {
1346 f = exp(f);
1347 g =
1348 (label
1349 - f
1350 / (noise_distribution[target]
1351 * nce + f)) * alpha;
1352 }
1353 for (c = 0; c < window_layer_size; c++)
1354 neu1e[c] += g * syn1nce_window[c + l2];
1355 for (c = 0; c < window_layer_size; c++)
1356 syn1nce_window[c + l2] += g * neu1[c];
1357 if (cap == 1)
1358 for (c = 0; c < window_layer_size; c++)
1359 capParam(syn1nce_window, c + l2);
1360 }
1361 // hidden -> in
1362 for (a = 0; a < window * 2 + 1; a++)
1363 if (a != window) {
1364 c = sentence_position - window + a;
1365 if (c < 0)
1366 continue;
1367 if (c >= sentence_length)
1368 continue;
1369 last_word = sen[c];
1370 if (last_word == -1)
1371 continue;
1372 window_offset = a * layer1_size;
1373 if (a > window)
1374 window_offset -= layer1_size;
1375 for (c = 0; c < layer1_size; c++)
1376 syn0[c + last_word * layer1_size] += neu1e[c
1377 + window_offset];
1378 }
1379 }
1380 } else if (type == 3) { //train structured skip-gram
1381 for (a = 0; a < window * 2 + 1; a++)
1382 if (a != window) {
1383 c = sentence_position - window + a;
1384 if (c < 0)
1385 continue;
1386 if (c >= sentence_length)
1387 continue;
1388 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001389 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001390 continue;
1391 l1 = last_word * layer1_size;
1392 window_offset = a * layer1_size;
1393 if (a > window)
1394 window_offset -= layer1_size;
1395 for (c = 0; c < layer1_size; c++)
1396 neu1e[c] = 0;
1397 // HIERARCHICAL SOFTMAX
1398 if (hs)
1399 for (d = 0; d < vocab[word].codelen; d++) {
1400 f = 0;
1401 l2 = vocab[word].point[d] * window_layer_size;
1402 // Propagate hidden -> output
1403 for (c = 0; c < layer1_size; c++)
1404 f += syn0[c + l1]
1405 * syn1_window[c + l2 + window_offset];
1406 if (f <= -MAX_EXP)
1407 continue;
1408 else if (f >= MAX_EXP)
1409 continue;
1410 else
1411 f = expTable[(int) ((f + MAX_EXP)
1412 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1413 // 'g' is the gradient multiplied by the learning rate
1414 g = (1 - vocab[word].code[d] - f) * alpha;
1415 // Propagate errors output -> hidden
1416 for (c = 0; c < layer1_size; c++)
1417 neu1e[c] += g
1418 * syn1_window[c + l2 + window_offset];
1419 // Learn weights hidden -> output
1420 for (c = 0; c < layer1_size; c++)
1421 syn1[c + l2 + window_offset] += g
1422 * syn0[c + l1];
1423 if (cap == 1)
1424 for (c = 0; c < layer1_size; c++)
1425 capParam(syn1, c + l2 + window_offset);
1426 }
1427 // NEGATIVE SAMPLING
1428 if (negative > 0)
1429 for (d = 0; d < negative + 1; d++) {
1430 if (d == 0) {
1431 target = word;
1432 label = 1;
1433 } else {
1434 next_random = next_random
1435 * (unsigned long long) 25214903917 + 11;
1436 if (word_to_group != NULL
1437 && word_to_group[word] != -1) {
1438 target = word;
1439 while (target == word) {
1440 target =
1441 group_to_table[word_to_group[word]
1442 * table_size
1443 + (next_random >> 16)
1444 % table_size];
1445 next_random =
1446 next_random
1447 * (unsigned long long) 25214903917
1448 + 11;
1449 }
1450 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1451 } else {
1452 target = table[(next_random >> 16)
1453 % table_size];
1454 }
1455 if (target == 0)
1456 target = next_random % (vocab_size - 1) + 1;
1457 if (target == word)
1458 continue;
1459 label = 0;
1460 }
1461 l2 = target * window_layer_size;
1462 f = 0;
1463 for (c = 0; c < layer1_size; c++)
1464 f +=
1465 syn0[c + l1]
1466 * syn1neg_window[c + l2
1467 + window_offset];
1468 if (f > MAX_EXP)
1469 g = (label - 1) * alpha;
1470 else if (f < -MAX_EXP)
1471 g = (label - 0) * alpha;
1472 else
1473 g =
1474 (label
1475 - expTable[(int) ((f + MAX_EXP)
1476 * (EXP_TABLE_SIZE
1477 / MAX_EXP / 2))])
1478 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001479 if(debug_mode > 2 && ((long long) id) == 0) {
1480 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1481 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1482 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001483 for (c = 0; c < layer1_size; c++)
1484 neu1e[c] +=
1485 g
1486 * syn1neg_window[c + l2
1487 + window_offset];
1488 for (c = 0; c < layer1_size; c++)
1489 syn1neg_window[c + l2 + window_offset] += g
1490 * syn0[c + l1];
1491 if (cap == 1)
1492 for (c = 0; c < layer1_size; c++)
1493 capParam(syn1neg_window,
1494 c + l2 + window_offset);
1495 }
1496 // Noise Contrastive Estimation
1497 if (nce > 0)
1498 for (d = 0; d < nce + 1; d++) {
1499 if (d == 0) {
1500 target = word;
1501 label = 1;
1502 } else {
1503 next_random = next_random
1504 * (unsigned long long) 25214903917 + 11;
1505 if (word_to_group != NULL
1506 && word_to_group[word] != -1) {
1507 target = word;
1508 while (target == word) {
1509 target =
1510 group_to_table[word_to_group[word]
1511 * table_size
1512 + (next_random >> 16)
1513 % table_size];
1514 next_random =
1515 next_random
1516 * (unsigned long long) 25214903917
1517 + 11;
1518 }
1519 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1520 } else {
1521 target = table[(next_random >> 16)
1522 % table_size];
1523 }
1524 if (target == 0)
1525 target = next_random % (vocab_size - 1) + 1;
1526 if (target == word)
1527 continue;
1528 label = 0;
1529 }
1530 l2 = target * window_layer_size;
1531 f = 0;
1532 for (c = 0; c < layer1_size; c++)
1533 f +=
1534 syn0[c + l1]
1535 * syn1nce_window[c + l2
1536 + window_offset];
1537 if (f > MAX_EXP)
1538 g = (label - 1) * alpha;
1539 else if (f < -MAX_EXP)
1540 g = (label - 0) * alpha;
1541 else {
1542 f = exp(f);
1543 g = (label
1544 - f
1545 / (noise_distribution[target]
1546 * nce + f)) * alpha;
1547 }
1548 for (c = 0; c < layer1_size; c++)
1549 neu1e[c] +=
1550 g
1551 * syn1nce_window[c + l2
1552 + window_offset];
1553 for (c = 0; c < layer1_size; c++)
1554 syn1nce_window[c + l2 + window_offset] += g
1555 * syn0[c + l1];
1556 if (cap == 1)
1557 for (c = 0; c < layer1_size; c++)
1558 capParam(syn1nce_window,
1559 c + l2 + window_offset);
1560 }
1561 // Learn weights input -> hidden
1562 for (c = 0; c < layer1_size; c++) {
1563 syn0[c + l1] += neu1e[c];
1564 if (syn0[c + l1] > 50)
1565 syn0[c + l1] = 50;
1566 if (syn0[c + l1] < -50)
1567 syn0[c + l1] = -50;
1568 }
1569 }
1570 } else if (type == 4) { //training senna
1571 // in -> hidden
1572 cw = 0;
1573 for (a = 0; a < window * 2 + 1; a++)
1574 if (a != window) {
1575 c = sentence_position - window + a;
1576 if (c < 0)
1577 continue;
1578 if (c >= sentence_length)
1579 continue;
1580 last_word = sen[c];
1581 if (last_word == -1)
1582 continue;
1583 window_offset = a * layer1_size;
1584 if (a > window)
1585 window_offset -= layer1_size;
1586 for (c = 0; c < layer1_size; c++)
1587 neu1[c + window_offset] += syn0[c
1588 + last_word * layer1_size];
1589 cw++;
1590 }
1591 if (cw) {
1592 for (a = 0; a < window_hidden_size; a++) {
1593 c = a * window_layer_size;
1594 for (b = 0; b < window_layer_size; b++) {
1595 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1596 }
1597 }
1598 if (hs)
1599 for (d = 0; d < vocab[word].codelen; d++) {
1600 f = 0;
1601 l2 = vocab[word].point[d] * window_hidden_size;
1602 // Propagate hidden -> output
1603 for (c = 0; c < window_hidden_size; c++)
1604 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1605 if (f <= -MAX_EXP)
1606 continue;
1607 else if (f >= MAX_EXP)
1608 continue;
1609 else
1610 f = expTable[(int) ((f + MAX_EXP)
1611 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1612 // 'g' is the gradient multiplied by the learning rate
1613 g = (1 - vocab[word].code[d] - f) * alpha;
1614 // Propagate errors output -> hidden
1615 for (c = 0; c < window_hidden_size; c++)
1616 neu2e[c] += dHardTanh(neu2[c], g) * g
1617 * syn_hidden_word[c + l2];
1618 // Learn weights hidden -> output
1619 for (c = 0; c < window_hidden_size; c++)
1620 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1621 * neu2[c];
1622 }
1623 // NEGATIVE SAMPLING
1624 if (negative > 0)
1625 for (d = 0; d < negative + 1; d++) {
1626 if (d == 0) {
1627 target = word;
1628 label = 1;
1629 } else {
1630 next_random = next_random
1631 * (unsigned long long) 25214903917 + 11;
1632 if (word_to_group != NULL
1633 && word_to_group[word] != -1) {
1634 target = word;
1635 while (target == word) {
1636 target = group_to_table[word_to_group[word]
1637 * table_size
1638 + (next_random >> 16) % table_size];
1639 next_random = next_random
1640 * (unsigned long long) 25214903917
1641 + 11;
1642 }
1643 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1644 } else {
1645 target =
1646 table[(next_random >> 16) % table_size];
1647 }
1648 if (target == 0)
1649 target = next_random % (vocab_size - 1) + 1;
1650 if (target == word)
1651 continue;
1652 label = 0;
1653 }
1654 l2 = target * window_hidden_size;
1655 f = 0;
1656 for (c = 0; c < window_hidden_size; c++)
1657 f += hardTanh(neu2[c])
1658 * syn_hidden_word_neg[c + l2];
1659 if (f > MAX_EXP)
1660 g = (label - 1) * alpha / negative;
1661 else if (f < -MAX_EXP)
1662 g = (label - 0) * alpha / negative;
1663 else
1664 g = (label
1665 - expTable[(int) ((f + MAX_EXP)
1666 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1667 * alpha / negative;
1668 for (c = 0; c < window_hidden_size; c++)
1669 neu2e[c] += dHardTanh(neu2[c], g) * g
1670 * syn_hidden_word_neg[c + l2];
1671 for (c = 0; c < window_hidden_size; c++)
1672 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1673 * g * neu2[c];
1674 }
1675 for (a = 0; a < window_hidden_size; a++)
1676 for (b = 0; b < window_layer_size; b++)
1677 neu1e[b] += neu2e[a]
1678 * syn_window_hidden[a * window_layer_size + b];
1679 for (a = 0; a < window_hidden_size; a++)
1680 for (b = 0; b < window_layer_size; b++)
1681 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1682 * neu1[b];
1683 // hidden -> in
1684 for (a = 0; a < window * 2 + 1; a++)
1685 if (a != window) {
1686 c = sentence_position - window + a;
1687 if (c < 0)
1688 continue;
1689 if (c >= sentence_length)
1690 continue;
1691 last_word = sen[c];
1692 if (last_word == -1)
1693 continue;
1694 window_offset = a * layer1_size;
1695 if (a > window)
1696 window_offset -= layer1_size;
1697 for (c = 0; c < layer1_size; c++)
1698 syn0[c + last_word * layer1_size] += neu1e[c
1699 + window_offset];
1700 }
1701 }
Marc Kupietz613edbf2018-01-11 21:38:03 +01001702 } else if(type == 5) {
1703 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1704 c = sentence_position - window + a;
1705 if (c < 0) continue;
1706 if (c >= sentence_length) continue;
1707 last_word = sen[c];
1708 if (last_word == -1) continue;
1709 inc_collocator(cdb, word, last_word, a - window);
1710 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1711 // cw++;
1712 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001713 } else {
1714 printf("unknown type %i", type);
1715 exit(0);
1716 }
1717 sentence_position++;
1718 if (sentence_position >= sentence_length) {
1719 sentence_length = 0;
1720 continue;
1721 }
1722 }
1723 fclose(fi);
1724 free(neu1);
1725 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001726 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001727 pthread_exit(NULL);
1728}
1729
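// Diagnostic dump for nets with position-dependent negative-sampling weights
// (syn1neg_window): for each vocabulary entry starting at index cc, it scores every
// word against each relative window position (syn0 . syn1neg_window slice pushed
// through expTable), prints the best collocator per position, an aggregate best over
// all positions, and the global top-N (word, score, position) triples.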
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001730void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001731 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001732 real f, max_f, maxmax_f;
Marc Kupietzf00e7b02023-12-22 11:11:56 +01001733 real *target_sums=0L, bestf[MAX_CC], worstbest;
Marc Kupietz71996e72016-03-18 13:40:24 +01001734 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001735 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001736 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1737
1738 for (d = cc; d < vocab_size; d++) {
1739 for (b = 0; b < vocab_size; b++)
1740 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001741 for (b = 0; b < N; b++)
1742 bestf[b]=-1;
1743 worstbest = -1;
1744
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001745 maxmax_f = -1;
1746 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001747 for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001748 if (a != window) {
1749 max_f = -1;
1750 window_offset = a * layer1_size;
1751 if (a > window)
1752 window_offset -= layer1_size;
1753 for(target = 0; target < vocab_size; target ++) {
1754 if(target == d)
1755 continue;
1756 f = 0;
1757 for (c = 0; c < layer1_size; c++)
1758 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1759 if (f < -MAX_EXP)
1760 continue;
1761 else if (f > MAX_EXP)
1762 continue;
1763 else
1764 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1765 if(f > max_f) {
1766 max_f = f;
1767 max_target = target;
1768 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001769 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001770 if(f > worstbest) {
1771 for (b = 0; b < N; b++) {
1772 if (f > bestf[b]) {
1773 for (e = N - 1; e > b; e--) {
1774 bestf[e] = bestf[e - 1];
1775 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001776 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001777 }
1778 bestf[b] = f;
1779 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001780 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001781 break;
1782 }
1783 }
1784 worstbest = bestf[N-1];
1785 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001786 }
1787 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1788 if(max_f > maxmax_f) {
1789 maxmax_f = max_f;
1790 maxmax_target = max_target;
1791 }
1792 } else {
1793 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1794 }
1795 }
1796 max_f = -1;
1797 for (b = 0; b < vocab_size; b++) {
1798 if(target_sums[b] > max_f) {
1799 max_f = target_sums[b];
1800 max_target = b;
1801 }
1802 }
1803 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001804 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001805 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001806 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001807 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001808 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001809 }
1810}
1811
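/*
 * TrainModel: reads or builds the vocabulary, initializes the network,
 * optionally prints collocation profiles (-show-cc), then runs the training
 * threads (plus a monitor thread when debug_mode > 1) and finally writes
 * either the word vectors or K-means word classes to the output file.
 */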
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001812void TrainModel() {
1813 long a, b, c, d;
1814 FILE *fo;
1815 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
Marc Kupietz202723e2016-07-14 09:12:00 +02001816 threadPos = malloc(num_threads * sizeof(long long));
1817 threadIters = malloc(num_threads * sizeof(int));
1818 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001819 printf("Starting training using file %s\n", train_file);
1820 starting_alpha = alpha;
1821 if (read_vocab_file[0] != 0)
1822 ReadVocab();
1823 else
1824 LearnVocabFromTrainFile();
1825 if (save_vocab_file[0] != 0)
1826 SaveVocab();
1827 if (output_file[0] == 0)
1828 return;
1829 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001830 if(cc > 0)
1831 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001832 if (negative > 0 || nce > 0)
1833 InitUnigramTable();
1834 if (negative_classes_file[0] != 0)
1835 InitClassUnigramTable();
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001836 start = time(NULL);
1837 start_clock = clock();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001838 for (a = 0; a < num_threads; a++)
1839 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
Marc Kupietz202723e2016-07-14 09:12:00 +02001840 if(debug_mode > 1)
1841 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001842 for (a = 0; a < num_threads; a++)
1843 pthread_join(pt[a], NULL);
Marc Kupietz202723e2016-07-14 09:12:00 +02001844 if(debug_mode > 1) {
1845 pthread_join(pt[num_threads], NULL);
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001846 clock_t now = time(NULL);
1847 clock_t now_clock = clock();
1848 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
1849 }
Marc Kupietz613edbf2018-01-11 21:38:03 +01001850 if (type == 5) // don't save vectors for classic collocators
1851 return;
Marc Kupietz202723e2016-07-14 09:12:00 +02001852 if (debug_mode > 1) {
1853 printf("Saving vectors to %s ...", output_file); fflush(stdout); }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001854 fo = fopen(output_file, "wb");
1855 if (classes == 0) {
1856 // Save the word vectors
1857 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1858 for (a = 0; a < vocab_size; a++) {
1859 fprintf(fo, "%s ", vocab[a].word);
1860 if (binary)
1861 for (b = 0; b < layer1_size; b++)
1862 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1863 else
1864 for (b = 0; b < layer1_size; b++)
1865 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1866 fprintf(fo, "\n");
1867 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001868 if(debug_mode > 1)
1869 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001870 } else {
1871 // Run K-means on the word vectors
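 // Standard spherical K-means: words start in classes round-robin; for a fixed
 // number of iterations the class centroids are recomputed, length-normalized,
 // and each word is reassigned to the centroid with the highest dot product.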
1872 int clcn = classes, iter = 10, closeid;
1873 int *centcn = (int *) malloc(classes * sizeof(int));
1874 int *cl = (int *) calloc(vocab_size, sizeof(int));
1875 real closev, x;
1876 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1877 for (a = 0; a < vocab_size; a++)
1878 cl[a] = a % clcn;
1879 for (a = 0; a < iter; a++) {
1880 for (b = 0; b < clcn * layer1_size; b++)
1881 cent[b] = 0;
1882 for (b = 0; b < clcn; b++)
1883 centcn[b] = 1;
1884 for (c = 0; c < vocab_size; c++) {
1885 for (d = 0; d < layer1_size; d++)
1886 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1887 centcn[cl[c]]++;
1888 }
1889 for (b = 0; b < clcn; b++) {
1890 closev = 0;
1891 for (c = 0; c < layer1_size; c++) {
1892 cent[layer1_size * b + c] /= centcn[b];
1893 closev += cent[layer1_size * b + c]
1894 * cent[layer1_size * b + c];
1895 }
1896 closev = sqrt(closev);
1897 for (c = 0; c < layer1_size; c++)
1898 cent[layer1_size * b + c] /= closev;
1899 }
1900 for (c = 0; c < vocab_size; c++) {
1901 closev = -10;
1902 closeid = 0;
1903 for (d = 0; d < clcn; d++) {
1904 x = 0;
1905 for (b = 0; b < layer1_size; b++)
1906 x += cent[layer1_size * d + b]
1907 * syn0[c * layer1_size + b];
1908 if (x > closev) {
1909 closev = x;
1910 closeid = d;
1911 }
1912 }
1913 cl[c] = closeid;
1914 }
1915 }
1916 // Save the K-means classes
1917 for (a = 0; a < vocab_size; a++)
1918 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1919 free(centcn);
1920 free(cent);
1921 free(cl);
1922 }
1923 fclose(fo);
1924 if (save_net_file[0] != 0)
1925 SaveNet();
1926}
1927
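/*
 * ArgPos: returns the index of the command line option str in argv, or -1 if
 * it is not present; exits with an error if no value follows the option.
 */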
1928int ArgPos(char *str, int argc, char **argv) {
1929 int a;
1930 for (a = 1; a < argc; a++)
1931 if (!strcmp(str, argv[a])) {
1932 if (a == argc - 1) {
1933 printf("Argument missing for %s\n", str);
1934 exit(1);
1935 }
1936 return a;
1937 }
1938 return -1;
1939}
1940
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001941void print_help() {
Marc Kupietz83a67d42021-03-22 17:29:36 +01001942 printf("WORD VECTOR estimation toolkit v 0.9.0\n\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001943 printf("Options:\n");
1944 printf("Parameters for training:\n");
1945 printf("\t-train <file>\n");
1946 printf("\t\tUse text data from <file> to train the model\n");
1947 printf("\t-output <file>\n");
1948 printf(
1949 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1950 printf("\t-size <int>\n");
1951 printf("\t\tSet size of word vectors; default is 100\n");
1952 printf("\t-window <int>\n");
1953 printf("\t\tSet max skip length between words; default is 5\n");
1954 printf("\t-sample <float>\n");
1955 printf(
1956 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1957 printf(
1958 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1959 printf("\t-hs <int>\n");
1960 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1961 printf("\t-negative <int>\n");
1962 printf(
1963 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1964 printf("\t-negative-classes <file>\n");
1965 printf("\t\tNegative classes to sample from\n");
1966 printf("\t-nce <int>\n");
1967 printf(
1968 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1969 printf("\t-threads <int>\n");
1970 printf("\t\tUse <int> threads (default 12)\n");
1971 printf("\t-iter <int>\n");
1972 printf("\t\tRun more training iterations (default 5)\n");
1973 printf("\t-min-count <int>\n");
1974 printf(
1975 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1976 printf("\t-alpha <float>\n");
1977 printf(
1978 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1979 printf("\t-classes <int>\n");
1980 printf(
1981 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1982 printf("\t-debug <int>\n");
1983 printf(
1984 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1985 printf("\t-binary <int>\n");
1986 printf(
1987 "\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1988 printf("\t-save-vocab <file>\n");
1989 printf("\t\tThe vocabulary will be saved to <file>\n");
1990 printf("\t-read-vocab <file>\n");
1991 printf(
1992 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1993 printf("\t-read-net <file>\n");
1994 printf(
1995 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1996 printf("\t-save-net <file>\n");
1997 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietze423f732017-12-22 17:57:03 +01001998 printf("\t-magic-stop-file <file>\n");
1999 printf("\t\tIf the magic file <file> exists, training will stop after the current cycle.\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002000 printf("\t-show-cc <int>\n");
2001 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002002 printf("\t-type <int>\n");
2003 printf(
Marc Kupietz613edbf2018-01-11 21:38:03 +01002004 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type, 5 for storing positional bigrams)\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002005 printf("\t-cap <int>\n");
2006 printf(
2007 "\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
2008 printf("\nExamples:\n");
2009 printf(
Marc Kupietz83a67d42021-03-22 17:29:36 +01002010 "./dereko2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
Marc Kupietzc7f773b2017-12-02 12:04:03 +01002011}
2012
2013int main(int argc, char **argv) {
2014 int i;
2015 setlocale(LC_ALL, "");
2016 if (argc == 1) {
2017 print_help();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002018 return 0;
2019 }
2020 output_file[0] = 0;
2021 save_vocab_file[0] = 0;
2022 read_vocab_file[0] = 0;
2023 save_net_file[0] = 0;
2024 read_net_file[0] = 0;
2025 negative_classes_file[0] = 0;
Marc Kupietzc7f773b2017-12-02 12:04:03 +01002026 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2027 print_help();
2028 return(0);
2029 }
2030 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2031 print_help();
2032 return(0);
2033 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002034 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2035 layer1_size = atoi(argv[i + 1]);
2036 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2037 strcpy(train_file, argv[i + 1]);
2038 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2039 strcpy(save_vocab_file, argv[i + 1]);
2040 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2041 strcpy(read_vocab_file, argv[i + 1]);
2042 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2043 strcpy(save_net_file, argv[i + 1]);
2044 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2045 strcpy(read_net_file, argv[i + 1]);
Marc Kupietze423f732017-12-22 17:57:03 +01002046 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2047 strcpy(magic_stop_file, argv[i + 1]);
2048 if (access(magic_stop_file, F_OK ) != -1) {
2049 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2050 exit(1);
2051 }
2052 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002053 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2054 debug_mode = atoi(argv[i + 1]);
2055 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2056 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002057 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2058 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002059 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2060 type = atoi(argv[i + 1]);
2061 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2062 strcpy(output_file, argv[i + 1]);
2063 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2064 window = atoi(argv[i + 1]);
2065 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2066 sample = atof(argv[i + 1]);
2067 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2068 hs = atoi(argv[i + 1]);
2069 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2070 negative = atoi(argv[i + 1]);
2071 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2072 strcpy(negative_classes_file, argv[i + 1]);
2073 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2074 nce = atoi(argv[i + 1]);
2075 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2076 num_threads = atoi(argv[i + 1]);
2077 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2078 iter = atoi(argv[i + 1]);
2079 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2080 min_count = atoi(argv[i + 1]);
2081 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2082 classes = atoi(argv[i + 1]);
Marc Kupietz879333c2023-12-20 11:41:09 +01002083 if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0) {
Marc Kupietz178a3c92023-12-22 15:12:27 +01002084 metadata_categories = atoi(argv[i + 1]);
2085 if (metadata_categories > MAX_METADATA_CATEGORIES) {
2086 printf("ERROR: metadata categories must be <= %d\n", MAX_METADATA_CATEGORIES);
2087 exit(1);
2088 }
2089 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002094 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2095 cap = atoi(argv[i + 1]);
2096 if (type == 0 || type == 2 || type == 4)
2097 alpha = 0.05;
Marc Kupietz613edbf2018-01-11 21:38:03 +01002098 if (type==5) {
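 // Collocator-count mode: disable subsampling and open the output path as a
 // collocator database instead of writing a vector file later on.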
2099 sample = 0;
2100 cdb = open_collocatordb_for_write(output_file);
2101 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002102 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2103 alpha = atof(argv[i + 1]);
2104 vocab = (struct vocab_word *) calloc(vocab_max_size,
2105 sizeof(struct vocab_word));
2106 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2107 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2108 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2109 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2110 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
2111 }
Marc Kupietz210b9d52016-04-02 21:48:13 +02002112 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002113 TrainModel();
2114 return 0;
2115}
2116