Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
21#include <pthread.h>
Marc Kupietz613edbf2018-01-11 21:38:03 +010022#include <collocatordb.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010023
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010028#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010029#define MAX_CODE_LENGTH 40
Marc Kupietz178a3c92023-12-22 15:12:27 +010030#define MAX_METADATA_CATEGORIES 4
Marc Kupietzd6f9c712016-03-16 11:50:56 +010031
Marc Kupietz178a3c92023-12-22 15:12:27 +010032#define METADATA_MARKER ' '
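// Each input line may start with metadata_categories tab-separated metadata fields (the
// per-sentence buffer in TrainModelThread holds at most MAX_METADATA_CATEGORIES of them).
// ReadWord() hands such fields back prefixed with METADATA_MARKER so that callers can tell
// metadata tokens apart from ordinary words.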
Marc Kupietzd6f9c712016-03-16 11:50:56 +010033const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
34
35typedef float real; // Precision of float numbers
36
37struct vocab_word {
38 long long cn;
39 int *point;
40 char *word, *code, codelen;
41};
42
43char train_file[MAX_STRING], output_file[MAX_STRING];
44char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
45char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
Marc Kupietze423f732017-12-22 17:57:03 +010046char magic_stop_file[MAX_STRING];
47
Marc Kupietzd6f9c712016-03-16 11:50:56 +010048struct vocab_word *vocab;
49int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietz879333c2023-12-20 11:41:09 +010050 num_threads = 12, min_reduce = 1, metadata_categories = 0, expected_metadata_categories = 0;
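// type selects the architecture: 0 = CBOW, 1 = skip-gram, 2 = continuous window (cwindow),
// 3 = structured skip-gram, 4 = senna-style window network, 5 = collocation counting only
// (co-occurrences are written to the collocator database instead of training vectors).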
Marc Kupietzd6f9c712016-03-16 11:50:56 +010051int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020052long long *threadPos;
53int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010054long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
55long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
56 classes = 0;
57real alpha = 0.025, starting_alpha, sample = 1e-3;
58real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020059real avgWordLength=0;
Marc Kupietzb366bcd2018-01-11 21:29:41 +010060clock_t start, start_clock;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010061
62real *syn1_window, *syn1neg_window, *syn1nce_window;
63int w_offset, window_layer_size;
64
65int window_hidden_size = 500;
66real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
67 *syn_hidden_word_nce;
68
69int hs = 0, negative = 5;
70const int table_size = 1e8;
71int *table;
72
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010073long cc = 0;
74
Marc Kupietzd6f9c712016-03-16 11:50:56 +010075// contrastive negative sampling
76char negative_classes_file[MAX_STRING];
77int *word_to_group;
78int *group_to_table; //group_size*table_size
79int class_number;
80
81//nce
82real* noise_distribution;
83int nce = 0;
84
85//param caps
86real CAP_VALUE = 50;
87int cap = 0;
88
Marc Kupietz613edbf2018-01-11 21:38:03 +010089COLLOCATORDB *cdb = NULL;
90
Marc Kupietzd6f9c712016-03-16 11:50:56 +010091void capParam(real* array, int index) {
92 if (array[index] > CAP_VALUE)
93 array[index] = CAP_VALUE;
94 else if (array[index] < -CAP_VALUE)
95 array[index] = -CAP_VALUE;
96}
97
98real hardTanh(real x) {
99 if (x >= 1) {
100 return 1;
101 } else if (x <= -1) {
102 return -1;
103 } else {
104 return x;
105 }
106}
107
108real dHardTanh(real x, real g) {
109 if (x > 1 && g > 0) {
110 return 0;
111 }
112 if (x < -1 && g < 0) {
113 return 0;
114 }
115 return 1;
116}
117
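// Builds the table used for negative sampling: each word gets a share of the table
// proportional to cn^0.75, so drawing a random slot approximates the smoothed unigram
// distribution. The same distribution is kept in noise_distribution for NCE.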
118void InitUnigramTable() {
119 int a, i;
120 long long train_words_pow = 0;
121 real d1, power = 0.75;
122 table = (int *) malloc(table_size * sizeof(int));
123 for (a = 0; a < vocab_size; a++)
124 train_words_pow += pow(vocab[a].cn, power);
125 i = 0;
126 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
127 for (a = 0; a < table_size; a++) {
128 table[a] = i;
129 if (a / (real) table_size > d1) {
130 i++;
131 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
132 }
133 if (i >= vocab_size)
134 i = vocab_size - 1;
135 }
136
137 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
138 for (a = 0; a < vocab_size; a++)
139 noise_distribution[a] = pow(vocab[a].cn, power)
140 / (real) train_words_pow;
141}
142
143// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
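// While metadata fields are still expected on the current line, each tab-terminated field is
// returned on its own, prefixed with METADATA_MARKER; a newline yields "</s>" and re-arms the
// metadata counter for the next line.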
144void ReadWord(char *word, FILE *fin) {
145 int a = 0, ch;
146 while (!feof(fin)) {
147 ch = fgetc(fin);
148 if (ch == 13)
149 continue;
150 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
Marc Kupietz879333c2023-12-20 11:41:09 +0100151 if (ch == '\t' && expected_metadata_categories > 0) {
Marc Kupietz178a3c92023-12-22 15:12:27 +0100152 word[a] = 0;
153 a = 0;
154 expected_metadata_categories--;
Marc Kupietzc564b1f2023-12-22 15:38:29 +0100155 if (debug_mode > 3)
Marc Kupietz178a3c92023-12-22 15:12:27 +0100156 printf("Metadata: %s\n", word);
157        memmove(word + 1, word, strlen(word) + 1); // regions overlap, so memmove rather than strcpy
158 *word = METADATA_MARKER;
159 return;
Marc Kupietz879333c2023-12-20 11:41:09 +0100160 } else {
161 if (a > 0) {
162 if (ch == '\n') {
163 expected_metadata_categories = metadata_categories;
164 ungetc(ch, fin);
165 }
166 break;
167 }
168 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100169 if (ch == '\n') {
170 strcpy(word, (char *) "</s>");
Marc Kupietz879333c2023-12-20 11:41:09 +0100171 expected_metadata_categories = metadata_categories;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100172 return;
173 } else
174 continue;
175 }
176 word[a] = ch;
177 a++;
178 if (a >= MAX_STRING - 1)
179 a--; // Truncate too long words
180 }
181 word[a] = 0;
182}
183
184// Returns hash value of a word
185int GetWordHash(char *word) {
186 unsigned long long a, hash = 0;
187 for (a = 0; a < strlen(word); a++)
188 hash = hash * 257 + word[a];
189 hash = hash % vocab_hash_size;
190 return hash;
191}
192
193// Returns position of a word in the vocabulary; if the word is not found, returns -1
194int SearchVocab(char *word) {
195 unsigned int hash = GetWordHash(word);
196 while (1) {
197 if (vocab_hash[hash] == -1)
198 return -1;
199 if (!strcmp(word, vocab[vocab_hash[hash]].word))
200 return vocab_hash[hash];
201 hash = (hash + 1) % vocab_hash_size;
202 }
203 return -1;
204}
205
206// Reads a word and returns its index in the vocabulary
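// Sets *is_metadata to 1 if the word carried the METADATA_MARKER prefix, 0 otherwise.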
Marc Kupietzc564b1f2023-12-22 15:38:29 +0100207int ReadWordIndex(FILE *fin, int *is_metadata) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100208 char word[MAX_STRING];
209 ReadWord(word, fin);
210 if (feof(fin))
211 return -1;
Marc Kupietzc564b1f2023-12-22 15:38:29 +0100212 if (word[0] == METADATA_MARKER) {
213 *is_metadata = 1;
214 } else {
215 *is_metadata = 0;
216 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100217 return SearchVocab(word);
218}
219
220// Adds a word to the vocabulary
221int AddWordToVocab(char *word) {
222 unsigned int hash, length = strlen(word) + 1;
223 if (length > MAX_STRING)
224 length = MAX_STRING;
225 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
226 strcpy(vocab[vocab_size].word, word);
227 vocab[vocab_size].cn = 0;
228 vocab_size++;
229 // Reallocate memory if needed
230 if (vocab_size + 2 >= vocab_max_size) {
231 vocab_max_size += 1000;
232 vocab = (struct vocab_word *) realloc(vocab,
233 vocab_max_size * sizeof(struct vocab_word));
234 }
235 hash = GetWordHash(word);
236 while (vocab_hash[hash] != -1)
237 hash = (hash + 1) % vocab_hash_size;
238 vocab_hash[hash] = vocab_size - 1;
239 return vocab_size - 1;
240}
241
242// Used later for sorting by word counts
243int VocabCompare(const void *a, const void *b) {
244 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
245}
246
247// Sorts the vocabulary by frequency using word counts
248void SortVocab() {
249 int a, size;
250 unsigned int hash;
251 // Sort the vocabulary and keep </s> at the first position
252 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
253 for (a = 0; a < vocab_hash_size; a++)
254 vocab_hash[a] = -1;
255 size = vocab_size;
256 train_words = 0;
257 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200258 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100259 // Words occurring less than min_count times will be discarded from the vocab
260 if ((vocab[a].cn < min_count) && (a != 0)) {
261 vocab_size--;
262 free(vocab[a].word);
263 } else {
264 // Hash must be recomputed, as it is no longer valid after sorting
265 hash = GetWordHash(vocab[a].word);
266 while (vocab_hash[hash] != -1)
267 hash = (hash + 1) % vocab_hash_size;
268 vocab_hash[hash] = a;
269 train_words += vocab[a].cn;
270 }
271 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200272 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100273 vocab = (struct vocab_word *) realloc(vocab,
274 (vocab_size + 1) * sizeof(struct vocab_word));
275 // Allocate memory for the binary tree construction
276 for (a = 0; a < vocab_size; a++) {
277 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
278 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
279 }
280}
281
282// Reduces the vocabulary by removing infrequent tokens
283void ReduceVocab() {
284 int a, b = 0;
285 unsigned int hash;
286 for (a = 0; a < vocab_size; a++)
287 if (vocab[a].cn > min_reduce) {
288 vocab[b].cn = vocab[a].cn;
289 vocab[b].word = vocab[a].word;
290 b++;
291 } else
292 free(vocab[a].word);
293 vocab_size = b;
294 for (a = 0; a < vocab_hash_size; a++)
295 vocab_hash[a] = -1;
296 for (a = 0; a < vocab_size; a++) {
297 // Hash must be recomputed, as it is no longer valid
298 hash = GetWordHash(vocab[a].word);
299 while (vocab_hash[hash] != -1)
300 hash = (hash + 1) % vocab_hash_size;
301 vocab_hash[hash] = a;
302 }
303 fflush(stdout);
304 min_reduce++;
305}
306
307// Create binary Huffman tree using the word counts
308// Frequent words will have short unique binary codes
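// vocab[a].code stores the 0/1 branch decisions from the root to the word's leaf and
// vocab[a].point the indices of the inner nodes along that path (used by hierarchical softmax).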
309void CreateBinaryTree() {
310 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
311 char code[MAX_CODE_LENGTH];
312 long long *count = (long long *) calloc(vocab_size * 2 + 1,
313 sizeof(long long));
314 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
315 sizeof(long long));
316 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
317 sizeof(long long));
318 for (a = 0; a < vocab_size; a++)
319 count[a] = vocab[a].cn;
320 for (a = vocab_size; a < vocab_size * 2; a++)
321 count[a] = 1e15;
322 pos1 = vocab_size - 1;
323 pos2 = vocab_size;
324 // The following algorithm constructs the Huffman tree by adding one node at a time
325 for (a = 0; a < vocab_size - 1; a++) {
326 // First, find two smallest nodes 'min1, min2'
327 if (pos1 >= 0) {
328 if (count[pos1] < count[pos2]) {
329 min1i = pos1;
330 pos1--;
331 } else {
332 min1i = pos2;
333 pos2++;
334 }
335 } else {
336 min1i = pos2;
337 pos2++;
338 }
339 if (pos1 >= 0) {
340 if (count[pos1] < count[pos2]) {
341 min2i = pos1;
342 pos1--;
343 } else {
344 min2i = pos2;
345 pos2++;
346 }
347 } else {
348 min2i = pos2;
349 pos2++;
350 }
351 count[vocab_size + a] = count[min1i] + count[min2i];
352 parent_node[min1i] = vocab_size + a;
353 parent_node[min2i] = vocab_size + a;
354 binary[min2i] = 1;
355 }
356 // Now assign binary code to each vocabulary word
357 for (a = 0; a < vocab_size; a++) {
358 b = a;
359 i = 0;
360 while (1) {
361 code[i] = binary[b];
362 point[i] = b;
363 i++;
364 b = parent_node[b];
365 if (b == vocab_size * 2 - 2)
366 break;
367 }
368 vocab[a].codelen = i;
369 vocab[a].point[0] = vocab_size - 2;
370 for (b = 0; b < i; b++) {
371 vocab[a].code[i - b - 1] = code[b];
372 vocab[a].point[i - b] = point[b] - vocab_size;
373 }
374 }
375 free(count);
376 free(binary);
377 free(parent_node);
378}
379
380void LearnVocabFromTrainFile() {
381 char word[MAX_STRING];
382 FILE *fin;
383 long long a, i;
384 for (a = 0; a < vocab_hash_size; a++)
385 vocab_hash[a] = -1;
386 fin = fopen(train_file, "rb");
387 if (fin == NULL) {
388 printf("ERROR: training data file not found!\n");
389 exit(1);
390 }
391 vocab_size = 0;
392 AddWordToVocab((char *) "</s>");
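	// Consume the leading metadata fields of the first line so they are not counted as tokens.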
Marc Kupietz879333c2023-12-20 11:41:09 +0100393 for (int j=0; j < metadata_categories; j++) {
394 ReadWord(word, fin);
395 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100396 while (1) {
397 ReadWord(word, fin);
398 if (feof(fin))
399 break;
400 train_words++;
401 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
402 printf("%lldK%c", train_words / 1000, 13);
403 fflush(stdout);
404 }
405 i = SearchVocab(word);
406 if (i == -1) {
407 a = AddWordToVocab(word);
408 vocab[a].cn = 1;
409 } else
410 vocab[i].cn++;
411 if (vocab_size > vocab_hash_size * 0.7)
412 ReduceVocab();
413 }
414 SortVocab();
415 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200416 printf("Vocab size: %'lld\n", vocab_size);
417 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100418 }
419 file_size = ftell(fin);
420 fclose(fin);
421}
422
423void SaveVocab() {
424 long long i;
425 FILE *fo = fopen(save_vocab_file, "wb");
426 for (i = 0; i < vocab_size; i++)
427 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
428 fclose(fo);
429}
430
431void ReadVocab() {
432 long long a, i = 0;
433 char c;
434 char word[MAX_STRING];
435 FILE *fin = fopen(read_vocab_file, "rb");
436 if (fin == NULL) {
437 printf("Vocabulary file not found\n");
438 exit(1);
439 }
440 for (a = 0; a < vocab_hash_size; a++)
441 vocab_hash[a] = -1;
442 vocab_size = 0;
443 while (1) {
444 ReadWord(word, fin);
445 if (feof(fin))
446 break;
447 a = AddWordToVocab(word);
448 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
449 i++;
450 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200451 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100452 fin = fopen(train_file, "rb");
453 if (fin == NULL) {
454 printf("ERROR: training data file not found!\n");
455 exit(1);
456 }
457 fseek(fin, 0, SEEK_END);
458 file_size = ftell(fin);
459 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200460 SortVocab();
461 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200462 printf("Vocab size: %'lld\n", vocab_size);
463 printf("Words in vocab's train file: %'lld\n", train_words);
464 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200465 }
Marc Kupietze23c5402016-07-14 11:10:09 +0200466 train_words = file_size / avgWordLength;
467 if(debug_mode > 0)
468 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100469}
470
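// Reads (class, word, ...) entries from -negative-classes-file and builds one sampling table
// per class so that negative samples for a word are drawn from that word's own class.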
471void InitClassUnigramTable() {
472 long long a, c;
473 printf("loading class unigrams \n");
474 FILE *fin = fopen(negative_classes_file, "rb");
475 if (fin == NULL) {
476 printf("ERROR: class file not found!\n");
477 exit(1);
478 }
479 word_to_group = (int *) malloc(vocab_size * sizeof(int));
480 for (a = 0; a < vocab_size; a++)
481 word_to_group[a] = -1;
482 char class[MAX_STRING];
483 char prev_class[MAX_STRING];
484 prev_class[0] = 0;
485 char word[MAX_STRING];
486 class_number = -1;
487 while (1) {
488 if (feof(fin))
489 break;
490 ReadWord(class, fin);
491 ReadWord(word, fin);
492 int word_index = SearchVocab(word);
493 if (word_index != -1) {
494 if (strcmp(class, prev_class) != 0) {
495 class_number++;
496 strcpy(prev_class, class);
497 }
498 word_to_group[word_index] = class_number;
499 }
500 ReadWord(word, fin);
501 }
502 class_number++;
503 fclose(fin);
504
505 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
506 long long train_words_pow = 0;
507 real d1, power = 0.75;
508
509 for (c = 0; c < class_number; c++) {
510 long long offset = c * table_size;
511 train_words_pow = 0;
512 for (a = 0; a < vocab_size; a++)
513 if (word_to_group[a] == c)
514 train_words_pow += pow(vocab[a].cn, power);
515 int i = 0;
516 while (word_to_group[i] != c && i < vocab_size)
517 i++;
518 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
519 for (a = 0; a < table_size; a++) {
520 //printf("index %lld , word %d\n", a, i);
521 group_to_table[offset + a] = i;
522 if (a / (real) table_size > d1) {
523 i++;
524 while (word_to_group[i] != c && i < vocab_size)
525 i++;
526 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
527 }
528 if (i >= vocab_size)
529 while (word_to_group[i] != c && i >= 0)
530 i--;
531 }
532 }
533}
534
Marc Kupietz61485ad2023-12-22 16:16:59 +0100535void SaveArgs(unsigned int argc, char **argv) {
Marc Kupietz210b9d52016-04-02 21:48:13 +0200536 unsigned int i;
Marc Kupietz44136742017-12-22 17:52:56 +0100537 char args_file[MAX_STRING];
538 strcpy(args_file, output_file);
Marc Kupietz210b9d52016-04-02 21:48:13 +0200539 strcat(args_file, ".args");
540 FILE *fargs = fopen(args_file, "w");
541 if (fargs == NULL) {
542 printf("Cannot save args to %s.\n", args_file);
543 return;
544 }
545
Marc Kupietz44136742017-12-22 17:52:56 +0100546 for(i=1; i<argc; i++)
547 fprintf(fargs, "%s ", argv[i]);
548
549 fprintf(fargs, "\n");
Marc Kupietz210b9d52016-04-02 21:48:13 +0200550 fclose(fargs);
Marc Kupietz44136742017-12-22 17:52:56 +0100551
Marc Kupietz210b9d52016-04-02 21:48:13 +0200552 return;
553}
554
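// Writes syn0 and syn1neg_window to -save-net so training can be resumed later with
// -read-net; only supported for type 3 (structured skip-gram) with negative sampling.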
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100555void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100556 if(type != 3 || negative <= 0) {
557 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
558 return;
559 }
560
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100561 FILE *fnet = fopen(save_net_file, "wb");
562 if (fnet == NULL) {
563 printf("Net parameter file not found\n");
564 exit(1);
565 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100566 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100567 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100568 fclose(fnet);
569}
570
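// Allocates the network: syn0 (input vectors) is initialized randomly unless -read-net is
// given (type 3 with negative sampling only); the output-side matrices for hs, negative
// sampling and NCE are allocated and zero-initialized as needed. Finally builds the Huffman tree.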
571void InitNet() {
572 long long a, b;
573 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100574 long long read;
575
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100576 window_layer_size = layer1_size * window * 2;
577 a = posix_memalign((void **) &syn0, 128,
578 (long long) vocab_size * layer1_size * sizeof(real));
579 if (syn0 == NULL) {
580 printf("Memory allocation failed\n");
581 exit(1);
582 }
583
584 if (hs) {
585 a = posix_memalign((void **) &syn1, 128,
586 (long long) vocab_size * layer1_size * sizeof(real));
587 if (syn1 == NULL) {
588 printf("Memory allocation failed\n");
589 exit(1);
590 }
591 a = posix_memalign((void **) &syn1_window, 128,
592 (long long) vocab_size * window_layer_size * sizeof(real));
593 if (syn1_window == NULL) {
594 printf("Memory allocation failed\n");
595 exit(1);
596 }
597 a = posix_memalign((void **) &syn_hidden_word, 128,
598 (long long) vocab_size * window_hidden_size * sizeof(real));
599 if (syn_hidden_word == NULL) {
600 printf("Memory allocation failed\n");
601 exit(1);
602 }
603
604 for (a = 0; a < vocab_size; a++)
605 for (b = 0; b < layer1_size; b++)
606 syn1[a * layer1_size + b] = 0;
607 for (a = 0; a < vocab_size; a++)
608 for (b = 0; b < window_layer_size; b++)
609 syn1_window[a * window_layer_size + b] = 0;
610 for (a = 0; a < vocab_size; a++)
611 for (b = 0; b < window_hidden_size; b++)
612 syn_hidden_word[a * window_hidden_size + b] = 0;
613 }
614 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100615 if(type == 0) {
616 a = posix_memalign((void **) &syn1neg, 128,
617 (long long) vocab_size * layer1_size * sizeof(real));
618 if (syn1neg == NULL) {
619 printf("Memory allocation failed\n");
620 exit(1);
621 }
622 for (a = 0; a < vocab_size; a++)
623 for (b = 0; b < layer1_size; b++)
624 syn1neg[a * layer1_size + b] = 0;
625 } else if (type == 3) {
626 a = posix_memalign((void **) &syn1neg_window, 128,
627 (long long) vocab_size * window_layer_size * sizeof(real));
628 if (syn1neg_window == NULL) {
629 printf("Memory allocation failed\n");
630 exit(1);
631 }
632 for (a = 0; a < vocab_size; a++)
633 for (b = 0; b < window_layer_size; b++)
634 syn1neg_window[a * window_layer_size + b] = 0;
635 } else if (type == 4) {
636 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
637 (long long) vocab_size * window_hidden_size * sizeof(real));
638 if (syn_hidden_word_neg == NULL) {
639 printf("Memory allocation failed\n");
640 exit(1);
641 }
642 for (a = 0; a < vocab_size; a++)
643 for (b = 0; b < window_hidden_size; b++)
644 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100645 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100646 }
647 if (nce > 0) {
648 a = posix_memalign((void **) &syn1nce, 128,
649 (long long) vocab_size * layer1_size * sizeof(real));
650 if (syn1nce == NULL) {
651 printf("Memory allocation failed\n");
652 exit(1);
653 }
654 a = posix_memalign((void **) &syn1nce_window, 128,
655 (long long) vocab_size * window_layer_size * sizeof(real));
656 if (syn1nce_window == NULL) {
657 printf("Memory allocation failed\n");
658 exit(1);
659 }
660 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
661 (long long) vocab_size * window_hidden_size * sizeof(real));
662 if (syn_hidden_word_nce == NULL) {
663 printf("Memory allocation failed\n");
664 exit(1);
665 }
666
667 for (a = 0; a < vocab_size; a++)
668 for (b = 0; b < layer1_size; b++)
669 syn1nce[a * layer1_size + b] = 0;
670 for (a = 0; a < vocab_size; a++)
671 for (b = 0; b < window_layer_size; b++)
672 syn1nce_window[a * window_layer_size + b] = 0;
673 for (a = 0; a < vocab_size; a++)
674 for (b = 0; b < window_hidden_size; b++)
675 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
676 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100677
Marc Kupietz1006a272016-03-16 15:50:20 +0100678 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100679 a = posix_memalign((void **) &syn_window_hidden, 128,
680 window_hidden_size * window_layer_size * sizeof(real));
681 if (syn_window_hidden == NULL) {
682 printf("Memory allocation failed\n");
683 exit(1);
684 }
685 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
686 next_random = next_random * (unsigned long long) 25214903917 + 11;
687 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
688 - 0.5) / (window_hidden_size * window_layer_size);
689 }
690 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100691
692 if (read_net_file[0] == 0) {
693 for (a = 0; a < vocab_size; a++)
694 for (b = 0; b < layer1_size; b++) {
695 next_random = next_random * (unsigned long long) 25214903917
696 + 11;
697 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
698 / (real) 65536) - 0.5) / layer1_size;
699 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100700 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100701 FILE *fnet = fopen(read_net_file, "rb");
702 if (fnet == NULL) {
703 printf("Net parameter file not found\n");
704 exit(1);
705 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100706 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
707 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
708 if(read != vocab_size * layer1_size) {
709 fprintf(stderr, "read-net failed %lld\n", read);
710 exit(-1);
711 }
712 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
713 if(read != (long long) vocab_size * window_layer_size) {
714 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
715 (long long) sizeof(real) * vocab_size * window_layer_size);
716 exit(-1);
717 }
718 fgetc(fnet);
719 if(!feof(fnet)) {
720 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
721 exit(-1);
722 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100723 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100724 } else {
725 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
726 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100727 }
728
729 CreateBinaryTree();
730}
731
Marc Kupietz202723e2016-07-14 09:12:00 +0200732char *currentDateTime(char *buf, real offset) {
733 time_t t;
734 time(&t);
735 t += (long) offset;
736 struct tm tstruct;
737 tstruct = *localtime(&t);
738 strftime(buf, 80, "%c", &tstruct);
739 return buf;
740}
741
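// Progress monitor: once per second, aggregates the per-thread positions in threadPos and
// threadIters to print the current learning rate, completed share, throughput, elapsed time,
// time to go and the estimated finishing time.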
742void *MonitorThread(void *id) {
743 char *timebuf = malloc(80);;
744 int i, n=num_threads;
745 long long sum;
746 sleep(1);
747 while(n > 0) {
748 sleep(1);
749 sum = n = 0;
750 for(i=0; i < num_threads; i++) {
751 if(threadPos[i] >= 0) {
752 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
753 n++;
754 } else {
755 sum += iter * file_size / num_threads;
756 }
757 }
758 if(n == 0)
759 break;
760 real finished_portion = (real) sum / (float) (file_size * iter);
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100761 long long now = time(NULL);
762 long long elapsed = (now - start);
763 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
Marc Kupietz202723e2016-07-14 09:12:00 +0200764
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100765 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
Marc Kupietz202723e2016-07-14 09:12:00 +0200766 alpha,
767 finished_portion * 100,
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100768 (float) sum / elapsed / 1000,
Marc Kupietz202723e2016-07-14 09:12:00 +0200769 elapsed,
770 ttg,
771 currentDateTime(timebuf, ttg)
772 );
773 fflush(stdout);
774 }
775 pthread_exit(NULL);
776}
777
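// Worker thread: trains on its slice [start_pos, end_pos) of the file for `iter` epochs,
// reading one sentence at a time (metadata tokens are buffered separately), applying
// subsampling, and then the update rule selected by `type`.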
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100778void *TrainModelThread(void *id) {
779 long long a, b, d, cw, word, last_word, sentence_length = 0,
780 sentence_position = 0;
Marc Kupietzc564b1f2023-12-22 15:38:29 +0100781 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1], metadata[MAX_METADATA_CATEGORIES];
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100782 long long l1, l2, c, target, label, local_iter = iter;
783 unsigned long long next_random = (long long) id;
784 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100785 int input_len_1 = layer1_size;
786 int window_offset = -1;
787 if (type == 2 || type == 4) {
788 input_len_1 = window_layer_size;
789 }
790 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
791 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200792 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100793
794 int input_len_2 = 0;
795 if (type == 4) {
796 input_len_2 = window_hidden_size;
797 }
798 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
799 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
800
801 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200802 long long start_pos = file_size / (long long) num_threads * (long long) id;
803 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
804 long long current_pos = start_pos;
Marc Kupietzc564b1f2023-12-22 15:38:29 +0100805 long long last_pos = start_pos;
806 int is_metadata = 0;
807 int metadata_index = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200808 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100809 while (1) {
Marc Kupietz202723e2016-07-14 09:12:00 +0200810 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100811 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200812 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100813 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100814 alpha = starting_alpha
815 * (1 - word_count_actual / (real) (iter * train_words + 1));
816 if (alpha < starting_alpha * 0.0001)
817 alpha = starting_alpha * 0.0001;
818 }
819 if (sentence_length == 0) {
820 while (1) {
Marc Kupietzc564b1f2023-12-22 15:38:29 +0100821 word = ReadWordIndex(fi, &is_metadata);
822 if (is_metadata) {
823				if (debug_mode > 1 && word != -1)
824					printf("Metadata: %s\n", vocab[word].word);
825 metadata[metadata_index++] = word;
826 if (metadata_index >= metadata_categories) {
827 metadata_index = 0;
828 }
829 continue;
830 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100831 if (feof(fi))
832 break;
833 if (word == -1)
834 continue;
835 word_count++;
836 if (word == 0)
837 break;
838			// The subsampling randomly discards frequent words while keeping the ranking the same
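			// keep probability: (sqrt(cn / (sample * train_words)) + 1) * (sample * train_words) / cn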
839 if (sample > 0) {
840 real ran = (sqrt(vocab[word].cn / (sample * train_words))
841 + 1) * (sample * train_words) / vocab[word].cn;
842 next_random = next_random * (unsigned long long) 25214903917
843 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100844 if (ran < (next_random & 0xFFFF) / (real) 65536) {
845 if(type == 3) // in structured skipgrams
846 word = -2; // keep the window position correct
847 else
848 continue;
849 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100850 }
851 sen[sentence_length] = word;
852 sentence_length++;
853 if (sentence_length >= MAX_SENTENCE_LENGTH)
854 break;
855 }
856 sentence_position = 0;
857 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200858 current_pos = threadPos[(long) id] = ftell(fi);
859 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100860 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200861 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100862 local_iter--;
863 if (local_iter == 0)
864 break;
Marc Kupietze423f732017-12-22 17:57:03 +0100865 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
866				printf("Magic stop file %s found. Stopping training ...\n", magic_stop_file);
867 break;
868 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100869 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200870 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100871 last_word_count = 0;
872 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200873 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100874 continue;
875 }
876 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200877 while (word == -2 && sentence_position<sentence_length)
878 word = sen[++sentence_position];
879 if (sentence_position>=sentence_length) {
880 sentence_length=0;
881 continue;
882 }
883 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100884 continue;
885 for (c = 0; c < input_len_1; c++)
886 neu1[c] = 0;
887 for (c = 0; c < input_len_1; c++)
888 neu1e[c] = 0;
889 for (c = 0; c < input_len_2; c++)
890 neu2[c] = 0;
891 for (c = 0; c < input_len_2; c++)
892 neu2e[c] = 0;
893 next_random = next_random * (unsigned long long) 25214903917 + 11;
894 b = next_random % window;
895 if (type == 0) { //train the cbow architecture
896 // in -> hidden
897 cw = 0;
898 for (a = b; a < window * 2 + 1 - b; a++)
899 if (a != window) {
900 c = sentence_position - window + a;
901 if (c < 0)
902 continue;
903 if (c >= sentence_length)
904 continue;
905 last_word = sen[c];
906 if (last_word == -1)
907 continue;
908 for (c = 0; c < layer1_size; c++)
909 neu1[c] += syn0[c + last_word * layer1_size];
910 cw++;
911 }
912 if (cw) {
913 for (c = 0; c < layer1_size; c++)
914 neu1[c] /= cw;
915 if (hs)
916 for (d = 0; d < vocab[word].codelen; d++) {
917 f = 0;
918 l2 = vocab[word].point[d] * layer1_size;
919 // Propagate hidden -> output
920 for (c = 0; c < layer1_size; c++)
921 f += neu1[c] * syn1[c + l2];
922 if (f <= -MAX_EXP)
923 continue;
924 else if (f >= MAX_EXP)
925 continue;
926 else
927 f = expTable[(int) ((f + MAX_EXP)
928 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
929 // 'g' is the gradient multiplied by the learning rate
930 g = (1 - vocab[word].code[d] - f) * alpha;
931 // Propagate errors output -> hidden
932 for (c = 0; c < layer1_size; c++)
933 neu1e[c] += g * syn1[c + l2];
934 // Learn weights hidden -> output
935 for (c = 0; c < layer1_size; c++)
936 syn1[c + l2] += g * neu1[c];
937 if (cap == 1)
938 for (c = 0; c < layer1_size; c++)
939 capParam(syn1, c + l2);
940 }
941 // NEGATIVE SAMPLING
942 if (negative > 0)
943 for (d = 0; d < negative + 1; d++) {
944 if (d == 0) {
945 target = word;
946 label = 1;
947 } else {
948 next_random = next_random
949 * (unsigned long long) 25214903917 + 11;
950 if (word_to_group != NULL
951 && word_to_group[word] != -1) {
952 target = word;
953 while (target == word) {
954 target = group_to_table[word_to_group[word]
955 * table_size
956 + (next_random >> 16) % table_size];
957 next_random = next_random
958 * (unsigned long long) 25214903917
959 + 11;
960 }
961 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
962 } else {
963 target =
964 table[(next_random >> 16) % table_size];
965 }
966 if (target == 0)
967 target = next_random % (vocab_size - 1) + 1;
968 if (target == word)
969 continue;
970 label = 0;
971 }
972 l2 = target * layer1_size;
973 f = 0;
974 for (c = 0; c < layer1_size; c++)
975 f += neu1[c] * syn1neg[c + l2];
976 if (f > MAX_EXP)
977 g = (label - 1) * alpha;
978 else if (f < -MAX_EXP)
979 g = (label - 0) * alpha;
980 else
981 g = (label
982 - expTable[(int) ((f + MAX_EXP)
983 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
984 * alpha;
985 for (c = 0; c < layer1_size; c++)
986 neu1e[c] += g * syn1neg[c + l2];
987 for (c = 0; c < layer1_size; c++)
988 syn1neg[c + l2] += g * neu1[c];
989 if (cap == 1)
990 for (c = 0; c < layer1_size; c++)
991 capParam(syn1neg, c + l2);
992 }
993 // Noise Contrastive Estimation
994 if (nce > 0)
995 for (d = 0; d < nce + 1; d++) {
996 if (d == 0) {
997 target = word;
998 label = 1;
999 } else {
1000 next_random = next_random
1001 * (unsigned long long) 25214903917 + 11;
1002 if (word_to_group != NULL
1003 && word_to_group[word] != -1) {
1004 target = word;
1005 while (target == word) {
1006 target = group_to_table[word_to_group[word]
1007 * table_size
1008 + (next_random >> 16) % table_size];
1009 next_random = next_random
1010 * (unsigned long long) 25214903917
1011 + 11;
1012 }
1013 } else {
1014 target =
1015 table[(next_random >> 16) % table_size];
1016 }
1017 if (target == 0)
1018 target = next_random % (vocab_size - 1) + 1;
1019 if (target == word)
1020 continue;
1021 label = 0;
1022 }
1023 l2 = target * layer1_size;
1024 f = 0;
1025
1026 for (c = 0; c < layer1_size; c++)
1027 f += neu1[c] * syn1nce[c + l2];
1028 if (f > MAX_EXP)
1029 g = (label - 1) * alpha;
1030 else if (f < -MAX_EXP)
1031 g = (label - 0) * alpha;
1032 else {
1033 f = exp(f);
1034 g =
1035 (label
1036 - f
1037 / (noise_distribution[target]
1038 * nce + f)) * alpha;
1039 }
1040 for (c = 0; c < layer1_size; c++)
1041 neu1e[c] += g * syn1nce[c + l2];
1042 for (c = 0; c < layer1_size; c++)
1043 syn1nce[c + l2] += g * neu1[c];
1044 if (cap == 1)
1045 for (c = 0; c < layer1_size; c++)
1046 capParam(syn1nce, c + l2);
1047 }
1048 // hidden -> in
1049 for (a = b; a < window * 2 + 1 - b; a++)
1050 if (a != window) {
1051 c = sentence_position - window + a;
1052 if (c < 0)
1053 continue;
1054 if (c >= sentence_length)
1055 continue;
1056 last_word = sen[c];
1057 if (last_word == -1)
1058 continue;
1059 for (c = 0; c < layer1_size; c++)
1060 syn0[c + last_word * layer1_size] += neu1e[c];
1061 }
1062 }
1063 } else if (type == 1) { //train skip-gram
1064 for (a = b; a < window * 2 + 1 - b; a++)
1065 if (a != window) {
1066 c = sentence_position - window + a;
1067 if (c < 0)
1068 continue;
1069 if (c >= sentence_length)
1070 continue;
1071 last_word = sen[c];
1072 if (last_word == -1)
1073 continue;
1074 l1 = last_word * layer1_size;
1075 for (c = 0; c < layer1_size; c++)
1076 neu1e[c] = 0;
1077 // HIERARCHICAL SOFTMAX
1078 if (hs)
1079 for (d = 0; d < vocab[word].codelen; d++) {
1080 f = 0;
1081 l2 = vocab[word].point[d] * layer1_size;
1082 // Propagate hidden -> output
1083 for (c = 0; c < layer1_size; c++)
1084 f += syn0[c + l1] * syn1[c + l2];
1085 if (f <= -MAX_EXP)
1086 continue;
1087 else if (f >= MAX_EXP)
1088 continue;
1089 else
1090 f = expTable[(int) ((f + MAX_EXP)
1091 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1092 // 'g' is the gradient multiplied by the learning rate
1093 g = (1 - vocab[word].code[d] - f) * alpha;
1094 // Propagate errors output -> hidden
1095 for (c = 0; c < layer1_size; c++)
1096 neu1e[c] += g * syn1[c + l2];
1097 // Learn weights hidden -> output
1098 for (c = 0; c < layer1_size; c++)
1099 syn1[c + l2] += g * syn0[c + l1];
1100 if (cap == 1)
1101 for (c = 0; c < layer1_size; c++)
1102 capParam(syn1, c + l2);
1103 }
1104 // NEGATIVE SAMPLING
1105 if (negative > 0)
1106 for (d = 0; d < negative + 1; d++) {
1107 if (d == 0) {
1108 target = word;
1109 label = 1;
1110 } else {
1111 next_random = next_random
1112 * (unsigned long long) 25214903917 + 11;
1113 if (word_to_group != NULL
1114 && word_to_group[word] != -1) {
1115 target = word;
1116 while (target == word) {
1117 target =
1118 group_to_table[word_to_group[word]
1119 * table_size
1120 + (next_random >> 16)
1121 % table_size];
1122 next_random =
1123 next_random
1124 * (unsigned long long) 25214903917
1125 + 11;
1126 }
1127 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1128 } else {
1129 target = table[(next_random >> 16)
1130 % table_size];
1131 }
1132 if (target == 0)
1133 target = next_random % (vocab_size - 1) + 1;
1134 if (target == word)
1135 continue;
1136 label = 0;
1137 }
1138 l2 = target * layer1_size;
1139 f = 0;
1140 for (c = 0; c < layer1_size; c++)
1141 f += syn0[c + l1] * syn1neg[c + l2];
1142 if (f > MAX_EXP)
1143 g = (label - 1) * alpha;
1144 else if (f < -MAX_EXP)
1145 g = (label - 0) * alpha;
1146 else
1147 g =
1148 (label
1149 - expTable[(int) ((f + MAX_EXP)
1150 * (EXP_TABLE_SIZE
1151 / MAX_EXP / 2))])
1152 * alpha;
1153 for (c = 0; c < layer1_size; c++)
1154 neu1e[c] += g * syn1neg[c + l2];
1155 for (c = 0; c < layer1_size; c++)
1156 syn1neg[c + l2] += g * syn0[c + l1];
1157 if (cap == 1)
1158 for (c = 0; c < layer1_size; c++)
1159 capParam(syn1neg, c + l2);
1160 }
1161					// Noise Contrastive Estimation
1162 if (nce > 0)
1163 for (d = 0; d < nce + 1; d++) {
1164 if (d == 0) {
1165 target = word;
1166 label = 1;
1167 } else {
1168 next_random = next_random
1169 * (unsigned long long) 25214903917 + 11;
1170 if (word_to_group != NULL
1171 && word_to_group[word] != -1) {
1172 target = word;
1173 while (target == word) {
1174 target =
1175 group_to_table[word_to_group[word]
1176 * table_size
1177 + (next_random >> 16)
1178 % table_size];
1179 next_random =
1180 next_random
1181 * (unsigned long long) 25214903917
1182 + 11;
1183 }
1184 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1185 } else {
1186 target = table[(next_random >> 16)
1187 % table_size];
1188 }
1189 if (target == 0)
1190 target = next_random % (vocab_size - 1) + 1;
1191 if (target == word)
1192 continue;
1193 label = 0;
1194 }
1195 l2 = target * layer1_size;
1196 f = 0;
1197 for (c = 0; c < layer1_size; c++)
1198 f += syn0[c + l1] * syn1nce[c + l2];
1199 if (f > MAX_EXP)
1200 g = (label - 1) * alpha;
1201 else if (f < -MAX_EXP)
1202 g = (label - 0) * alpha;
1203 else {
1204 f = exp(f);
1205 g = (label
1206 - f
1207 / (noise_distribution[target]
1208 * nce + f)) * alpha;
1209 }
1210 for (c = 0; c < layer1_size; c++)
1211 neu1e[c] += g * syn1nce[c + l2];
1212 for (c = 0; c < layer1_size; c++)
1213 syn1nce[c + l2] += g * syn0[c + l1];
1214 if (cap == 1)
1215 for (c = 0; c < layer1_size; c++)
1216 capParam(syn1nce, c + l2);
1217 }
1218 // Learn weights input -> hidden
1219 for (c = 0; c < layer1_size; c++)
1220 syn0[c + l1] += neu1e[c];
1221 }
1222 } else if (type == 2) { //train the cwindow architecture
1223 // in -> hidden
1224 cw = 0;
1225 for (a = 0; a < window * 2 + 1; a++)
1226 if (a != window) {
1227 c = sentence_position - window + a;
1228 if (c < 0)
1229 continue;
1230 if (c >= sentence_length)
1231 continue;
1232 last_word = sen[c];
1233 if (last_word == -1)
1234 continue;
1235 window_offset = a * layer1_size;
1236 if (a > window)
1237 window_offset -= layer1_size;
1238 for (c = 0; c < layer1_size; c++)
1239 neu1[c + window_offset] += syn0[c
1240 + last_word * layer1_size];
1241 cw++;
1242 }
1243 if (cw) {
1244 if (hs)
1245 for (d = 0; d < vocab[word].codelen; d++) {
1246 f = 0;
1247 l2 = vocab[word].point[d] * window_layer_size;
1248 // Propagate hidden -> output
1249 for (c = 0; c < window_layer_size; c++)
1250 f += neu1[c] * syn1_window[c + l2];
1251 if (f <= -MAX_EXP)
1252 continue;
1253 else if (f >= MAX_EXP)
1254 continue;
1255 else
1256 f = expTable[(int) ((f + MAX_EXP)
1257 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1258 // 'g' is the gradient multiplied by the learning rate
1259 g = (1 - vocab[word].code[d] - f) * alpha;
1260 // Propagate errors output -> hidden
1261 for (c = 0; c < window_layer_size; c++)
1262 neu1e[c] += g * syn1_window[c + l2];
1263 // Learn weights hidden -> output
1264 for (c = 0; c < window_layer_size; c++)
1265 syn1_window[c + l2] += g * neu1[c];
1266 if (cap == 1)
1267 for (c = 0; c < window_layer_size; c++)
1268 capParam(syn1_window, c + l2);
1269 }
1270 // NEGATIVE SAMPLING
1271 if (negative > 0)
1272 for (d = 0; d < negative + 1; d++) {
1273 if (d == 0) {
1274 target = word;
1275 label = 1;
1276 } else {
1277 next_random = next_random
1278 * (unsigned long long) 25214903917 + 11;
1279 if (word_to_group != NULL
1280 && word_to_group[word] != -1) {
1281 target = word;
1282 while (target == word) {
1283 target = group_to_table[word_to_group[word]
1284 * table_size
1285 + (next_random >> 16) % table_size];
1286 next_random = next_random
1287 * (unsigned long long) 25214903917
1288 + 11;
1289 }
1290 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1291 } else {
1292 target =
1293 table[(next_random >> 16) % table_size];
1294 }
1295 if (target == 0)
1296 target = next_random % (vocab_size - 1) + 1;
1297 if (target == word)
1298 continue;
1299 label = 0;
1300 }
1301 l2 = target * window_layer_size;
1302 f = 0;
1303 for (c = 0; c < window_layer_size; c++)
1304 f += neu1[c] * syn1neg_window[c + l2];
1305 if (f > MAX_EXP)
1306 g = (label - 1) * alpha;
1307 else if (f < -MAX_EXP)
1308 g = (label - 0) * alpha;
1309 else
1310 g = (label
1311 - expTable[(int) ((f + MAX_EXP)
1312 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1313 * alpha;
1314 for (c = 0; c < window_layer_size; c++)
1315 neu1e[c] += g * syn1neg_window[c + l2];
1316 for (c = 0; c < window_layer_size; c++)
1317 syn1neg_window[c + l2] += g * neu1[c];
1318 if (cap == 1)
1319 for (c = 0; c < window_layer_size; c++)
1320 capParam(syn1neg_window, c + l2);
1321 }
1322 // Noise Contrastive Estimation
1323 if (nce > 0)
1324 for (d = 0; d < nce + 1; d++) {
1325 if (d == 0) {
1326 target = word;
1327 label = 1;
1328 } else {
1329 next_random = next_random
1330 * (unsigned long long) 25214903917 + 11;
1331 if (word_to_group != NULL
1332 && word_to_group[word] != -1) {
1333 target = word;
1334 while (target == word) {
1335 target = group_to_table[word_to_group[word]
1336 * table_size
1337 + (next_random >> 16) % table_size];
1338 next_random = next_random
1339 * (unsigned long long) 25214903917
1340 + 11;
1341 }
1342 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1343 } else {
1344 target =
1345 table[(next_random >> 16) % table_size];
1346 }
1347 if (target == 0)
1348 target = next_random % (vocab_size - 1) + 1;
1349 if (target == word)
1350 continue;
1351 label = 0;
1352 }
1353 l2 = target * window_layer_size;
1354 f = 0;
1355 for (c = 0; c < window_layer_size; c++)
1356 f += neu1[c] * syn1nce_window[c + l2];
1357 if (f > MAX_EXP)
1358 g = (label - 1) * alpha;
1359 else if (f < -MAX_EXP)
1360 g = (label - 0) * alpha;
1361 else {
1362 f = exp(f);
1363 g =
1364 (label
1365 - f
1366 / (noise_distribution[target]
1367 * nce + f)) * alpha;
1368 }
1369 for (c = 0; c < window_layer_size; c++)
1370 neu1e[c] += g * syn1nce_window[c + l2];
1371 for (c = 0; c < window_layer_size; c++)
1372 syn1nce_window[c + l2] += g * neu1[c];
1373 if (cap == 1)
1374 for (c = 0; c < window_layer_size; c++)
1375 capParam(syn1nce_window, c + l2);
1376 }
1377 // hidden -> in
1378 for (a = 0; a < window * 2 + 1; a++)
1379 if (a != window) {
1380 c = sentence_position - window + a;
1381 if (c < 0)
1382 continue;
1383 if (c >= sentence_length)
1384 continue;
1385 last_word = sen[c];
1386 if (last_word == -1)
1387 continue;
1388 window_offset = a * layer1_size;
1389 if (a > window)
1390 window_offset -= layer1_size;
1391 for (c = 0; c < layer1_size; c++)
1392 syn0[c + last_word * layer1_size] += neu1e[c
1393 + window_offset];
1394 }
1395 }
1396 } else if (type == 3) { //train structured skip-gram
1397 for (a = 0; a < window * 2 + 1; a++)
1398 if (a != window) {
1399 c = sentence_position - window + a;
1400 if (c < 0)
1401 continue;
1402 if (c >= sentence_length)
1403 continue;
1404 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001405 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001406 continue;
1407 l1 = last_word * layer1_size;
1408 window_offset = a * layer1_size;
1409 if (a > window)
1410 window_offset -= layer1_size;
1411 for (c = 0; c < layer1_size; c++)
1412 neu1e[c] = 0;
1413 // HIERARCHICAL SOFTMAX
1414 if (hs)
1415 for (d = 0; d < vocab[word].codelen; d++) {
1416 f = 0;
1417 l2 = vocab[word].point[d] * window_layer_size;
1418 // Propagate hidden -> output
1419 for (c = 0; c < layer1_size; c++)
1420 f += syn0[c + l1]
1421 * syn1_window[c + l2 + window_offset];
1422 if (f <= -MAX_EXP)
1423 continue;
1424 else if (f >= MAX_EXP)
1425 continue;
1426 else
1427 f = expTable[(int) ((f + MAX_EXP)
1428 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1429 // 'g' is the gradient multiplied by the learning rate
1430 g = (1 - vocab[word].code[d] - f) * alpha;
1431 // Propagate errors output -> hidden
1432 for (c = 0; c < layer1_size; c++)
1433 neu1e[c] += g
1434 * syn1_window[c + l2 + window_offset];
1435 // Learn weights hidden -> output
1436 for (c = 0; c < layer1_size; c++)
1437 syn1[c + l2 + window_offset] += g
1438 * syn0[c + l1];
1439 if (cap == 1)
1440 for (c = 0; c < layer1_size; c++)
1441 capParam(syn1, c + l2 + window_offset);
1442 }
1443 // NEGATIVE SAMPLING
1444 if (negative > 0)
1445 for (d = 0; d < negative + 1; d++) {
1446 if (d == 0) {
1447 target = word;
1448 label = 1;
1449 } else {
1450 next_random = next_random
1451 * (unsigned long long) 25214903917 + 11;
1452 if (word_to_group != NULL
1453 && word_to_group[word] != -1) {
1454 target = word;
1455 while (target == word) {
1456 target =
1457 group_to_table[word_to_group[word]
1458 * table_size
1459 + (next_random >> 16)
1460 % table_size];
1461 next_random =
1462 next_random
1463 * (unsigned long long) 25214903917
1464 + 11;
1465 }
1466 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1467 } else {
1468 target = table[(next_random >> 16)
1469 % table_size];
1470 }
1471 if (target == 0)
1472 target = next_random % (vocab_size - 1) + 1;
1473 if (target == word)
1474 continue;
1475 label = 0;
1476 }
1477 l2 = target * window_layer_size;
1478 f = 0;
1479 for (c = 0; c < layer1_size; c++)
1480 f +=
1481 syn0[c + l1]
1482 * syn1neg_window[c + l2
1483 + window_offset];
1484 if (f > MAX_EXP)
1485 g = (label - 1) * alpha;
1486 else if (f < -MAX_EXP)
1487 g = (label - 0) * alpha;
1488 else
1489 g =
1490 (label
1491 - expTable[(int) ((f + MAX_EXP)
1492 * (EXP_TABLE_SIZE
1493 / MAX_EXP / 2))])
1494 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001495 if(debug_mode > 2 && ((long long) id) == 0) {
1496 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1497 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1498 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001499 for (c = 0; c < layer1_size; c++)
1500 neu1e[c] +=
1501 g
1502 * syn1neg_window[c + l2
1503 + window_offset];
1504 for (c = 0; c < layer1_size; c++)
1505 syn1neg_window[c + l2 + window_offset] += g
1506 * syn0[c + l1];
1507 if (cap == 1)
1508 for (c = 0; c < layer1_size; c++)
1509 capParam(syn1neg_window,
1510 c + l2 + window_offset);
1511 }
1512					// Noise Contrastive Estimation
1513 if (nce > 0)
1514 for (d = 0; d < nce + 1; d++) {
1515 if (d == 0) {
1516 target = word;
1517 label = 1;
1518 } else {
1519 next_random = next_random
1520 * (unsigned long long) 25214903917 + 11;
1521 if (word_to_group != NULL
1522 && word_to_group[word] != -1) {
1523 target = word;
1524 while (target == word) {
1525 target =
1526 group_to_table[word_to_group[word]
1527 * table_size
1528 + (next_random >> 16)
1529 % table_size];
1530 next_random =
1531 next_random
1532 * (unsigned long long) 25214903917
1533 + 11;
1534 }
1535 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1536 } else {
1537 target = table[(next_random >> 16)
1538 % table_size];
1539 }
1540 if (target == 0)
1541 target = next_random % (vocab_size - 1) + 1;
1542 if (target == word)
1543 continue;
1544 label = 0;
1545 }
1546 l2 = target * window_layer_size;
1547 f = 0;
1548 for (c = 0; c < layer1_size; c++)
1549 f +=
1550 syn0[c + l1]
1551 * syn1nce_window[c + l2
1552 + window_offset];
1553 if (f > MAX_EXP)
1554 g = (label - 1) * alpha;
1555 else if (f < -MAX_EXP)
1556 g = (label - 0) * alpha;
1557 else {
1558 f = exp(f);
1559 g = (label
1560 - f
1561 / (noise_distribution[target]
1562 * nce + f)) * alpha;
1563 }
1564 for (c = 0; c < layer1_size; c++)
1565 neu1e[c] +=
1566 g
1567 * syn1nce_window[c + l2
1568 + window_offset];
1569 for (c = 0; c < layer1_size; c++)
1570 syn1nce_window[c + l2 + window_offset] += g
1571 * syn0[c + l1];
1572 if (cap == 1)
1573 for (c = 0; c < layer1_size; c++)
1574 capParam(syn1nce_window,
1575 c + l2 + window_offset);
1576 }
1577 // Learn weights input -> hidden
1578 for (c = 0; c < layer1_size; c++) {
1579 syn0[c + l1] += neu1e[c];
1580 if (syn0[c + l1] > 50)
1581 syn0[c + l1] = 50;
1582 if (syn0[c + l1] < -50)
1583 syn0[c + l1] = -50;
1584 }
1585 }
1586 } else if (type == 4) { //training senna
1587 // in -> hidden
1588 cw = 0;
1589 for (a = 0; a < window * 2 + 1; a++)
1590 if (a != window) {
1591 c = sentence_position - window + a;
1592 if (c < 0)
1593 continue;
1594 if (c >= sentence_length)
1595 continue;
1596 last_word = sen[c];
1597 if (last_word == -1)
1598 continue;
1599 window_offset = a * layer1_size;
1600 if (a > window)
1601 window_offset -= layer1_size;
1602 for (c = 0; c < layer1_size; c++)
1603 neu1[c + window_offset] += syn0[c
1604 + last_word * layer1_size];
1605 cw++;
1606 }
1607 if (cw) {
1608 for (a = 0; a < window_hidden_size; a++) {
1609 c = a * window_layer_size;
1610 for (b = 0; b < window_layer_size; b++) {
1611 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1612 }
1613 }
1614 if (hs)
1615 for (d = 0; d < vocab[word].codelen; d++) {
1616 f = 0;
1617 l2 = vocab[word].point[d] * window_hidden_size;
1618 // Propagate hidden -> output
1619 for (c = 0; c < window_hidden_size; c++)
1620 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1621 if (f <= -MAX_EXP)
1622 continue;
1623 else if (f >= MAX_EXP)
1624 continue;
1625 else
1626 f = expTable[(int) ((f + MAX_EXP)
1627 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1628 // 'g' is the gradient multiplied by the learning rate
1629 g = (1 - vocab[word].code[d] - f) * alpha;
1630 // Propagate errors output -> hidden
1631 for (c = 0; c < window_hidden_size; c++)
1632 neu2e[c] += dHardTanh(neu2[c], g) * g
1633 * syn_hidden_word[c + l2];
1634 // Learn weights hidden -> output
1635 for (c = 0; c < window_hidden_size; c++)
1636 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1637 * neu2[c];
1638 }
1639 // NEGATIVE SAMPLING
1640 if (negative > 0)
1641 for (d = 0; d < negative + 1; d++) {
1642 if (d == 0) {
1643 target = word;
1644 label = 1;
1645 } else {
1646 next_random = next_random
1647 * (unsigned long long) 25214903917 + 11;
1648 if (word_to_group != NULL
1649 && word_to_group[word] != -1) {
1650 target = word;
1651 while (target == word) {
1652 target = group_to_table[word_to_group[word]
1653 * table_size
1654 + (next_random >> 16) % table_size];
1655 next_random = next_random
1656 * (unsigned long long) 25214903917
1657 + 11;
1658 }
1659 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1660 } else {
1661 target =
1662 table[(next_random >> 16) % table_size];
1663 }
1664 if (target == 0)
1665 target = next_random % (vocab_size - 1) + 1;
1666 if (target == word)
1667 continue;
1668 label = 0;
1669 }
1670 l2 = target * window_hidden_size;
1671 f = 0;
1672 for (c = 0; c < window_hidden_size; c++)
1673 f += hardTanh(neu2[c])
1674 * syn_hidden_word_neg[c + l2];
1675 if (f > MAX_EXP)
1676 g = (label - 1) * alpha / negative;
1677 else if (f < -MAX_EXP)
1678 g = (label - 0) * alpha / negative;
1679 else
1680 g = (label
1681 - expTable[(int) ((f + MAX_EXP)
1682 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1683 * alpha / negative;
1684 for (c = 0; c < window_hidden_size; c++)
1685 neu2e[c] += dHardTanh(neu2[c], g) * g
1686 * syn_hidden_word_neg[c + l2];
1687 for (c = 0; c < window_hidden_size; c++)
1688 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1689 * g * neu2[c];
1690 }
1691 for (a = 0; a < window_hidden_size; a++)
1692 for (b = 0; b < window_layer_size; b++)
1693 neu1e[b] += neu2e[a]
1694 * syn_window_hidden[a * window_layer_size + b];
1695 for (a = 0; a < window_hidden_size; a++)
1696 for (b = 0; b < window_layer_size; b++)
1697 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1698 * neu1[b];
1699 // hidden -> in
1700 for (a = 0; a < window * 2 + 1; a++)
1701 if (a != window) {
1702 c = sentence_position - window + a;
1703 if (c < 0)
1704 continue;
1705 if (c >= sentence_length)
1706 continue;
1707 last_word = sen[c];
1708 if (last_word == -1)
1709 continue;
1710 window_offset = a * layer1_size;
1711 if (a > window)
1712 window_offset -= layer1_size;
1713 for (c = 0; c < layer1_size; c++)
1714 syn0[c + last_word * layer1_size] += neu1e[c
1715 + window_offset];
1716 }
1717 }
Marc Kupietz613edbf2018-01-11 21:38:03 +01001718 } else if(type == 5) {
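			// type 5: no vector training; each (word, context word, relative position)
			// co-occurrence is recorded in the collocator database via inc_collocator().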
1719 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1720 c = sentence_position - window + a;
1721 if (c < 0) continue;
1722 if (c >= sentence_length) continue;
1723 last_word = sen[c];
1724 if (last_word == -1) continue;
1725 inc_collocator(cdb, word, last_word, a - window);
1726 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1727 // cw++;
1728 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001729 } else {
1730			printf("unknown type %i\n", type);
1731			exit(1);
1732 }
1733 sentence_position++;
1734 if (sentence_position >= sentence_length) {
1735 sentence_length = 0;
1736 continue;
1737 }
1738 }
1739 fclose(fi);
1740 free(neu1);
1741 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001742 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001743 pthread_exit(NULL);
1744}
1745
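// For every vocabulary word from index cc on, prints the most strongly responding collocate
// for each window position (activation of syn0 against syn1neg_window), the collocate with
// the highest accumulated activation over all positions, and the top-N (word, activation,
// position) entries.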
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001746void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001747 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001748 real f, max_f, maxmax_f;
Marc Kupietzf00e7b02023-12-22 11:11:56 +01001749 real *target_sums=0L, bestf[MAX_CC], worstbest;
Marc Kupietz71996e72016-03-18 13:40:24 +01001750 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001751 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001752 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1753
1754 for (d = cc; d < vocab_size; d++) {
1755 for (b = 0; b < vocab_size; b++)
1756 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001757 for (b = 0; b < N; b++)
1758 bestf[b]=-1;
1759 worstbest = -1;
1760
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001761 maxmax_f = -1;
1762 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001763 for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001764 if (a != window) {
1765 max_f = -1;
1766 window_offset = a * layer1_size;
1767 if (a > window)
1768 window_offset -= layer1_size;
1769 for(target = 0; target < vocab_size; target ++) {
1770 if(target == d)
1771 continue;
1772 f = 0;
1773 for (c = 0; c < layer1_size; c++)
1774 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1775 if (f < -MAX_EXP)
1776 continue;
1777 else if (f > MAX_EXP)
1778 continue;
1779 else
1780 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1781 if(f > max_f) {
1782 max_f = f;
1783 max_target = target;
1784 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001785 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001786 if(f > worstbest) {
1787 for (b = 0; b < N; b++) {
1788 if (f > bestf[b]) {
1789 for (e = N - 1; e > b; e--) {
1790 bestf[e] = bestf[e - 1];
1791 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001792 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001793 }
1794 bestf[b] = f;
1795 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001796 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001797 break;
1798 }
1799 }
1800 worstbest = bestf[N-1];
1801 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001802 }
1803 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1804 if(max_f > maxmax_f) {
1805 maxmax_f = max_f;
1806 maxmax_target = max_target;
1807 }
1808 } else {
1809 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1810 }
1811 }
1812 max_f = -1;
1813 for (b = 0; b < vocab_size; b++) {
1814 if(target_sums[b] > max_f) {
1815 max_f = target_sums[b];
1816 max_target = b;
1817 }
1818 }
1819 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001820 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001821 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001822 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001823 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001824 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001825 }
1826}
1827
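// Top-level training driver: builds or loads the vocabulary, initializes the network,
// optionally shows collocations, spawns the worker threads (plus a monitor thread when
// debug_mode > 1), and finally writes word vectors or K-means word classes to output_file.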
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001828void TrainModel() {
1829 long a, b, c, d;
1830 FILE *fo;
1831 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
Marc Kupietz202723e2016-07-14 09:12:00 +02001832 threadPos = malloc(num_threads * sizeof(long long));
1833 threadIters = malloc(num_threads * sizeof(int));
1834 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001835 printf("Starting training using file %s\n", train_file);
1836 starting_alpha = alpha;
1837 if (read_vocab_file[0] != 0)
1838 ReadVocab();
1839 else
1840 LearnVocabFromTrainFile();
1841 if (save_vocab_file[0] != 0)
1842 SaveVocab();
1843 if (output_file[0] == 0)
1844 return;
1845 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001846 if(cc > 0)
1847 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001848 if (negative > 0 || nce > 0)
1849 InitUnigramTable();
1850 if (negative_classes_file[0] != 0)
1851 InitClassUnigramTable();
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001852 start = time(NULL);
1853 start_clock = clock();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001854 for (a = 0; a < num_threads; a++)
1855 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
Marc Kupietz202723e2016-07-14 09:12:00 +02001856 if(debug_mode > 1)
1857 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001858 for (a = 0; a < num_threads; a++)
1859 pthread_join(pt[a], NULL);
Marc Kupietz202723e2016-07-14 09:12:00 +02001860 if(debug_mode > 1) {
1861 pthread_join(pt[num_threads], NULL);
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001862 clock_t now = time(NULL);
1863 clock_t now_clock = clock();
1864 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
 }
Marc Kupietz613edbf2018-01-11 21:38:03 +01001865 if(type == 5) // don't save vectors for classic collocators
1866 return;
 if(debug_mode > 1) {
Marc Kupietz202723e2016-07-14 09:12:00 +02001867 printf("Saving vectors to %s ...", output_file);
1868 fflush(stdout);
1869 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001870 fo = fopen(output_file, "wb");
1871 if (classes == 0) {
1872 // Save the word vectors
1873 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1874 for (a = 0; a < vocab_size; a++) {
1875 fprintf(fo, "%s ", vocab[a].word);
1876 if (binary)
1877 for (b = 0; b < layer1_size; b++)
1878 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1879 else
1880 for (b = 0; b < layer1_size; b++)
1881 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1882 fprintf(fo, "\n");
1883 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001884 if(debug_mode > 1)
1885 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001886 } else {
1887 // Run K-means on the word vectors
1888 int clcn = classes, iter = 10, closeid;
1889 int *centcn = (int *) malloc(classes * sizeof(int));
1890 int *cl = (int *) calloc(vocab_size, sizeof(int));
1891 real closev, x;
1892 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1893 for (a = 0; a < vocab_size; a++)
1894 cl[a] = a % clcn;
1895 for (a = 0; a < iter; a++) {
1896 for (b = 0; b < clcn * layer1_size; b++)
1897 cent[b] = 0;
1898 for (b = 0; b < clcn; b++)
1899 centcn[b] = 1;
1900 for (c = 0; c < vocab_size; c++) {
1901 for (d = 0; d < layer1_size; d++)
1902 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1903 centcn[cl[c]]++;
1904 }
1905 for (b = 0; b < clcn; b++) {
1906 closev = 0;
1907 for (c = 0; c < layer1_size; c++) {
1908 cent[layer1_size * b + c] /= centcn[b];
1909 closev += cent[layer1_size * b + c]
1910 * cent[layer1_size * b + c];
1911 }
1912 closev = sqrt(closev);
1913 for (c = 0; c < layer1_size; c++)
1914 cent[layer1_size * b + c] /= closev;
1915 }
1916 for (c = 0; c < vocab_size; c++) {
1917 closev = -10;
1918 closeid = 0;
1919 for (d = 0; d < clcn; d++) {
1920 x = 0;
1921 for (b = 0; b < layer1_size; b++)
1922 x += cent[layer1_size * d + b]
1923 * syn0[c * layer1_size + b];
1924 if (x > closev) {
1925 closev = x;
1926 closeid = d;
1927 }
1928 }
1929 cl[c] = closeid;
1930 }
1931 }
1932 // Save the K-means classes
1933 for (a = 0; a < vocab_size; a++)
1934 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1935 free(centcn);
1936 free(cent);
1937 free(cl);
1938 }
1939 fclose(fo);
1940 if (save_net_file[0] != 0)
1941 SaveNet();
1942}
1943
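// Returns the index of the command line option str in argv, or -1 if it is absent;
// exits if the option is present but its value is missing.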
1944int ArgPos(char *str, int argc, char **argv) {
1945 int a;
1946 for (a = 1; a < argc; a++)
1947 if (!strcmp(str, argv[a])) {
1948 if (a == argc - 1) {
1949 printf("Argument missing for %s\n", str);
1950 exit(1);
1951 }
1952 return a;
1953 }
1954 return -1;
1955}
1956
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001957void print_help() {
Marc Kupietz83a67d42021-03-22 17:29:36 +01001958 printf("WORD VECTOR estimation toolkit v 0.9.0\n\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001959 printf("Options:\n");
1960 printf("Parameters for training:\n");
1961 printf("\t-train <file>\n");
1962 printf("\t\tUse text data from <file> to train the model\n");
1963 printf("\t-output <file>\n");
1964 printf(
1965 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1966 printf("\t-size <int>\n");
1967 printf("\t\tSet size of word vectors; default is 100\n");
1968 printf("\t-window <int>\n");
1969 printf("\t\tSet max skip length between words; default is 5\n");
1970 printf("\t-sample <float>\n");
1971 printf(
1972 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1973 printf(
1974 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1975 printf("\t-hs <int>\n");
1976 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1977 printf("\t-negative <int>\n");
1978 printf(
1979 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1980 printf("\t-negative-classes <file>\n");
1981 printf("\t\tNegative classes to sample from\n");
1982 printf("\t-nce <int>\n");
1983 printf(
1984 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1985 printf("\t-threads <int>\n");
1986 printf("\t\tUse <int> threads (default 12)\n");
1987 printf("\t-iter <int>\n");
1988 printf("\t\tRun more training iterations (default 5)\n");
1989 printf("\t-min-count <int>\n");
1990 printf(
1991 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1992 printf("\t-alpha <float>\n");
1993 printf(
1994 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1995 printf("\t-classes <int>\n");
1996 printf(
1997 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1998 printf("\t-debug <int>\n");
1999 printf(
2000 "\t\tSet the debug mode (default = 2 = more info during training)\n");
2001 printf("\t-binary <int>\n");
2002 printf(
2003 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
2004 printf("\t-save-vocab <file>\n");
2005 printf("\t\tThe vocabulary will be saved to <file>\n");
2006 printf("\t-read-vocab <file>\n");
2007 printf(
2008 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
2009 printf("\t-read-net <file>\n");
2010 printf(
2011 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
2012 printf("\t-save-net <file>\n");
2013 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietze423f732017-12-22 17:57:03 +01002014 printf("\t-magic-stop-file <file>\n");
2015 printf("\t\tIf the magic file <file> exists training will stop after the current cycle.\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002016 printf("\t-show-cc <int>\n");
2017 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002018 printf("\t-type <int>\n");
2019 printf(
Marc Kupietz613edbf2018-01-11 21:38:03 +01002020 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type, 5 for storing positional bigrams)\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002021 printf("\t-cap <int>\n");
2022 printf(
2023 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
2024 printf("\nExamples:\n");
2025 printf(
Marc Kupietz83a67d42021-03-22 17:29:36 +01002026 "./dereko2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
Marc Kupietzc7f773b2017-12-02 12:04:03 +01002027}
2028
2029int main(int argc, char **argv) {
2030 int i;
2031 setlocale(LC_ALL, "");
2032 if (argc == 1) {
2033 print_help();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002034 return 0;
2035 }
2036 output_file[0] = 0;
2037 save_vocab_file[0] = 0;
2038 read_vocab_file[0] = 0;
2039 save_net_file[0] = 0;
2040 read_net_file[0] = 0;
2041 negative_classes_file[0] = 0;
Marc Kupietzc7f773b2017-12-02 12:04:03 +01002042 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2043 print_help();
2044 return(0);
2045 }
2046 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2047 print_help();
2048 return(0);
2049 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002050 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2051 layer1_size = atoi(argv[i + 1]);
2052 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2053 strcpy(train_file, argv[i + 1]);
2054 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2055 strcpy(save_vocab_file, argv[i + 1]);
2056 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2057 strcpy(read_vocab_file, argv[i + 1]);
2058 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2059 strcpy(save_net_file, argv[i + 1]);
2060 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2061 strcpy(read_net_file, argv[i + 1]);
Marc Kupietze423f732017-12-22 17:57:03 +01002062 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2063 strcpy(magic_stop_file, argv[i + 1]);
2064 if (access(magic_stop_file, F_OK ) != -1) {
2065 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2066 exit(1);
2067 }
2068 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002069 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2070 debug_mode = atoi(argv[i + 1]);
2071 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2072 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002073 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2074 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002075 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2076 type = atoi(argv[i + 1]);
2077 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2078 strcpy(output_file, argv[i + 1]);
2079 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2080 window = atoi(argv[i + 1]);
2081 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2082 sample = atof(argv[i + 1]);
2083 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2084 hs = atoi(argv[i + 1]);
2085 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2086 negative = atoi(argv[i + 1]);
2087 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2088 strcpy(negative_classes_file, argv[i + 1]);
2089 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2090 nce = atoi(argv[i + 1]);
2091 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2092 num_threads = atoi(argv[i + 1]);
2093 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2094 iter = atoi(argv[i + 1]);
2095 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2096 min_count = atoi(argv[i + 1]);
2097 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2098 classes = atoi(argv[i + 1]);
Marc Kupietz879333c2023-12-20 11:41:09 +01002099 if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0) {
Marc Kupietz178a3c92023-12-22 15:12:27 +01002100 metadata_categories = atoi(argv[i + 1]);
2103 if (metadata_categories > MAX_METADATA_CATEGORIES) {
2104 printf("ERROR: metadata categories must be <= %d\n", MAX_METADATA_CATEGORIES);
2105 exit(1);
2106 }
2109 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002110 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2111 cap = atoi(argv[i + 1]);
2112 if (type == 0 || type == 2 || type == 4)
2113 alpha = 0.05;
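 // for classic collocator counting (type 5), disable subsampling and open the
 // collocator database that inc_collocator() will write to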
Marc Kupietz613edbf2018-01-11 21:38:03 +01002114 if (type==5) {
2115 sample = 0;
2116 cdb = open_collocatordb_for_write(output_file);
2117 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002118 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2119 alpha = atof(argv[i + 1]);
2120 vocab = (struct vocab_word *) calloc(vocab_max_size,
2121 sizeof(struct vocab_word));
2122 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2123 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2124 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2125 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2126 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
2127 }
Marc Kupietz210b9d52016-04-02 21:48:13 +02002128 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002129 TrainModel();
2130 return 0;
2131}
2132