//  Copyright 2013 Google Inc. All Rights Reserved.
//
//  Licensed under the Apache License, Version 2.0 (the "License");
//  you may not use this file except in compliance with the License.
//  You may obtain a copy of the License at
//
//      http://www.apache.org/licenses/LICENSE-2.0
//
//  Unless required by applicable law or agreed to in writing, software
//  distributed under the License is distributed on an "AS IS" BASIS,
//  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
//  See the License for the specific language governing permissions and
//  limitations under the License.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>   // for clock_t, clock() and CLOCKS_PER_SEC used below

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary

typedef float real;                   // Precision of float numbers

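// Vocabulary entry: 'cn' holds the word's occurrence count and 'word' the
// string itself; 'point', 'code' and 'codelen' store the tree path, binary
// code and code length assigned by CreateBinaryTree() for hierarchical softmax.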
struct vocab_word {
  long long cn;
  int *point;
  char *word, *code, codelen;
};

char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
struct vocab_word *vocab;
int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
    num_threads = 12, min_reduce = 1;
int *vocab_hash;
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
    classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
clock_t start;

real *syn1_window, *syn1neg_window, *syn1nce_window;
int w_offset, window_layer_size;

int window_hidden_size = 500;
real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
    *syn_hidden_word_nce;

int hs = 0, negative = 5;
const int table_size = 1e8;
int *table;

long cc = 0;

// contrastive negative sampling
char negative_classes_file[MAX_STRING];
int *word_to_group;
int *group_to_table; // group_size * table_size
int class_number;

// nce
real *noise_distribution;
int nce = 0;

// param caps
real CAP_VALUE = 50;
int cap = 0;

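// Clamps a single parameter to the range [-CAP_VALUE, CAP_VALUE]; applied
// after each weight update when cap == 1.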
void capParam(real *array, int index) {
  if (array[index] > CAP_VALUE)
    array[index] = CAP_VALUE;
  else if (array[index] < -CAP_VALUE)
    array[index] = -CAP_VALUE;
}

real hardTanh(real x) {
  if (x >= 1) {
    return 1;
  } else if (x <= -1) {
    return -1;
  } else {
    return x;
  }
}

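// Subgradient gate for hardTanh: returns 0 when the unit is saturated and the
// incoming gradient g would push it further outside [-1, 1], and 1 otherwise.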
real dHardTanh(real x, real g) {
  if (x > 1 && g > 0) {
    return 0;
  }
  if (x < -1 && g < 0) {
    return 0;
  }
  return 1;
}

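// Builds the table used to draw negative samples: each word occupies a share
// of the table proportional to count^0.75, so a single random index yields a
// draw from the smoothed unigram distribution in O(1). The same distribution
// is stored densely in noise_distribution for use by NCE.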
void InitUnigramTable() {
  int a, i;
  long long train_words_pow = 0;
  real d1, power = 0.75;
  table = (int *) malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++)
    train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (real) table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
    }
    if (i >= vocab_size)
      i = vocab_size - 1;
  }

  noise_distribution = (real *) calloc(vocab_size, sizeof(real));
  for (a = 0; a < vocab_size; a++)
    noise_distribution[a] = pow(vocab[a].cn, power) / (real) train_words_pow;
}

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (1) {
    ch = fgetc(fin);
    if (ch == EOF)   // stop on end of file (feof() only fires after a failed read)
      break;
    if (ch == 13)
      continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n')
          ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *) "</s>");
        return;
      } else
        continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1)
      a--;   // Truncate overly long words
  }
  word[a] = 0;
}

// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++)
    hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1)
      return -1;
    if (!strcmp(word, vocab[vocab_hash[hash]].word))
      return vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}

// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin))
    return -1;
  return SearchVocab(word);
}

// Adds a word to the vocabulary
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING)
    length = MAX_STRING;
  vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *) realloc(vocab,
        vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word);
  while (vocab_hash[hash] != -1)
    hash = (hash + 1) % vocab_hash_size;
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;
}

// Used later for sorting by word counts (descending)
int VocabCompare(const void *a, const void *b) {
  long long diff = ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
  return (diff > 0) - (diff < 0); // avoid truncating the long long difference to int
}

// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++)
    vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // The hash has to be recomputed, as it is no longer valid after the sort
      hash = GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1)
        hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *) realloc(vocab,
      (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}

// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < vocab_size; a++)
    if (vocab[a].cn > min_reduce) {
      vocab[b].cn = vocab[a].cn;
      vocab[b].word = vocab[a].word;
      b++;
    } else
      free(vocab[a].word);
  vocab_size = b;
  for (a = 0; a < vocab_hash_size; a++)
    vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // The hash has to be recomputed, as it is no longer valid
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1)
      hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}

// Create binary Huffman tree using the word counts.
// Frequent words will have short unique binary codes.
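// The construction below keeps two queues: pos1 walks the count-sorted leaves
// and pos2 the newly created internal nodes, so the two smallest nodes can be
// found at each step without a heap.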
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];
  long long *count = (long long *) calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *) calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *) calloc(vocab_size * 2 + 1, sizeof(long long));
  for (a = 0; a < vocab_size; a++)
    count[a] = vocab[a].cn;
  for (a = vocab_size; a < vocab_size * 2; a++)
    count[a] = 1e15;
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // The following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find the two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    count[vocab_size + a] = count[min1i] + count[min2i];
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1;
  }
  // Now assign a binary code to each vocabulary word
  for (a = 0; a < vocab_size; a++) {
    b = a;
    i = 0;
    while (1) {
      code[i] = binary[b];
      point[i] = b;
      i++;
      b = parent_node[b];
      if (b == vocab_size * 2 - 2)
        break;
    }
    vocab[a].codelen = i;
    vocab[a].point[0] = vocab_size - 2;
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];
      vocab[a].point[i - b] = point[b] - vocab_size;
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}

void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++)
    vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *) "</s>");
  while (1) {
    ReadWord(word, fin);
    if (feof(fin))
      break;
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else
      vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7)
      ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}

void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++)
    fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}

void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++)
    vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin))
      break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}

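// Builds one negative-sampling table per word class for contrastive negative
// sampling. The class file is consumed as whitespace-separated records of the
// form "class word ...", grouped by class; the third token of each record is
// read and discarded.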
void InitClassUnigramTable() {
  long long a, c;
  printf("loading class unigrams\n");
  FILE *fin = fopen(negative_classes_file, "rb");
  if (fin == NULL) {
    printf("ERROR: class file not found!\n");
    exit(1);
  }
  word_to_group = (int *) malloc(vocab_size * sizeof(int));
  for (a = 0; a < vocab_size; a++)
    word_to_group[a] = -1;
  char class[MAX_STRING];
  char prev_class[MAX_STRING];
  prev_class[0] = 0;
  char word[MAX_STRING];
  class_number = -1;
  while (1) {
    if (feof(fin))
      break;
    ReadWord(class, fin);
    ReadWord(word, fin);
    int word_index = SearchVocab(word);
    if (word_index != -1) {
      if (strcmp(class, prev_class) != 0) {
        class_number++;
        strcpy(prev_class, class);
      }
      word_to_group[word_index] = class_number;
    }
    ReadWord(word, fin); // read and discard the third field of the record
  }
  class_number++;
  fclose(fin);

  group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
  long long train_words_pow = 0;
  real d1, power = 0.75;

  for (c = 0; c < class_number; c++) {
    long long offset = c * table_size;
    train_words_pow = 0;
    for (a = 0; a < vocab_size; a++)
      if (word_to_group[a] == c)
        train_words_pow += pow(vocab[a].cn, power);
    int i = 0;
    while (i < vocab_size && word_to_group[i] != c) // check the bound before indexing
      i++;
    d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
    for (a = 0; a < table_size; a++) {
      //printf("index %lld , word %d\n", a, i);
      group_to_table[offset + a] = i;
      if (a / (real) table_size > d1) {
        i++;
        while (i < vocab_size && word_to_group[i] != c)
          i++;
        if (i >= vocab_size) {
          // ran past the end: back up to the last word of this class
          i = vocab_size - 1;
          while (i >= 0 && word_to_group[i] != c)
            i--;
        }
        d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
      }
    }
  }
}

void SaveNet() {
  if (type != 3 || negative <= 0) {
    fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
    return;
  }

  FILE *fnet = fopen(save_net_file, "wb");
  if (fnet == NULL) {
    printf("ERROR: net parameter file could not be opened for writing\n");
    exit(1);
  }
  fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
  fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
  fclose(fnet);
}

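// Allocates and initializes the network: syn0 holds the input embeddings,
// and the syn1* matrices hold the output weights of whichever objective is
// active (hierarchical softmax, negative sampling or NCE), with *_window
// variants for the position-aware architectures (type 2 cwindow, type 3
// structured skip-gram) and an extra hidden layer for type 4 (senna-style).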
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;
  window_layer_size = layer1_size * window * 2;
  a = posix_memalign((void **) &syn0, 128,
      (long long) vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) {
    printf("Memory allocation failed\n");
    exit(1);
  }

  if (hs) {
    a = posix_memalign((void **) &syn1, 128,
        (long long) vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    a = posix_memalign((void **) &syn1_window, 128,
        (long long) vocab_size * window_layer_size * sizeof(real));
    if (syn1_window == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    a = posix_memalign((void **) &syn_hidden_word, 128,
        (long long) vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }

    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < layer1_size; b++)
        syn1[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < window_layer_size; b++)
        syn1_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < window_hidden_size; b++)
        syn_hidden_word[a * window_hidden_size + b] = 0;
  }
  if (negative > 0) {
    if (type == 0) {
      a = posix_memalign((void **) &syn1neg, 128,
          (long long) vocab_size * layer1_size * sizeof(real));
      if (syn1neg == NULL) {
        printf("Memory allocation failed\n");
        exit(1);
      }
      for (a = 0; a < vocab_size; a++)
        for (b = 0; b < layer1_size; b++)
          syn1neg[a * layer1_size + b] = 0;
    } else if (type == 3) {
      a = posix_memalign((void **) &syn1neg_window, 128,
          (long long) vocab_size * window_layer_size * sizeof(real));
      if (syn1neg_window == NULL) {
        printf("Memory allocation failed\n");
        exit(1);
      }
      for (a = 0; a < vocab_size; a++)
        for (b = 0; b < window_layer_size; b++)
          syn1neg_window[a * window_layer_size + b] = 0;
    } else if (type == 4) {
      a = posix_memalign((void **) &syn_hidden_word_neg, 128,
          (long long) vocab_size * window_hidden_size * sizeof(real));
      if (syn_hidden_word_neg == NULL) {
        printf("Memory allocation failed\n");
        exit(1);
      }
      for (a = 0; a < vocab_size; a++)
        for (b = 0; b < window_hidden_size; b++)
          syn_hidden_word_neg[a * window_hidden_size + b] = 0;
    }
  }
  if (nce > 0) {
    a = posix_memalign((void **) &syn1nce, 128,
        (long long) vocab_size * layer1_size * sizeof(real));
    if (syn1nce == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    a = posix_memalign((void **) &syn1nce_window, 128,
        (long long) vocab_size * window_layer_size * sizeof(real));
    if (syn1nce_window == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    a = posix_memalign((void **) &syn_hidden_word_nce, 128,
        (long long) vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word_nce == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }

    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < layer1_size; b++)
        syn1nce[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < window_layer_size; b++)
        syn1nce_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < window_hidden_size; b++)
        syn_hidden_word_nce[a * window_hidden_size + b] = 0;
  }

  if (type == 4) {
    a = posix_memalign((void **) &syn_window_hidden, 128,
        window_hidden_size * window_layer_size * sizeof(real));
    if (syn_window_hidden == NULL) {
      printf("Memory allocation failed\n");
      exit(1);
    }
    for (a = 0; a < window_hidden_size * window_layer_size; a++) {
      next_random = next_random * (unsigned long long) 25214903917 + 11;
      syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536) - 0.5)
          / (window_hidden_size * window_layer_size);
    }
  }

  if (read_net_file[0] == 0) {
    for (a = 0; a < vocab_size; a++)
      for (b = 0; b < layer1_size; b++) {
        next_random = next_random * (unsigned long long) 25214903917 + 11;
        syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real) 65536)
            - 0.5) / layer1_size;
      }
  } else if (type == 3 && negative > 0) {
    FILE *fnet = fopen(read_net_file, "rb");
    if (fnet == NULL) {
      printf("Net parameter file not found\n");
      exit(1);
    }
    fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
    fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
    fclose(fnet);
  } else {
    fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
    exit(-1);
  }

  CreateBinaryTree();
}

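// One training thread: it seeks to its own slice of the training file, runs
// 'iter' passes over that slice, and updates the shared parameter matrices
// without locking, in the lock-free style of the original word2vec.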
void *TrainModelThread(void *id) {
  long long a, b, d, cw, word, last_word, sentence_length = 0,
      sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long) id;
  real f, g;
  clock_t now;
  int input_len_1 = layer1_size;
  int window_offset = -1;
  if (type == 2 || type == 4) {
    input_len_1 = window_layer_size;
  }
  real *neu1 = (real *) calloc(input_len_1, sizeof(real));
  real *neu1e = (real *) calloc(input_len_1, sizeof(real));

  int input_len_2 = 0;
  if (type == 4) {
    input_len_2 = window_hidden_size;
  }
  real *neu2 = (real *) calloc(input_len_2, sizeof(real));
  real *neu2e = (real *) calloc(input_len_2, sizeof(real));

  FILE *fi = fopen(train_file, "rb");
  fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f  Progress: %.2f%%  Words/thread/sec: %.2fk  ", 13,
            alpha,
            word_count_actual / (real) (iter * train_words + 1) * 100,
            word_count_actual / ((real) (now - start + 1)
                / (real) CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
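      // Decay the learning rate linearly with training progress, but never
      // let it fall below 0.0001 of its starting value.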
      alpha = starting_alpha
          * (1 - word_count_actual / (real) (iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001)
        alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        word = ReadWordIndex(fi);
        if (feof(fi))
          break;
        if (word == -1)
          continue;
        word_count++;
        if (word == 0)
          break;
        // The subsampling randomly discards frequent words while keeping the ranking the same
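        // A word w is kept with probability sqrt(t/f(w)) + t/f(w), where f(w)
        // is its relative corpus frequency and t the 'sample' threshold.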
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1)
              * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long) 25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real) 65536)
            continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH)
          break;
      }
      sentence_position = 0;
    }
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0)
        break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      fseek(fi, file_size / (long long) num_threads * (long long) id,
          SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -1)
      continue;
    for (c = 0; c < input_len_1; c++)
      neu1[c] = 0;
    for (c = 0; c < input_len_1; c++)
      neu1e[c] = 0;
    for (c = 0; c < input_len_2; c++)
      neu2[c] = 0;
    for (c = 0; c < input_len_2; c++)
      neu2e[c] = 0;
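    // Draw a random shrink b for this target word: the effective context is
    // window - b words on each side, so nearer words count more on average.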
    next_random = next_random * (unsigned long long) 25214903917 + 11;
    b = next_random % window;
    if (type == 0) {  // train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0)
            continue;
          if (c >= sentence_length)
            continue;
          last_word = sen[c];
          if (last_word == -1)
            continue;
          for (c = 0; c < layer1_size; c++)
            neu1[c] += syn0[c + last_word * layer1_size];
          cw++;
        }
      if (cw) {
        for (c = 0; c < layer1_size; c++)
          neu1[c] /= cw;
        if (hs)
          for (d = 0; d < vocab[word].codelen; d++) {
            f = 0;
            l2 = vocab[word].point[d] * layer1_size;
            // Propagate hidden -> output
            for (c = 0; c < layer1_size; c++)
              f += neu1[c] * syn1[c + l2];
            if (f <= -MAX_EXP)
              continue;
            else if (f >= MAX_EXP)
              continue;
            else
              f = expTable[(int) ((f + MAX_EXP)
                  * (EXP_TABLE_SIZE / MAX_EXP / 2))];
            // 'g' is the gradient multiplied by the learning rate
            g = (1 - vocab[word].code[d] - f) * alpha;
            // Propagate errors output -> hidden
            for (c = 0; c < layer1_size; c++)
              neu1e[c] += g * syn1[c + l2];
            // Learn weights hidden -> output
            for (c = 0; c < layer1_size; c++)
              syn1[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < layer1_size; c++)
                capParam(syn1, c + l2);
          }
        // NEGATIVE SAMPLING
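        // The positive example (d == 0) gets label 1; the remaining draws
        // come from the unigram table (or from the word's class table when
        // contrastive classes are loaded) with label 0. The update uses
        // g = (label - sigmoid(f)) * alpha, with the sigmoid value read from
        // the precomputed expTable.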
        if (negative > 0)
          for (d = 0; d < negative + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random * (unsigned long long) 25214903917 + 11;
              if (word_to_group != NULL && word_to_group[word] != -1) {
                target = word;
                while (target == word) {
                  target = group_to_table[word_to_group[word] * table_size
                      + (next_random >> 16) % table_size];
                  next_random = next_random
                      * (unsigned long long) 25214903917 + 11;
                }
                //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
              } else {
                target = table[(next_random >> 16) % table_size];
              }
              if (target == 0)
                target = next_random % (vocab_size - 1) + 1;
              if (target == word)
                continue;
              label = 0;
            }
            l2 = target * layer1_size;
            f = 0;
            for (c = 0; c < layer1_size; c++)
              f += neu1[c] * syn1neg[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha;
            else
              g = (label - expTable[(int) ((f + MAX_EXP)
                  * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
            for (c = 0; c < layer1_size; c++)
              neu1e[c] += g * syn1neg[c + l2];
            for (c = 0; c < layer1_size; c++)
              syn1neg[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < layer1_size; c++)
                capParam(syn1neg, c + l2);
          }
        // Noise Contrastive Estimation
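        // NCE gradient: g = (label - f / (k * Pn(target) + f)) * alpha, where
        // f = exp(score), Pn is the unigram noise distribution built in
        // InitUnigramTable and k = nce is the number of noise samples.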
        if (nce > 0)
          for (d = 0; d < nce + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random * (unsigned long long) 25214903917 + 11;
              if (word_to_group != NULL && word_to_group[word] != -1) {
                target = word;
                while (target == word) {
                  target = group_to_table[word_to_group[word] * table_size
                      + (next_random >> 16) % table_size];
                  next_random = next_random
                      * (unsigned long long) 25214903917 + 11;
                }
              } else {
                target = table[(next_random >> 16) % table_size];
              }
              if (target == 0)
                target = next_random % (vocab_size - 1) + 1;
              if (target == word)
                continue;
              label = 0;
            }
            l2 = target * layer1_size;
            f = 0;

            for (c = 0; c < layer1_size; c++)
              f += neu1[c] * syn1nce[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha;
            else {
              f = exp(f);
              g = (label - f / (noise_distribution[target] * nce + f)) * alpha;
            }
            for (c = 0; c < layer1_size; c++)
              neu1e[c] += g * syn1nce[c + l2];
            for (c = 0; c < layer1_size; c++)
              syn1nce[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < layer1_size; c++)
                capParam(syn1nce, c + l2);
          }
        // hidden -> in
        for (a = b; a < window * 2 + 1 - b; a++)
          if (a != window) {
            c = sentence_position - window + a;
            if (c < 0)
              continue;
            if (c >= sentence_length)
              continue;
            last_word = sen[c];
            if (last_word == -1)
              continue;
            for (c = 0; c < layer1_size; c++)
              syn0[c + last_word * layer1_size] += neu1e[c];
          }
      }
    } else if (type == 1) {  // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0)
            continue;
          if (c >= sentence_length)
            continue;
          last_word = sen[c];
          if (last_word == -1)
            continue;
          l1 = last_word * layer1_size;
          for (c = 0; c < layer1_size; c++)
            neu1e[c] = 0;
          // HIERARCHICAL SOFTMAX
          if (hs)
            for (d = 0; d < vocab[word].codelen; d++) {
              f = 0;
              l2 = vocab[word].point[d] * layer1_size;
              // Propagate hidden -> output
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1] * syn1[c + l2];
              if (f <= -MAX_EXP)
                continue;
              else if (f >= MAX_EXP)
                continue;
              else
                f = expTable[(int) ((f + MAX_EXP)
                    * (EXP_TABLE_SIZE / MAX_EXP / 2))];
              // 'g' is the gradient multiplied by the learning rate
              g = (1 - vocab[word].code[d] - f) * alpha;
              // Propagate errors output -> hidden
              for (c = 0; c < layer1_size; c++)
                neu1e[c] += g * syn1[c + l2];
              // Learn weights hidden -> output
              for (c = 0; c < layer1_size; c++)
                syn1[c + l2] += g * syn0[c + l1];
              if (cap == 1)
                for (c = 0; c < layer1_size; c++)
                  capParam(syn1, c + l2);
            }
          // NEGATIVE SAMPLING
          if (negative > 0)
            for (d = 0; d < negative + 1; d++) {
              if (d == 0) {
                target = word;
                label = 1;
              } else {
                next_random = next_random * (unsigned long long) 25214903917 + 11;
                if (word_to_group != NULL && word_to_group[word] != -1) {
                  target = word;
                  while (target == word) {
                    target = group_to_table[word_to_group[word] * table_size
                        + (next_random >> 16) % table_size];
                    next_random = next_random
                        * (unsigned long long) 25214903917 + 11;
                  }
                  //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
                } else {
                  target = table[(next_random >> 16) % table_size];
                }
                if (target == 0)
                  target = next_random % (vocab_size - 1) + 1;
                if (target == word)
                  continue;
                label = 0;
              }
              l2 = target * layer1_size;
              f = 0;
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1] * syn1neg[c + l2];
              if (f > MAX_EXP)
                g = (label - 1) * alpha;
              else if (f < -MAX_EXP)
                g = (label - 0) * alpha;
              else
                g = (label - expTable[(int) ((f + MAX_EXP)
                    * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
              for (c = 0; c < layer1_size; c++)
                neu1e[c] += g * syn1neg[c + l2];
              for (c = 0; c < layer1_size; c++)
                syn1neg[c + l2] += g * syn0[c + l1];
              if (cap == 1)
                for (c = 0; c < layer1_size; c++)
                  capParam(syn1neg, c + l2);
            }
          // Noise Contrastive Estimation
          if (nce > 0)
            for (d = 0; d < nce + 1; d++) {
              if (d == 0) {
                target = word;
                label = 1;
              } else {
                next_random = next_random * (unsigned long long) 25214903917 + 11;
                if (word_to_group != NULL && word_to_group[word] != -1) {
                  target = word;
                  while (target == word) {
                    target = group_to_table[word_to_group[word] * table_size
                        + (next_random >> 16) % table_size];
                    next_random = next_random
                        * (unsigned long long) 25214903917 + 11;
                  }
                  //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
                } else {
                  target = table[(next_random >> 16) % table_size];
                }
                if (target == 0)
                  target = next_random % (vocab_size - 1) + 1;
                if (target == word)
                  continue;
                label = 0;
              }
              l2 = target * layer1_size;
              f = 0;
              for (c = 0; c < layer1_size; c++)
                f += syn0[c + l1] * syn1nce[c + l2];
              if (f > MAX_EXP)
                g = (label - 1) * alpha;
              else if (f < -MAX_EXP)
                g = (label - 0) * alpha;
              else {
                f = exp(f);
                g = (label - f / (noise_distribution[target] * nce + f)) * alpha;
              }
              for (c = 0; c < layer1_size; c++)
                neu1e[c] += g * syn1nce[c + l2];
              for (c = 0; c < layer1_size; c++)
                syn1nce[c + l2] += g * syn0[c + l1];
              if (cap == 1)
                for (c = 0; c < layer1_size; c++)
                  capParam(syn1nce, c + l2);
            }
          // Learn weights input -> hidden
          for (c = 0; c < layer1_size; c++)
            syn0[c + l1] += neu1e[c];
        }
    } else if (type == 2) { // train the cwindow architecture
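      // cwindow: instead of averaging, the context embeddings are concatenated
      // into neu1 (window_layer_size = layer1_size * window * 2); window_offset
      // places each context word at its positional slot, skipping the center.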
      // in -> hidden
      cw = 0;
      for (a = 0; a < window * 2 + 1; a++)
        if (a != window) {
          c = sentence_position - window + a;
          if (c < 0)
            continue;
          if (c >= sentence_length)
            continue;
          last_word = sen[c];
          if (last_word == -1)
            continue;
          window_offset = a * layer1_size;
          if (a > window)
            window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++)
            neu1[c + window_offset] += syn0[c + last_word * layer1_size];
          cw++;
        }
      if (cw) {
        if (hs)
          for (d = 0; d < vocab[word].codelen; d++) {
            f = 0;
            l2 = vocab[word].point[d] * window_layer_size;
            // Propagate hidden -> output
            for (c = 0; c < window_layer_size; c++)
              f += neu1[c] * syn1_window[c + l2];
            if (f <= -MAX_EXP)
              continue;
            else if (f >= MAX_EXP)
              continue;
            else
              f = expTable[(int) ((f + MAX_EXP)
                  * (EXP_TABLE_SIZE / MAX_EXP / 2))];
            // 'g' is the gradient multiplied by the learning rate
            g = (1 - vocab[word].code[d] - f) * alpha;
            // Propagate errors output -> hidden
            for (c = 0; c < window_layer_size; c++)
              neu1e[c] += g * syn1_window[c + l2];
            // Learn weights hidden -> output
            for (c = 0; c < window_layer_size; c++)
              syn1_window[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < window_layer_size; c++)
                capParam(syn1_window, c + l2);
          }
        // NEGATIVE SAMPLING
        if (negative > 0)
          for (d = 0; d < negative + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random * (unsigned long long) 25214903917 + 11;
              if (word_to_group != NULL && word_to_group[word] != -1) {
                target = word;
                while (target == word) {
                  target = group_to_table[word_to_group[word] * table_size
                      + (next_random >> 16) % table_size];
                  next_random = next_random
                      * (unsigned long long) 25214903917 + 11;
                }
                //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
              } else {
                target = table[(next_random >> 16) % table_size];
              }
              if (target == 0)
                target = next_random % (vocab_size - 1) + 1;
              if (target == word)
                continue;
              label = 0;
            }
            l2 = target * window_layer_size;
            f = 0;
            for (c = 0; c < window_layer_size; c++)
              f += neu1[c] * syn1neg_window[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha;
            else
              g = (label - expTable[(int) ((f + MAX_EXP)
                  * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
            for (c = 0; c < window_layer_size; c++)
              neu1e[c] += g * syn1neg_window[c + l2];
            for (c = 0; c < window_layer_size; c++)
              syn1neg_window[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < window_layer_size; c++)
                capParam(syn1neg_window, c + l2);
          }
        // Noise Contrastive Estimation
        if (nce > 0)
          for (d = 0; d < nce + 1; d++) {
            if (d == 0) {
              target = word;
              label = 1;
            } else {
              next_random = next_random * (unsigned long long) 25214903917 + 11;
              if (word_to_group != NULL && word_to_group[word] != -1) {
                target = word;
                while (target == word) {
                  target = group_to_table[word_to_group[word] * table_size
                      + (next_random >> 16) % table_size];
                  next_random = next_random
                      * (unsigned long long) 25214903917 + 11;
                }
                //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
              } else {
                target = table[(next_random >> 16) % table_size];
              }
              if (target == 0)
                target = next_random % (vocab_size - 1) + 1;
              if (target == word)
                continue;
              label = 0;
            }
            l2 = target * window_layer_size;
            f = 0;
            for (c = 0; c < window_layer_size; c++)
              f += neu1[c] * syn1nce_window[c + l2];
            if (f > MAX_EXP)
              g = (label - 1) * alpha;
            else if (f < -MAX_EXP)
              g = (label - 0) * alpha;
            else {
              f = exp(f);
              g = (label - f / (noise_distribution[target] * nce + f)) * alpha;
            }
            for (c = 0; c < window_layer_size; c++)
              neu1e[c] += g * syn1nce_window[c + l2];
            for (c = 0; c < window_layer_size; c++)
              syn1nce_window[c + l2] += g * neu1[c];
            if (cap == 1)
              for (c = 0; c < window_layer_size; c++)
                capParam(syn1nce_window, c + l2);
          }
        // hidden -> in
        for (a = 0; a < window * 2 + 1; a++)
          if (a != window) {
            c = sentence_position - window + a;
            if (c < 0)
              continue;
            if (c >= sentence_length)
              continue;
            last_word = sen[c];
            if (last_word == -1)
              continue;
            window_offset = a * layer1_size;
            if (a > window)
              window_offset -= layer1_size;
            for (c = 0; c < layer1_size; c++)
              syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
          }
      }
    } else if (type == 3) {  // train structured skip-gram
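      // Structured skip-gram: like skip-gram, but each relative position in
      // the window has its own block of output weights, selected via
      // window_offset, so word order is preserved.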
|  | 1252 | for (a = 0; a < window * 2 + 1; a++) | 
|  | 1253 | if (a != window) { | 
|  | 1254 | c = sentence_position - window + a; | 
|  | 1255 | if (c < 0) | 
|  | 1256 | continue; | 
|  | 1257 | if (c >= sentence_length) | 
|  | 1258 | continue; | 
|  | 1259 | last_word = sen[c]; | 
|  | 1260 | if (last_word == -1) | 
|  | 1261 | continue; | 
|  | 1262 | l1 = last_word * layer1_size; | 
|  | 1263 | window_offset = a * layer1_size; | 
|  | 1264 | if (a > window) | 
|  | 1265 | window_offset -= layer1_size; | 
|  | 1266 | for (c = 0; c < layer1_size; c++) | 
|  | 1267 | neu1e[c] = 0; | 
|  | 1268 | // HIERARCHICAL SOFTMAX | 
|  | 1269 | if (hs) | 
|  | 1270 | for (d = 0; d < vocab[word].codelen; d++) { | 
|  | 1271 | f = 0; | 
|  | 1272 | l2 = vocab[word].point[d] * window_layer_size; | 
|  | 1273 | // Propagate hidden -> output | 
|  | 1274 | for (c = 0; c < layer1_size; c++) | 
|  | 1275 | f += syn0[c + l1] | 
|  | 1276 | * syn1_window[c + l2 + window_offset]; | 
|  | 1277 | if (f <= -MAX_EXP) | 
|  | 1278 | continue; | 
|  | 1279 | else if (f >= MAX_EXP) | 
|  | 1280 | continue; | 
|  | 1281 | else | 
|  | 1282 | f = expTable[(int) ((f + MAX_EXP) | 
|  | 1283 | * (EXP_TABLE_SIZE / MAX_EXP / 2))]; | 
|  | 1284 | // 'g' is the gradient multiplied by the learning rate | 
|  | 1285 | g = (1 - vocab[word].code[d] - f) * alpha; | 
|  | 1286 | // Propagate errors output -> hidden | 
|  | 1287 | for (c = 0; c < layer1_size; c++) | 
|  | 1288 | neu1e[c] += g | 
|  | 1289 | * syn1_window[c + l2 + window_offset]; | 
|  | 1290 | // Learn weights hidden -> output | 
|  | 1291 | for (c = 0; c < layer1_size; c++) | 
|  | 1292 | syn1[c + l2 + window_offset] += g | 
|  | 1293 | * syn0[c + l1]; | 
|  | 1294 | if (cap == 1) | 
|  | 1295 | for (c = 0; c < layer1_size; c++) | 
|  | 1296 | capParam(syn1, c + l2 + window_offset); | 
|  | 1297 | } | 
|  | 1298 | // NEGATIVE SAMPLING | 
|  | 1299 | if (negative > 0) | 
|  | 1300 | for (d = 0; d < negative + 1; d++) { | 
|  | 1301 | if (d == 0) { | 
|  | 1302 | target = word; | 
|  | 1303 | label = 1; | 
|  | 1304 | } else { | 
|  | 1305 | next_random = next_random | 
|  | 1306 | * (unsigned long long) 25214903917 + 11; | 
|  | 1307 | if (word_to_group != NULL | 
|  | 1308 | && word_to_group[word] != -1) { | 
|  | 1309 | target = word; | 
|  | 1310 | while (target == word) { | 
|  | 1311 | target = | 
|  | 1312 | group_to_table[word_to_group[word] | 
|  | 1313 | * table_size | 
|  | 1314 | + (next_random >> 16) | 
|  | 1315 | % table_size]; | 
|  | 1316 | next_random = | 
|  | 1317 | next_random | 
|  | 1318 | * (unsigned long long) 25214903917 | 
|  | 1319 | + 11; | 
|  | 1320 | } | 
|  | 1321 | //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word); | 
|  | 1322 | } else { | 
|  | 1323 | target = table[(next_random >> 16) | 
|  | 1324 | % table_size]; | 
|  | 1325 | } | 
|  | 1326 | if (target == 0) | 
|  | 1327 | target = next_random % (vocab_size - 1) + 1; | 
|  | 1328 | if (target == word) | 
|  | 1329 | continue; | 
|  | 1330 | label = 0; | 
|  | 1331 | } | 
|  | 1332 | l2 = target * window_layer_size; | 
|  | 1333 | f = 0; | 
|  | 1334 | for (c = 0; c < layer1_size; c++) | 
|  | 1335 | f += | 
|  | 1336 | syn0[c + l1] | 
|  | 1337 | * syn1neg_window[c + l2 | 
|  | 1338 | + window_offset]; | 
|  | 1339 | if (f > MAX_EXP) | 
|  | 1340 | g = (label - 1) * alpha; | 
|  | 1341 | else if (f < -MAX_EXP) | 
|  | 1342 | g = (label - 0) * alpha; | 
|  | 1343 | else | 
|  | 1344 | g = | 
|  | 1345 | (label | 
|  | 1346 | - expTable[(int) ((f + MAX_EXP) | 
|  | 1347 | * (EXP_TABLE_SIZE | 
|  | 1348 | / MAX_EXP / 2))]) | 
|  | 1349 | * alpha; | 
| Marc Kupietz | 6b1f2ba | 2016-03-17 21:17:42 +0100 | [diff] [blame^] | 1350 | if(debug_mode > 2 && ((long long) id) == 0) { | 
|  | 1351 | printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target); | 
|  | 1352 | printf("label %lld, a %lld, gain %.4f\n", label, a-window, g); | 
|  | 1353 | } | 
| Marc Kupietz | d6f9c71 | 2016-03-16 11:50:56 +0100 | [diff] [blame] | 1354 | for (c = 0; c < layer1_size; c++) | 
|  | 1355 | neu1e[c] += | 
|  | 1356 | g | 
|  | 1357 | * syn1neg_window[c + l2 | 
|  | 1358 | + window_offset]; | 
|  | 1359 | for (c = 0; c < layer1_size; c++) | 
|  | 1360 | syn1neg_window[c + l2 + window_offset] += g | 
|  | 1361 | * syn0[c + l1]; | 
|  | 1362 | if (cap == 1) | 
|  | 1363 | for (c = 0; c < layer1_size; c++) | 
|  | 1364 | capParam(syn1neg_window, | 
|  | 1365 | c + l2 + window_offset); | 
|  | 1366 | } | 
|  | 1367 | // Noise Constrastive Estimation | 
|  | 1368 | if (nce > 0) | 
|  | 1369 | for (d = 0; d < nce + 1; d++) { | 
|  | 1370 | if (d == 0) { | 
|  | 1371 | target = word; | 
|  | 1372 | label = 1; | 
|  | 1373 | } else { | 
|  | 1374 | next_random = next_random | 
|  | 1375 | * (unsigned long long) 25214903917 + 11; | 
|  | 1376 | if (word_to_group != NULL | 
|  | 1377 | && word_to_group[word] != -1) { | 
|  | 1378 | target = word; | 
|  | 1379 | while (target == word) { | 
|  | 1380 | target = | 
|  | 1381 | group_to_table[word_to_group[word] | 
|  | 1382 | * table_size | 
|  | 1383 | + (next_random >> 16) | 
|  | 1384 | % table_size]; | 
|  | 1385 | next_random = | 
|  | 1386 | next_random | 
|  | 1387 | * (unsigned long long) 25214903917 | 
|  | 1388 | + 11; | 
|  | 1389 | } | 
|  | 1390 | //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word); | 
|  | 1391 | } else { | 
|  | 1392 | target = table[(next_random >> 16) | 
|  | 1393 | % table_size]; | 
|  | 1394 | } | 
|  | 1395 | if (target == 0) | 
|  | 1396 | target = next_random % (vocab_size - 1) + 1; | 
|  | 1397 | if (target == word) | 
|  | 1398 | continue; | 
|  | 1399 | label = 0; | 
|  | 1400 | } | 
|  | 1401 | l2 = target * window_layer_size; | 
|  | 1402 | f = 0; | 
|  | 1403 | for (c = 0; c < layer1_size; c++) | 
|  | 1404 | f += | 
|  | 1405 | syn0[c + l1] | 
|  | 1406 | * syn1nce_window[c + l2 | 
|  | 1407 | + window_offset]; | 
|  | 1408 | if (f > MAX_EXP) | 
|  | 1409 | g = (label - 1) * alpha; | 
|  | 1410 | else if (f < -MAX_EXP) | 
|  | 1411 | g = (label - 0) * alpha; | 
|  | 1412 | else { | 
|  | 1413 | f = exp(f); | 
|  | 1414 | g = (label | 
|  | 1415 | - f | 
|  | 1416 | / (noise_distribution[target] | 
|  | 1417 | * nce + f)) * alpha; | 
|  | 1418 | } | 
|  | 1419 | for (c = 0; c < layer1_size; c++) | 
|  | 1420 | neu1e[c] += | 
|  | 1421 | g | 
|  | 1422 | * syn1nce_window[c + l2 | 
|  | 1423 | + window_offset]; | 
|  | 1424 | for (c = 0; c < layer1_size; c++) | 
|  | 1425 | syn1nce_window[c + l2 + window_offset] += g | 
|  | 1426 | * syn0[c + l1]; | 
|  | 1427 | if (cap == 1) | 
|  | 1428 | for (c = 0; c < layer1_size; c++) | 
|  | 1429 | capParam(syn1nce_window, | 
|  | 1430 | c + l2 + window_offset); | 
|  | 1431 | } | 
|  | 1432 | // Learn weights input -> hidden | 
					for (c = 0; c < layer1_size; c++) {
						syn0[c + l1] += neu1e[c];
						if (syn0[c + l1] > 50)
							syn0[c + l1] = 50;
						if (syn0[c + l1] < -50)
							syn0[c + l1] = -50;
					}
				}
		} else if (type == 4) { // training senna
			// in -> hidden
			cw = 0;
			for (a = 0; a < window * 2 + 1; a++)
				if (a != window) {
					c = sentence_position - window + a;
					if (c < 0)
						continue;
					if (c >= sentence_length)
						continue;
					last_word = sen[c];
					if (last_word == -1)
						continue;
					window_offset = a * layer1_size;
					if (a > window)
						window_offset -= layer1_size;
					for (c = 0; c < layer1_size; c++)
						neu1[c + window_offset] += syn0[c + last_word * layer1_size];
					cw++;
				}
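			// neu1 concatenates the context vectors: each window position has its
			// own layer1_size slot (window_offset), with the centre word's slot
			// skipped, so the hidden layer can distinguish word order.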
			if (cw) {
				for (a = 0; a < window_hidden_size; a++) {
					c = a * window_layer_size;
					for (b = 0; b < window_layer_size; b++) {
						neu2[a] += syn_window_hidden[c + b] * neu1[b];
					}
				}
				if (hs)
					for (d = 0; d < vocab[word].codelen; d++) {
						f = 0;
						l2 = vocab[word].point[d] * window_hidden_size;
						// Propagate hidden -> output
						for (c = 0; c < window_hidden_size; c++)
							f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
						if (f <= -MAX_EXP)
							continue;
						else if (f >= MAX_EXP)
							continue;
						else
							f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
						// 'g' is the gradient multiplied by the learning rate
						g = (1 - vocab[word].code[d] - f) * alpha;
						// Propagate errors output -> hidden
						for (c = 0; c < window_hidden_size; c++)
							neu2e[c] += dHardTanh(neu2[c], g) * g * syn_hidden_word[c + l2];
						// Learn weights hidden -> output
						for (c = 0; c < window_hidden_size; c++)
							syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g * neu2[c];
					}
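				// dHardTanh(neu2[c], g), used above and below, appears to act as a
				// 0/1 gradient gate for the hard-tanh activation: updates pass only
				// where the unit is not saturated against the update direction.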
				// NEGATIVE SAMPLING
				if (negative > 0)
					for (d = 0; d < negative + 1; d++) {
						if (d == 0) {
							target = word;
							label = 1;
						} else {
							next_random = next_random * (unsigned long long) 25214903917 + 11;
							if (word_to_group != NULL && word_to_group[word] != -1) {
								target = word;
								while (target == word) {
									target = group_to_table[word_to_group[word] * table_size
											+ (next_random >> 16) % table_size];
									next_random = next_random * (unsigned long long) 25214903917 + 11;
								}
								//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
							} else {
								target = table[(next_random >> 16) % table_size];
							}
							if (target == 0)
								target = next_random % (vocab_size - 1) + 1;
							if (target == word)
								continue;
							label = 0;
						}
						l2 = target * window_hidden_size;
						f = 0;
						for (c = 0; c < window_hidden_size; c++)
							f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
						if (f > MAX_EXP)
							g = (label - 1) * alpha / negative;
						else if (f < -MAX_EXP)
							g = (label - 0) * alpha / negative;
						else
							g = (label - expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))])
									* alpha / negative;
						for (c = 0; c < window_hidden_size; c++)
							neu2e[c] += dHardTanh(neu2[c], g) * g * syn_hidden_word_neg[c + l2];
						for (c = 0; c < window_hidden_size; c++)
							syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g) * g * neu2[c];
					}
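				// Note: unlike the branches above, g here is scaled by 1/negative,
				// averaging rather than summing the noise-sample updates.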
				for (a = 0; a < window_hidden_size; a++)
					for (b = 0; b < window_layer_size; b++)
						neu1e[b] += neu2e[a] * syn_window_hidden[a * window_layer_size + b];
				for (a = 0; a < window_hidden_size; a++)
					for (b = 0; b < window_layer_size; b++)
						syn_window_hidden[a * window_layer_size + b] += neu2e[a] * neu1[b];
				// hidden -> in
				for (a = 0; a < window * 2 + 1; a++)
					if (a != window) {
						c = sentence_position - window + a;
						if (c < 0)
							continue;
						if (c >= sentence_length)
							continue;
						last_word = sen[c];
						if (last_word == -1)
							continue;
						window_offset = a * layer1_size;
						if (a > window)
							window_offset -= layer1_size;
						for (c = 0; c < layer1_size; c++)
							syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
					}
			}
		} else {
			printf("unknown type %i\n", type);
			exit(1);
		}
		sentence_position++;
		if (sentence_position >= sentence_length) {
			sentence_length = 0;
			continue;
		}
	}
	fclose(fi);
	free(neu1);
	free(neu1e);
	free(neu2);  // neu2/neu2e are assumed to be heap-allocated like neu1/neu1e
	free(neu2e);
	pthread_exit(NULL);
}

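// For every word from rank cc (-show-cc) upwards, print its most probable
// collocator at each window position, scored with the position-wise output
// weights syn1neg_window. This presupposes a window-based net (e.g. type 3,
// structured skip-gram) trained with negative sampling, typically loaded via
// -read-vocab and -read-net.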
void ShowCollocations() {
	long a, b, c, d, window_offset, target, max_target = 0, maxmax_target;
	real f, max_f, maxmax_f;
	real *target_sums;
	a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
	if (a != 0 || target_sums == NULL) {
		printf("Memory allocation failed\n");
		exit(1);
	}

	for (d = cc; d < vocab_size; d++) {
		for (b = 0; b < vocab_size; b++)
			target_sums[b] = 0;
		maxmax_f = -1;
		maxmax_target = 0;
		for (a = 0; a < window * 2 + 1; a++) {
			if (a != window) {
				max_f = -1;
				window_offset = a * layer1_size;
				if (a > window)
					window_offset -= layer1_size;
				for (target = 0; target < vocab_size; target++) {
					if (target == d)
						continue;
					f = 0;
					for (c = 0; c < layer1_size; c++)
						f += syn0[d * layer1_size + c]
								* syn1neg_window[target * window_layer_size + window_offset + c];
					if (f < -MAX_EXP)
						continue;
					else if (f > MAX_EXP)
						continue;
					else
						f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
					if (f > max_f) {
						max_f = f;
						max_target = target;
					}
					target_sums[target] += f;
				}
				printf("%s (%.2f) ", vocab[max_target].word, max_f);
				if (max_f > maxmax_f) {
					maxmax_f = max_f;
					maxmax_target = max_target;
				}
			} else {
				printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
			}
		}
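		// besides the per-position maxima, also report the collocator with the
		// highest activation summed over all window positions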
		max_f = -1;
		for (b = 0; b < vocab_size; b++) {
			if (target_sums[b] > max_f) {
				max_f = target_sums[b];
				max_target = b;
			}
		}
		printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
				vocab[max_target].word, max_f / window / 2,
				vocab[maxmax_target].word, maxmax_f);
	}
	free(target_sums);
}
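// A hypothetical invocation (file names are placeholders):
//   ./word2vec -train corpus.txt -output vecs.bin -read-vocab vocab.txt \
//     -read-net net.bin -type 3 -show-cc 1
// Note that TrainModel() returns before ShowCollocations() unless -output is set.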

void TrainModel() {
	long a, b, c, d;
	FILE *fo;
	pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
	printf("Starting training using file %s\n", train_file);
	starting_alpha = alpha;
	if (read_vocab_file[0] != 0)
		ReadVocab();
	else
		LearnVocabFromTrainFile();
	if (save_vocab_file[0] != 0)
		SaveVocab();
	if (output_file[0] == 0)
		return;
	InitNet();
	if (cc > 0)
		ShowCollocations();
	if (negative > 0 || nce > 0)
		InitUnigramTable();
	if (negative_classes_file[0] != 0)
		InitClassUnigramTable();
	start = clock();
	for (a = 0; a < num_threads; a++)
		pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
	for (a = 0; a < num_threads; a++)
		pthread_join(pt[a], NULL);
	fo = fopen(output_file, "wb");
	if (fo == NULL) {
		printf("Cannot open %s for writing\n", output_file);
		exit(1);
	}
	if (classes == 0) {
		// Save the word vectors
		fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
		for (a = 0; a < vocab_size; a++) {
			fprintf(fo, "%s ", vocab[a].word);
			if (binary)
				for (b = 0; b < layer1_size; b++)
					fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
			else
				for (b = 0; b < layer1_size; b++)
					fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
			fprintf(fo, "\n");
		}
	} else {
		// Run K-means on the word vectors
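		// (spherical k-means: centroids are L2-normalised after every update
		// round, so words are reassigned by maximum dot product, which amounts
		// to cosine similarity)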
		int clcn = classes, iter = 10, closeid;
		int *centcn = (int *) malloc(classes * sizeof(int));
		int *cl = (int *) calloc(vocab_size, sizeof(int));
		real closev, x;
		real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
		for (a = 0; a < vocab_size; a++)
			cl[a] = a % clcn;
		for (a = 0; a < iter; a++) {
			for (b = 0; b < clcn * layer1_size; b++)
				cent[b] = 0;
			for (b = 0; b < clcn; b++)
				centcn[b] = 1;
			for (c = 0; c < vocab_size; c++) {
				for (d = 0; d < layer1_size; d++)
					cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
				centcn[cl[c]]++;
			}
			for (b = 0; b < clcn; b++) {
				closev = 0;
				for (c = 0; c < layer1_size; c++) {
					cent[layer1_size * b + c] /= centcn[b];
					closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
				}
				closev = sqrt(closev);
				for (c = 0; c < layer1_size; c++)
					cent[layer1_size * b + c] /= closev;
			}
			for (c = 0; c < vocab_size; c++) {
				closev = -10;
				closeid = 0;
				for (d = 0; d < clcn; d++) {
					x = 0;
					for (b = 0; b < layer1_size; b++)
						x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
					if (x > closev) {
						closev = x;
						closeid = d;
					}
				}
				cl[c] = closeid;
			}
		}
		// Save the K-means classes
		for (a = 0; a < vocab_size; a++)
			fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
		free(centcn);
		free(cent);
		free(cl);
	}
	fclose(fo);
	if (save_net_file[0] != 0)
		SaveNet();
	free(pt);
}

int ArgPos(char *str, int argc, char **argv) {
	int a;
	for (a = 1; a < argc; a++)
		if (!strcmp(str, argv[a])) {
			if (a == argc - 1) {
				printf("Argument missing for %s\n", str);
				exit(1);
			}
			return a;
		}
	return -1;
}

int main(int argc, char **argv) {
	int i;
	if (argc == 1) {
		printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
		printf("Options:\n");
		printf("Parameters for training:\n");
		printf("\t-train <file>\n");
		printf("\t\tUse text data from <file> to train the model\n");
		printf("\t-output <file>\n");
		printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
		printf("\t-size <int>\n");
		printf("\t\tSet size of word vectors; default is 100\n");
		printf("\t-window <int>\n");
		printf("\t\tSet max skip length between words; default is 5\n");
		printf("\t-sample <float>\n");
		printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
		printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
		printf("\t-hs <int>\n");
		printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
		printf("\t-negative <int>\n");
		printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
		printf("\t-negative-classes <file>\n");
		printf("\t\tNegative classes to sample from\n");
		printf("\t-nce <int>\n");
		printf("\t\tNumber of negative examples for NCE (noise contrastive estimation); default is 0, common values are 3 - 10 (0 = not used)\n");
		printf("\t-threads <int>\n");
		printf("\t\tUse <int> threads (default 12)\n");
		printf("\t-iter <int>\n");
		printf("\t\tRun more training iterations (default 5)\n");
		printf("\t-min-count <int>\n");
		printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
		printf("\t-alpha <float>\n");
		printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
		printf("\t-classes <int>\n");
		printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
		printf("\t-debug <int>\n");
		printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
		printf("\t-binary <int>\n");
		printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
		printf("\t-save-vocab <file>\n");
		printf("\t\tThe vocabulary will be saved to <file>\n");
		printf("\t-read-vocab <file>\n");
		printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
		printf("\t-read-net <file>\n");
		printf("\t\tThe net parameters will be read from <file>, not initialized randomly\n");
		printf("\t-save-net <file>\n");
		printf("\t\tThe net parameters will be saved to <file>\n");
		printf("\t-show-cc <int>\n");
		printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
		printf("\t-type <int>\n");
		printf("\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
		printf("\t-cap <int>\n");
		printf("\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
		printf("\nExamples:\n");
		printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
		return 0;
	}
	output_file[0] = 0;
	save_vocab_file[0] = 0;
	read_vocab_file[0] = 0;
	save_net_file[0] = 0;
	read_net_file[0] = 0;
	negative_classes_file[0] = 0;
	if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
		layer1_size = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
		strcpy(train_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
		strcpy(save_vocab_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
		strcpy(read_vocab_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
		strcpy(save_net_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
		strcpy(read_net_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
		debug_mode = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
		binary = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
		cc = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
		type = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
		strcpy(output_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
		window = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
		sample = atof(argv[i + 1]);
	if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
		hs = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
		negative = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
		strcpy(negative_classes_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
		nce = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
		num_threads = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
		iter = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
		min_count = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
		classes = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
		cap = atoi(argv[i + 1]);
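	// types 0 (cbow), 2 (cwindow) and 4 (senna) default to the higher CBOW
	// learning rate; an explicit -alpha below still overrides this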
	if (type == 0 || type == 2 || type == 4)
		alpha = 0.05;
	if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
		alpha = atof(argv[i + 1]);
	vocab = (struct vocab_word *) calloc(vocab_max_size, sizeof(struct vocab_word));
	vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
	expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
	for (i = 0; i < EXP_TABLE_SIZE; i++) {
		expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
		expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute the sigmoid: f(x) = e^x / (e^x + 1)
	}
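	// The table covers x in [-MAX_EXP, MAX_EXP); the training code inverts the
	// mapping with (int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)) to
	// look up sigmoid(f).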
	TrainModel();
	return 0;
}