1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
27const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39struct vocab_word *vocab;
40int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
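// type selects the architecture, matching the -type option in main(): 0 = CBOW,
// 1 = skip-gram, 2 = CWINDOW, 3 = structured skip-gram, 4 = SENNA-style (window + hidden layer).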
41int *vocab_hash;
42long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
43long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
44real alpha = 0.025, starting_alpha, sample = 1e-3;
45real *syn0, *syn1, *syn1neg, *expTable, *tanhTable;
46clock_t start;
47
48real *syn1_window, *syn1neg_window;
49int window_offset, window_layer_size;
50
51int window_hidden_size = 500;
52real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg;
53
54int hs = 0, negative = 5;
55const int table_size = 1e8;
56int *table;
57
58//contrastive negative sampling
59char negative_classes_file[MAX_STRING];
60int *word_to_group;
61int *group_to_table; //group_size*table_size
62int class_number;
63
64//char table
65int rep = 0;
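// rep selects the word representation, matching the -rep option in main():
// 0 = plain word-vector lookup, 1 = bidirectional character LSTM,
// 2 = character LSTM with a cached per-word vector ("short term memory")
//     that is only refreshed from the LSTM occasionally.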
66#define C_MAX_CODE 65536
67int c_state_size = 5;
68int c_cell_size = 5;
69int c_proj_size = 3;
70int c_params_number;
71int c_lstm_params_number;
72real *c_lookup;
73
74//char lstm params
75real *f_init_state;
76real *f_init_cell;
77real *b_init_state;
78real *b_init_cell;
79real *f_b_params;
80
81//short term memory
82real*syn0_initial;
83real*syn0_in_memory;
84
85int batch_size = 100;
86
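// f_states/b_states layout: each time step occupies c_state_size*7 consecutive reals,
// written by lstmForwardBlock in this order: igate, fgate, c + tanh (candidate input),
// cgate (cell value), ogate, cgate + tanh, and finally the hidden state.
// printStates dumps one such block starting at `start` for debugging.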
87void printStates(real*states, int start){
88 int s;
89 printf("igate ");
90 for(s = 0; s < c_state_size; s++){ printf("%f ", states[start++]);} printf("\n");
91 printf("fgate ");
92 for(s = 0; s < c_state_size; s++){ printf("%f ", states[start++]);} printf("\n");
93 printf("c + tanh ");
94 for(s = 0; s < c_state_size; s++){ printf("%f ", states[start++]);} printf("\n");
95 printf("cgate ");
96 for(s = 0; s < c_state_size; s++){ printf("%f ", states[start++]);} printf("\n");
97 printf("ogate ");
98 for(s = 0; s < c_state_size; s++){ printf("%f ", states[start++]);} printf("\n");
99 printf("cgate + tanh ");
100 for(s = 0; s < c_state_size; s++){ printf("%f ", states[start++]);} printf("\n");
101 printf("state ");
102 for(s = 0; s < c_state_size; s++){ printf("%f ", states[start++]);} printf("\n");
103
104}
105
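// One forward LSTM step: reads the character projection at chars[char_start..],
// the previous cell/state from the block preceding next_start in `states`, and
// parameters from f_b_params starting at offset p. Fills the seven segments described
// above, using the precomputed sigmoid (expTable) and tanh (tanhTable) tables,
// with inputs clipped at +/-MAX_EXP.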
106void lstmForwardBlock(real *chars, int char_start, real*states, int next_start, int p){
107 int i,s,si,sf,sc,sct,sctt,so,s1=next_start;
108 int prev_cell_start = s1 - c_state_size*4;
109 int prev_state_start = s1 - c_state_size;
110 if(states[prev_cell_start]==0){
111// printf("crap! cell is zero\n");
112 }
113 if(states[prev_state_start]==0){
114// printf("crap! state is zero\n");
115 }
116 if(states[s1]!=0){
117// printf("crap! start not zero\n");
118 }
119 //igate
120 si = s1;
121 for(s = 0; s < c_state_size; s++){
122 for(i = 0; i < c_proj_size; i++){
123 states[s1]+=chars[char_start+i]*f_b_params[p++];
124 }
125 for(i = 0; i < c_cell_size; i++){
126 states[s1]+=states[prev_cell_start+i]*f_b_params[p++];
127 }
128 for(i = 0; i < c_state_size; i++){
129 states[s1]+=states[prev_state_start+i]*f_b_params[p++];
130 }
131 states[s1]+=f_b_params[p++];
132 if(states[s1]>MAX_EXP){
133 states[s1]=1;
134 }
135 else if(states[s1]<-MAX_EXP){
136 states[s1]=0;
137 }
138 else{
139 states[s1] = expTable[(int)((states[s1] + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
140 }
141 s1++;
142 }
143
144 //fgate
145 sf=s1;
146 for(s = 0; s < c_state_size; s++){
147 for(i = 0; i < c_proj_size; i++){
148 states[s1]+=chars[char_start+i]*f_b_params[p++];
149 }
150 for(i = 0; i < c_cell_size; i++){
151 states[s1]+=states[prev_cell_start+i]*f_b_params[p++];
152 }
153 for(i = 0; i < c_state_size; i++){
154 states[s1]+=states[prev_state_start+i]*f_b_params[p++];
155 }
156 states[s1]+=f_b_params[p++];
157 if(states[s1]>MAX_EXP){
158 states[s1]=1;
159 }
160 else if(states[s1]<-MAX_EXP){
161 states[s1]=0;
162 }
163 else{
164 states[s1] = expTable[(int)((states[s1] + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
165 }
166 s1++;
167 }
168
169 //c + tanh
170 sct=s1;
171 for(s = 0; s < c_state_size; s++){
172 for(i = 0; i < c_proj_size; i++){
173 states[s1]+=chars[char_start+i]*f_b_params[p++];
174 }
175 for(i = 0; i < c_state_size; i++){
176 states[s1]+=states[prev_state_start+i]*f_b_params[p++];
177 }
178 states[s1]+=f_b_params[p++];
179 if(states[s1]>MAX_EXP){
180 states[s1]=1;
181 }
182 else if(states[s1]<-MAX_EXP){
183 states[s1]=-1;
184 }
185 else{
186 states[s1] = tanhTable[(int)((states[s1] + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
187 }
188 s1++;
189 }
190
191 //cgate
192 sc=s1;
193 for(s = 0; s < c_state_size; s++){
194 states[s1]+=states[sct+s]*states[si+s]+states[sf+s]*states[prev_cell_start+s];
195 s1++;
196 }
197
198 //ogate
199 so=s1;
200 for(s = 0; s < c_state_size; s++){
201 for(i = 0; i < c_proj_size; i++){
202 states[s1]+=chars[char_start+i]*f_b_params[p++];
203 }
204 for(i = 0; i < c_cell_size; i++){
205 states[s1]+=states[sc+s]*f_b_params[p++];
206 }
207 for(i = 0; i < c_state_size; i++){
208 states[s1]+=states[prev_state_start+i]*f_b_params[p++];
209 }
210 states[s1]+=f_b_params[p++];
211 if(states[s1]>MAX_EXP){
212 states[s1]=1;
213 }
214 else if(states[s1]<-MAX_EXP){
215 states[s1]=0;
216 }
217 else{
218 states[s1] = expTable[(int)((states[s1] + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
219 }
220 s1++;
221 }
222
223 //cgate + tanh
224 sctt = s1;
225 for(s = 0; s < c_state_size; s++){
226 if(states[sc+s]>MAX_EXP){
227 states[s1]=1;
228 }
229 else if(states[sc+s]<-MAX_EXP){
230 states[s1]=-1;
231 }
232 else{
233 states[s1] = tanhTable[(int)((states[sc+s] + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
234 }
235 s1++;
236 }
237
238 //next state
239 if(states[s1]!=0){
240 printf("crap! end not zero\n");
241 }
242 for(s = 0; s < c_state_size; s++){
243 states[s1] = states[sctt+s] * states[so+s];
244 s1++;
245 }
246
247
248}
249
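// Backward pass for one LSTM step, mirroring lstmForwardBlock in reverse order:
// p walks the same parameter block backwards from its end, and gradients are
// accumulated into chars_e (character projections), states_e (previous cell/state)
// and lstm_params_e (parameters). The final checks verify that p and s1 end up
// exactly where the corresponding forward pass started.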
250void lstmBackwardBlock(real *chars, int char_start, real*states, int next_start, int pStart, real*chars_e, real*states_e, real*lstm_params_e){
251 int p=pStart+c_lstm_params_number-1;
252 int i,s,si,sf,sc,sct,sctt,so,s1=next_start+c_state_size*7-1;
253 int prev_cell_start = next_start - c_state_size*4;
254 int prev_state_start = next_start - c_state_size;
255
256 real e;
257 si = next_start;
258 sf = next_start + c_state_size;
259 sct = next_start + c_state_size*2;
260 sc = next_start + c_state_size*3;
261 so = next_start + c_state_size*4;
262 sctt = next_start + c_state_size*5;
263
264 //next state
265 for(s = c_state_size-1; s >= 0; s--){
266 states_e[sctt+s] += states_e[s1]*states[so+s];
267 states_e[so+s] += states_e[s1]*states[sctt+s];
268 s1--;
269 }
270
271
272 //cgate + tanh
273 for(s = c_state_size-1; s >= 0; s--){
274 states_e[sc+s] += states_e[s1]*(1-states[s1]*states[s1]);
275 s1--;
276 }
277
278 //ogate
279 for(s = c_state_size-1; s >= 0; s--){
280 e = states[s1]*(1-states[s1])*states_e[s1];
281 for(i = c_proj_size-1; i >= 0; i--){
282 chars_e[char_start+i] += e*f_b_params[p];
283 lstm_params_e[p--] += e*chars[char_start+i];
284 }
285
286 for(i = c_cell_size-1; i >= 0; i--){
287 states_e[sc+s]+=e*f_b_params[p];
288 lstm_params_e[p--] += e*states[sc+s];
289 }
290 for(i = c_state_size-1; i >= 0; i--){
291 states_e[prev_state_start+i] += e*f_b_params[p];
292 lstm_params_e[p--] += e*states[prev_state_start+i];
293 }
294 lstm_params_e[p--]+=e;
295 s1--;
296 }
297
298 //cgate
299 for(s = c_state_size-1; s >= 0; s--){
300 states_e[sct+s]+=states_e[s1]*states[si+s];
301 states_e[si+s]+=states_e[s1]*states[sct+s];
302 states_e[prev_cell_start+s]+=states_e[s1]*states[sf+s];
303 states_e[sf+s]+=states_e[s1]*states[prev_cell_start+s];
304 s1--;
305 }
306
307 //c + tanh
308 for(s = c_state_size-1; s >= 0; s--){
309 e = (1-states[s1]*states[s1])*states_e[s1];
310 for(i = c_proj_size-1; i >= 0; i--){
311 chars_e[char_start+i] += e*f_b_params[p];
312 lstm_params_e[p--] += e*chars[char_start+i];
313 }
314 for(i = c_state_size-1; i >= 0; i--){
315 states_e[prev_state_start+i]+=e*f_b_params[p];
316 lstm_params_e[p--] +=e*states[prev_state_start+i];
317 }
318 lstm_params_e[p--]+=e;
319 s1--;
320 }
321
322
323 //fgate
324 for(s = c_state_size-1; s >= 0; s--){
325 e = states[s1]*(1-states[s1])*states_e[s1];
326 for(i = c_proj_size-1; i >= 0; i--){
327 chars_e[char_start+i] += e*f_b_params[p];
328 lstm_params_e[p--] += e*chars[char_start+i];
329 }
330 for(i = c_cell_size-1; i >= 0; i--){
331 states_e[prev_cell_start+i]+=e*f_b_params[p];
332 lstm_params_e[p--] +=e*states[prev_cell_start+i];
333 }
334 for(i = c_state_size-1; i >= 0; i--){
335 states_e[prev_state_start+i]+=e*f_b_params[p];
336 lstm_params_e[p--] +=e*states[prev_state_start+i];
337 }
338 lstm_params_e[p--]+=e;
339 s1--;
340 }
341
342 //igate
343 for(s = c_state_size-1; s >= 0; s--){
344 e = states[s1]*(1-states[s1])*states_e[s1];
345 for(i = c_proj_size-1; i >= 0; i--){
346 chars_e[char_start+i] += e*f_b_params[p];
347 lstm_params_e[p--] += e*chars[char_start+i];
348 }
349 for(i = c_cell_size-1; i >= 0; i--){
350 states_e[prev_cell_start+i]+=e*f_b_params[p];
351 lstm_params_e[p--] +=e*states[prev_cell_start+i];
352 }
353 for(i = c_state_size-1; i >= 0; i--){
354 states_e[prev_state_start+i]+=e*f_b_params[p];
355 lstm_params_e[p--] +=e*states[prev_state_start+i];
356 }
357 lstm_params_e[p--]+=e;
358 s1--;
359 }
360
361 if(p+1!=pStart){
362 printf("crap! p!= %d p = %d\n",pStart,p+1);
363 }
364 if(s1+1!=next_start){
365 printf("crap! s1!= %d s1 = %d\n",next_start,s1+1);
366 }
367}
368
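// Character-level word embedding: looks up a c_proj_size projection for every
// character (codes clipped to C_MAX_CODE-1), runs one forward and one reversed LSTM
// pass over the sequence, and maps the final hidden state of each direction through
// the output weights (stored after the two per-direction LSTM parameter blocks in
// f_b_params) into a layer1_size vector `out`.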
369void lstmForward(char* word, int len, real* out, real *f_states, real *b_states, real *chars){
370 //printf("%s\n",word);
371 int i,s,c,p;
372 for(s = 0; s < (len+1)*(c_state_size*7); s++){
373 f_states[s]=0;
374 b_states[s]=0;
375 }
376 for(s = 0; s < c_state_size; s++){
377 f_states[c_state_size*3+s]=f_init_cell[s];
378 f_states[c_state_size*6+s]=f_init_state[s];
379 b_states[c_state_size*3+s]=b_init_cell[s];
380 b_states[c_state_size*6+s]=b_init_state[s];
381 }
382 for(i = 0; i < len; i++){
383 c = word[i];
384 if(c>=C_MAX_CODE){c=C_MAX_CODE-1;}
385 for(s = 0; s < c_proj_size; s++){
386 chars[i*c_proj_size+s] = c_lookup[c*c_proj_size+s];
387 }
388 }
389
390 for(i = 0; i < len; i++){
391 lstmForwardBlock(chars, i*c_proj_size, f_states, (i+1)*c_state_size*7, 0);
392 }
393 for(i = 0; i < len; i++){
394 lstmForwardBlock(chars, (len-i-1)*c_proj_size, b_states, (i+1)*c_state_size*7, c_lstm_params_number);
395 }
396
397 //printStates(f_states,c_state_size*7);
398
399 for(s = 0; s < layer1_size; s++){
400 out[s]=0;
401 }
402 p=c_lstm_params_number*2;
403 for(s = 0; s < layer1_size; s++){
404 for(i = 0; i < c_state_size; i++){
405 out[s]+=f_states[len*c_state_size*7+c_state_size*6 + i]*f_b_params[p++];
406 out[s]+=b_states[len*c_state_size*7+c_state_size*6 + i]*f_b_params[p++];
407 }
408// printf("%f ",out[s]);
409 }
410// printf("\n");
411}
412
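// Backpropagates the embedding error out_e through the output weights and both LSTM
// directions, then applies the accumulated updates to the character lookup table,
// the initial cell/state vectors and the LSTM parameters. The output weights are
// updated in place while the error is being propagated.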
413void lstmBackward(char* word, int len, real* out, real *f_states, real *b_states, real* chars, real* out_e, real *f_states_e, real *b_states_e, real* chars_e, real *lstm_params_e){
414 int i,s,c=-1,p;
415 for(s = 0; s < (len+1)*c_state_size*7; s++){
416 f_states_e[s]=0;
417 b_states_e[s]=0;
418 }
419 for(i = 0; i < len; i++){
420 for(s = 0; s < c_proj_size; s++){
421 chars_e[i*c_proj_size+s] = 0;
422 }
423 }
424 for(i = 0; i < c_lstm_params_number*2; i++){
425 lstm_params_e[i]=0;
426 }
427
428 p=c_lstm_params_number*2;
429 for(s = 0; s < layer1_size; s++){
430 for(i = 0; i < c_state_size; i++){
431 f_states_e[len*c_state_size*7+c_state_size*6 + i]+=out_e[s]*f_b_params[p];
432 f_b_params[p] += out_e[s] * f_states[len*c_state_size*7+c_state_size*6 + i];
433 p++;
434 b_states_e[len*c_state_size*7+c_state_size*6 + i]+=out_e[s]*f_b_params[p];
435 f_b_params[p] += out_e[s] * b_states[len*c_state_size*7+c_state_size*6 + i];
436 p++;
437 }
438 }
439
440 for(i = len-1; i >=0; i--){
441 lstmBackwardBlock(chars, i*c_proj_size, b_states, (i+1)*c_state_size*7, c_lstm_params_number, chars_e,b_states_e,lstm_params_e);
442 }
443
444 for(i = len-1; i >=0; i--){
445 lstmBackwardBlock(chars, (len-i-1)*c_proj_size, f_states, (i+1)*c_state_size*7, 0, chars_e,f_states_e,lstm_params_e);
446 }
447
448 for(i = 0; i < len; i++){
449 c = word[i];
450 if(c>=C_MAX_CODE){c=C_MAX_CODE-1;}
451 for(s = 0; s < c_proj_size; s++){
452 c_lookup[c*c_proj_size+s] += chars_e[i*c_proj_size+s];
453 }
454 }
455
456 for(s = 0; s < c_state_size; s++){
457 f_init_cell[s]+=f_states_e[c_state_size*3+s];
458 f_init_state[s]+=f_states_e[c_state_size*6+s];
459 b_init_cell[s]+=b_states_e[c_state_size*3+s];
460 b_init_state[s]+=b_states_e[c_state_size*6+s];
461 }
462
463 for(s = 0; s < c_lstm_params_number*2; s++){
464 f_b_params[s]+=lstm_params_e[s];
465 }
466
467 //printf("out\n");
468 //printStates(f_states,(len)*c_state_size*7);
469 //printf("err\n");
470 //printStates(f_states_e,(len)*c_state_size*7);
471
472}
473
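// One fitting step for the character LSTM: runs a forward pass, sets out_e to the
// scaled difference between out_expected and the current output, backpropagates,
// then reruns the forward pass and reports the L1 error before and after the update.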
474void lstmFitting(char* word, int len, real* out, real *f_states, real *b_states, real* chars, real* out_expected, real* out_e, real *f_states_e, real *b_states_e, real* chars_e, real *lstm_params_e){
475 int i;
476 real g = 0;
477 lstmForward(word, len, out, f_states, b_states, chars);
478 for(i = 0; i < layer1_size; i++){
479 if(out_expected[i]>out[i]){
480 g += out_expected[i]-out[i];
481 }
482 else{
483 g += -out_expected[i]+out[i];
484 }
485 out_e[i] = (out_expected[i]-out[i])*alpha;
486 }
487 printf("error before fitting = %f\n", g);
488 lstmBackward(word, len, out, f_states, b_states, chars, out_e, f_states_e, b_states_e, chars_e, lstm_params_e);
489 lstmForward(word, len, out, f_states, b_states, chars);
490 g=0;
491 for(i = 0; i < layer1_size; i++){
492 if(out_expected[i]>out[i]){
493 g += out_expected[i]-out[i];
494 }
495 else{
496 g += -out_expected[i]+out[i];
497 }
498 out_e[i] = (out_expected[i]-out[i])*alpha;
499 }
500 printf("error after fitting = %f\n", g);
501
502}
503
504real hardTanh(real x){
505 if(x>=1){
506 return 1;
507 }
508 else if(x<=-1){
509 return -1;
510 }
511 else{
512 return x;
513 }
514}
515
516real dHardTanh(real x, real g){
517 if(x > 1 && g > 0){
518 return 0;
519 }
520 if(x < -1 && g < 0){
521 return 0;
522 }
523 return 1;
524}
525
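// Builds the table used for negative sampling: each word occupies a share of the
// table_size slots proportional to its count raised to the power 0.75.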
526void InitUnigramTable() {
527 int a, i;
528 long long train_words_pow = 0;
529 real d1, power = 0.75;
530 table = (int *)malloc(table_size * sizeof(int));
531 for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
532 i = 0;
533 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
534 for (a = 0; a < table_size; a++) {
535 table[a] = i;
536 if (a / (real)table_size > d1) {
537 i++;
538 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
539 }
540 if (i >= vocab_size) i = vocab_size - 1;
541 }
542}
543
544// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
545void ReadWord(char *word, FILE *fin) {
546 int a = 0, ch;
547 while (!feof(fin)) {
548 ch = fgetc(fin);
549 if (ch == 13) continue;
550 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
551 if (a > 0) {
552 if (ch == '\n') ungetc(ch, fin);
553 break;
554 }
555 if (ch == '\n') {
556 strcpy(word, (char *)"</s>");
557 return;
558 } else continue;
559 }
560 word[a] = ch;
561 a++;
562 if (a >= MAX_STRING - 1) a--; // Truncate too long words
563 }
564 word[a] = 0;
565}
566
567// Returns hash value of a word
568int GetWordHash(char *word) {
569 unsigned long long a, hash = 0;
570 for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
571 hash = hash % vocab_hash_size;
572 return hash;
573}
574
575// Returns position of a word in the vocabulary; if the word is not found, returns -1
576int SearchVocab(char *word) {
577 unsigned int hash = GetWordHash(word);
578 while (1) {
579 if (vocab_hash[hash] == -1) return -1;
580 if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
581 hash = (hash + 1) % vocab_hash_size;
582 }
583 return -1;
584}
585
586// Reads a word and returns its index in the vocabulary
587int ReadWordIndex(FILE *fin) {
588 char word[MAX_STRING];
589 ReadWord(word, fin);
590 if (feof(fin)) return -1;
591 return SearchVocab(word);
592}
593
594 // Reads a word, stores it in the provided buffer, and returns its index in the vocabulary
595int ReadAndStoreWordIndex(FILE *fin, char* word) {
596 ReadWord(word, fin);
597 if (feof(fin)) return -1;
598 return SearchVocab(word);
599}
600
601// Adds a word to the vocabulary
602int AddWordToVocab(char *word) {
603 unsigned int hash, length = strlen(word) + 1;
604 if (length > MAX_STRING) length = MAX_STRING;
605 vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
606 strcpy(vocab[vocab_size].word, word);
607 vocab[vocab_size].cn = 0;
608 vocab_size++;
609 // Reallocate memory if needed
610 if (vocab_size + 2 >= vocab_max_size) {
611 vocab_max_size += 1000;
612 vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
613 }
614 hash = GetWordHash(word);
615 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
616 vocab_hash[hash] = vocab_size - 1;
617 return vocab_size - 1;
618}
619
620// Used later for sorting by word counts
621int VocabCompare(const void *a, const void *b) {
622 return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
623}
624
625// Sorts the vocabulary by frequency using word counts
626void SortVocab() {
627 int a, size;
628 unsigned int hash;
629 // Sort the vocabulary and keep </s> at the first position
630 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
631 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
632 size = vocab_size;
633 train_words = 0;
634 for (a = 0; a < size; a++) {
635 // Words occurring less than min_count times will be discarded from the vocab
636 if ((vocab[a].cn < min_count) && (a != 0)) {
637 vocab_size--;
638 free(vocab[a].word);
639 } else {
640 // Hash will be re-computed, as it is no longer valid after the sorting
641 hash=GetWordHash(vocab[a].word);
642 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
643 vocab_hash[hash] = a;
644 train_words += vocab[a].cn;
645 }
646 }
647 vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
648 // Allocate memory for the binary tree construction
649 for (a = 0; a < vocab_size; a++) {
650 vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
651 vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
652 }
653}
654
655// Reduces the vocabulary by removing infrequent tokens
656void ReduceVocab() {
657 int a, b = 0;
658 unsigned int hash;
659 for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
660 vocab[b].cn = vocab[a].cn;
661 vocab[b].word = vocab[a].word;
662 b++;
663 } else free(vocab[a].word);
664 vocab_size = b;
665 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
666 for (a = 0; a < vocab_size; a++) {
667 // Hash will be re-computed, as it is no longer valid
668 hash = GetWordHash(vocab[a].word);
669 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
670 vocab_hash[hash] = a;
671 }
672 fflush(stdout);
673 min_reduce++;
674}
675
676// Create binary Huffman tree using the word counts
677 // Frequent words will have short unique binary codes
678void CreateBinaryTree() {
679 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
680 char code[MAX_CODE_LENGTH];
681 long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
682 long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
683 long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
684 for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
685 for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
686 pos1 = vocab_size - 1;
687 pos2 = vocab_size;
688 // Following algorithm constructs the Huffman tree by adding one node at a time
689 for (a = 0; a < vocab_size - 1; a++) {
690 // First, find two smallest nodes 'min1, min2'
691 if (pos1 >= 0) {
692 if (count[pos1] < count[pos2]) {
693 min1i = pos1;
694 pos1--;
695 } else {
696 min1i = pos2;
697 pos2++;
698 }
699 } else {
700 min1i = pos2;
701 pos2++;
702 }
703 if (pos1 >= 0) {
704 if (count[pos1] < count[pos2]) {
705 min2i = pos1;
706 pos1--;
707 } else {
708 min2i = pos2;
709 pos2++;
710 }
711 } else {
712 min2i = pos2;
713 pos2++;
714 }
715 count[vocab_size + a] = count[min1i] + count[min2i];
716 parent_node[min1i] = vocab_size + a;
717 parent_node[min2i] = vocab_size + a;
718 binary[min2i] = 1;
719 }
720 // Now assign binary code to each vocabulary word
721 for (a = 0; a < vocab_size; a++) {
722 b = a;
723 i = 0;
724 while (1) {
725 code[i] = binary[b];
726 point[i] = b;
727 i++;
728 b = parent_node[b];
729 if (b == vocab_size * 2 - 2) break;
730 }
731 vocab[a].codelen = i;
732 vocab[a].point[0] = vocab_size - 2;
733 for (b = 0; b < i; b++) {
734 vocab[a].code[i - b - 1] = code[b];
735 vocab[a].point[i - b] = point[b] - vocab_size;
736 }
737 }
738 free(count);
739 free(binary);
740 free(parent_node);
741}
742
743void LearnVocabFromTrainFile() {
744 char word[MAX_STRING];
745 FILE *fin;
746 long long a, i;
747 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
748 fin = fopen(train_file, "rb");
749 if (fin == NULL) {
750 printf("ERROR: training data file not found!\n");
751 exit(1);
752 }
753 vocab_size = 0;
754 AddWordToVocab((char *)"</s>");
755 while (1) {
756 ReadWord(word, fin);
757 if (feof(fin)) break;
758 train_words++;
759 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
760 printf("%lldK%c", train_words / 1000, 13);
761 fflush(stdout);
762 }
763 i = SearchVocab(word);
764 if (i == -1) {
765 a = AddWordToVocab(word);
766 vocab[a].cn = 1;
767 } else vocab[i].cn++;
768 if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
769 }
770 SortVocab();
771 if (debug_mode > 0) {
772 printf("Vocab size: %lld\n", vocab_size);
773 printf("Words in train file: %lld\n", train_words);
774 }
775 file_size = ftell(fin);
776 fclose(fin);
777}
778
779void SaveVocab() {
780 long long i;
781 FILE *fo = fopen(save_vocab_file, "wb");
782 for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
783 fclose(fo);
784}
785
786void ReadVocab() {
787 long long a, i = 0;
788 char c;
789 char word[MAX_STRING];
790 FILE *fin = fopen(read_vocab_file, "rb");
791 if (fin == NULL) {
792 printf("Vocabulary file not found\n");
793 exit(1);
794 }
795 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
796 vocab_size = 0;
797 while (1) {
798 ReadWord(word, fin);
799 if (feof(fin)) break;
800 a = AddWordToVocab(word);
801 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
802 i++;
803 }
804 SortVocab();
805 if (debug_mode > 0) {
806 printf("Vocab size: %lld\n", vocab_size);
807 printf("Words in train file: %lld\n", train_words);
808 }
809 fin = fopen(train_file, "rb");
810 if (fin == NULL) {
811 printf("ERROR: training data file not found!\n");
812 exit(1);
813 }
814 fseek(fin, 0, SEEK_END);
815 file_size = ftell(fin);
816 fclose(fin);
817}
818
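// Reads negative_classes_file (records of three whitespace-separated tokens, of which
// the first is the class, the second the word, and the third is read and ignored),
// assigns each in-vocabulary word to a class, and builds one unigram table per class
// so that negative samples for a word can be drawn from its own class.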
819void InitClassUnigramTable() {
820 long long a,c;
821 printf("loading class unigrams \n");
822 FILE *fin = fopen(negative_classes_file, "rb");
823 if (fin == NULL) {
824 printf("ERROR: class file not found!\n");
825 exit(1);
826 }
827 word_to_group = (int *)malloc(vocab_size * sizeof(int));
828 for(a = 0; a < vocab_size; a++) word_to_group[a] = -1;
829 char class[MAX_STRING];
830 char prev_class[MAX_STRING];
831 prev_class[0] = 0;
832 char word[MAX_STRING];
833 class_number = -1;
834 while (1) {
835 if (feof(fin)) break;
836 ReadWord(class, fin);
837 ReadWord(word, fin);
838 int word_index = SearchVocab(word);
839 if (word_index != -1){
840 if(strcmp(class, prev_class) != 0){
841 class_number++;
842 strcpy(prev_class, class);
843 }
844 word_to_group[word_index] = class_number;
845 }
846 ReadWord(word, fin);
847 }
848 class_number++;
849 fclose(fin);
850
851 group_to_table = (int *)malloc(table_size * class_number * sizeof(int));
852 long long train_words_pow = 0;
853 real d1, power = 0.75;
854
855 for(c = 0; c < class_number; c++){
856 long long offset = c * table_size;
857 train_words_pow = 0;
858 for (a = 0; a < vocab_size; a++) if(word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
859 int i = 0;
860 while(i < vocab_size && word_to_group[i]!=c) i++;
861 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
862 for (a = 0; a < table_size; a++) {
863 //printf("index %lld , word %d\n", a, i);
864 group_to_table[offset + a] = i;
865 if (a / (real)table_size > d1) {
866 i++;
867 while(i < vocab_size && word_to_group[i]!=c) i++;
868 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
869 }
870 if (i >= vocab_size) {i = vocab_size - 1; while(i >= 0 && word_to_group[i]!=c) i--;}
871 }
872 }
873}
874
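// Allocates and initializes all parameters: syn0 gets small random values, the
// output-side matrices (syn1*, syn1neg*, syn_hidden_word*) start at zero, and
// syn_window_hidden, the character lookup table, the initial LSTM states and the gate
// parameters are randomly initialized (the latter only when rep is 1 or 2).
// Finishes by building the Huffman tree used for hierarchical softmax.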
875void InitNet() {
876 long long a, b;
877 unsigned long long next_random = 1;
878 window_layer_size = layer1_size*window*2;
879 a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
880 if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
881
882 if (hs) {
883 a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
884 if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
885 a = posix_memalign((void **)&syn1_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
886 if (syn1_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
887 a = posix_memalign((void **)&syn_hidden_word, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
888 if (syn_hidden_word == NULL) {printf("Memory allocation failed\n"); exit(1);}
889
890 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
891 syn1[a * layer1_size + b] = 0;
892 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
893 syn1_window[a * window_layer_size + b] = 0;
894 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
895 syn_hidden_word[a * window_hidden_size + b] = 0;
896 }
897 if (negative>0) {
898 a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
899 if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
900 a = posix_memalign((void **)&syn1neg_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
901 if (syn1neg_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
902 a = posix_memalign((void **)&syn_hidden_word_neg, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
903 if (syn_hidden_word_neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
904
905 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
906 syn1neg[a * layer1_size + b] = 0;
907 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
908 syn1neg_window[a * window_layer_size + b] = 0;
909 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
910 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
911 }
912 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
913 next_random = next_random * (unsigned long long)25214903917 + 11;
914 syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
915 }
916
917 a = posix_memalign((void **)&syn_window_hidden, 128, window_hidden_size * window_layer_size * sizeof(real));
918 if (syn_window_hidden == NULL) {printf("Memory allocation failed\n"); exit(1);}
919 for (a = 0; a < window_hidden_size * window_layer_size; a++){
920 next_random = next_random * (unsigned long long)25214903917 + 11;
921 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (window_hidden_size*window_layer_size);
922 }
923
924 if(rep == 1 || rep == 2){
925 a = posix_memalign((void **)&c_lookup, 128, (long long)C_MAX_CODE * c_proj_size * sizeof(real));
926 if (c_lookup == NULL) {printf("Memory allocation failed\n"); exit(1);}
927 for (a = 0; a < C_MAX_CODE * c_proj_size; a++){
928 next_random = next_random * (unsigned long long)25214903917 + 11;
929 c_lookup[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (c_proj_size);
930 }
931
932 a = posix_memalign((void **)&f_init_state, 128, c_state_size * sizeof(real));
933 if (f_init_state == NULL) {printf("Memory allocation failed\n"); exit(1);}
934 a = posix_memalign((void **)&f_init_cell, 128, c_state_size * sizeof(real));
935 if (f_init_cell == NULL) {printf("Memory allocation failed\n"); exit(1);}
936 a = posix_memalign((void **)&b_init_state, 128, c_state_size * sizeof(real));
937 if (b_init_state == NULL) {printf("Memory allocation failed\n"); exit(1);}
938 a = posix_memalign((void **)&b_init_cell, 128, c_state_size * sizeof(real));
939 if (b_init_cell == NULL) {printf("Memory allocation failed\n"); exit(1);}
940
941 for (a = 0; a < c_state_size; a++){
942 next_random = next_random * (unsigned long long)25214903917 + 11;
943 f_init_state[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (c_state_size);
944 next_random = next_random * (unsigned long long)25214903917 + 11;
945 f_init_cell[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (c_state_size);
946 next_random = next_random * (unsigned long long)25214903917 + 11;
947 b_init_state[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (c_state_size);
948 next_random = next_random * (unsigned long long)25214903917 + 11;
949 b_init_cell[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (c_state_size);
950 }
951
952 c_lstm_params_number = /*input*/ (c_state_size+c_cell_size+c_proj_size+1)*c_state_size +
953 /*forget*/ (c_state_size+c_cell_size+c_proj_size+1)*c_state_size +
954 /*cell*/ (c_state_size+c_proj_size+1)*c_state_size +
955 /*output*/ (c_state_size+c_cell_size+c_proj_size+1)*c_state_size;
956
957 c_params_number = ( c_lstm_params_number * 2 + (c_state_size*2)*layer1_size) ;
958 a = posix_memalign((void **)&f_b_params, 128, c_params_number* sizeof(real));
959 if (f_b_params == NULL) {printf("Memory allocation failed\n"); exit(1);}
960
961 for (a = 0; a < c_params_number; a++){
962 next_random = next_random * (unsigned long long)25214903917 + 11;
963 f_b_params[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) ;
964 }
965 }
966
967 if(rep == 2){
968 a = posix_memalign((void **)&syn0_initial, 128, (long long)vocab_size * layer1_size * sizeof(real));
969 if (syn0_initial == NULL) {printf("Memory allocation failed\n"); exit(1);}
970 a = posix_memalign((void **)&syn0_in_memory, 128, (long long)vocab_size * sizeof(real));
971 if (syn0_in_memory == NULL) {printf("Memory allocation failed\n"); exit(1);}
972 for(a = 0; a < vocab_size; a++){
973 syn0_in_memory[a] = -1;
974 }
975 }
976 CreateBinaryTree();
977}
978
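// One training thread: seeks to its own slice of the training file, builds subsampled
// sentences, and applies the update rule selected by `type`, decaying alpha linearly
// with overall progress. Threads update the shared parameters without locking
// (Hogwild-style, as in the original word2vec) and allocate scratch buffers for the
// character LSTM used when rep is 1 or 2.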
979void *TrainModelThread(void *id) {
980 long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
981 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
982 long long l1, l2, c, target, label, local_iter = iter;
983 char c_sen[(MAX_SENTENCE_LENGTH + 1) * MAX_STRING];
984 unsigned long long next_random = (long long)id;
985 real f, g, acc_g=0;
986 clock_t now;
987 int input_len_1 = layer1_size;
988 if(type == 2 || type == 4){
989 input_len_1=window_layer_size;
990 }
991 real *neu1 = (real *)calloc(input_len_1, sizeof(real));
992 real *neu1e = (real *)calloc(input_len_1, sizeof(real));
993
994 int input_len_2 = 0;
995 if(type == 4){
996 input_len_2 = window_hidden_size;
997 }
998 real *neu2 = (real *)calloc(input_len_2, sizeof(real));
999 real *neu2e = (real *)calloc(input_len_2, sizeof(real));
1000
1001 FILE *fi = fopen(train_file, "rb");
1002 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
1003
1004 real *f_states = (real *)calloc((c_state_size * 7) * (MAX_STRING + 1), sizeof(real));
1005 real *f_states_e = (real *)calloc((c_state_size * 7) * (MAX_STRING + 1), sizeof(real));
1006 real *b_states = (real *)calloc((c_state_size * 7) * (MAX_STRING + 1), sizeof(real));
1007 real *b_states_e = (real *)calloc((c_state_size * 7) * (MAX_STRING + 1), sizeof(real));
1008 real *chars = (real *)calloc(c_proj_size * MAX_STRING, sizeof(real));
1009 real *chars_e = (real *)calloc(c_proj_size * MAX_STRING, sizeof(real));
1010 real *lstm_params_e = (real *)calloc(c_lstm_params_number*2, sizeof(real));
1011
1012 //short term memory vars
1013 real global_divergence = -1;
1014 int in_mem = 0;
1015 int skip=0, non_skip=0;
1016
1017 while (1) {
1018 if (word_count - last_word_count > 10000) {
1019 word_count_actual += word_count - last_word_count;
1020 last_word_count = word_count;
1021 if ((debug_mode > 1)) {
1022 now=clock();
1023 printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk : error %.4f", 13, alpha,
1024 word_count_actual / (real)(iter * train_words + 1) * 100,
1025 word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000), acc_g);
1026 if(rep == 2){
1027 printf(" skiprate %f",skip/(real)(skip+non_skip));
1028 }
1029 acc_g=0;
1030 skip=0;
1031 non_skip=0;
1032 fflush(stdout);
1033 }
1034 alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
1035 if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
1036 }
1037 if (sentence_length == 0) {
1038 while (1) {
1039 word = ReadAndStoreWordIndex(fi, &c_sen[sentence_length*MAX_STRING]);
1040 if (feof(fi)) break;
1041 if (word == -1) continue;
1042 word_count++;
1043 if (word == 0) break;
1044 // The subsampling randomly discards frequent words while keeping the ranking same
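// Keep probability is (sqrt(f/sample)+1)*sample/f with f = cn/train_words, so words
// whose corpus frequency exceeds the -sample threshold are kept with probability
// well below 1 while rare words are essentially always kept.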
1045 if (sample > 0) {
1046 real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
1047 next_random = next_random * (unsigned long long)25214903917 + 11;
1048 if (ran < (next_random & 0xFFFF) / (real)65536) continue;
1049 }
1050 sen[sentence_length] = word;
1051 sentence_length++;
1052 if (sentence_length >= MAX_SENTENCE_LENGTH) break;
1053 }
1054 sentence_position = 0;
1055 }
1056 if (feof(fi) || (word_count > train_words / num_threads)) {
1057 word_count_actual += word_count - last_word_count;
1058 local_iter--;
1059 if (local_iter == 0) break;
1060 word_count = 0;
1061 last_word_count = 0;
1062 sentence_length = 0;
1063 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
1064 continue;
1065 }
1066 word = sen[sentence_position];
1067 if (word == -1) continue;
1068 for (c = 0; c < input_len_1; c++) neu1[c] = 0;
1069 for (c = 0; c < input_len_1; c++) neu1e[c] = 0;
1070 for (c = 0; c < input_len_2; c++) neu2[c] = 0;
1071 for (c = 0; c < input_len_2; c++) neu2e[c] = 0;
1072 next_random = next_random * (unsigned long long)25214903917 + 11;
1073 b = next_random % window;
1074 if (type == 0) { //train the cbow architecture
1075 // in -> hidden
1076 cw = 0;
1077 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1078 c = sentence_position - window + a;
1079 if (c < 0) continue;
1080 if (c >= sentence_length) continue;
1081 last_word = sen[c];
1082 if (last_word == -1) continue;
1083 for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
1084 cw++;
1085 }
1086 if (cw) {
1087 for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
1088 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1089 f = 0;
1090 l2 = vocab[word].point[d] * layer1_size;
1091 // Propagate hidden -> output
1092 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
1093 if (f <= -MAX_EXP) continue;
1094 else if (f >= MAX_EXP) continue;
1095 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1096 // 'g' is the gradient multiplied by the learning rate
1097 g = (1 - vocab[word].code[d] - f) * alpha;
1098 // Propagate errors output -> hidden
1099 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
1100 // Learn weights hidden -> output
1101 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
1102 }
1103 // NEGATIVE SAMPLING
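// d==0 is the observed word (label 1); the remaining draws are negatives (label 0)
// taken from the global unigram table, or from the word's own class table when
// -negative-classes is used. The update below is g = (label - sigmoid(f)) * alpha,
// with the sigmoid clipped to 0/1 outside +/-MAX_EXP.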
1104 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1105 if (d == 0) {
1106 target = word;
1107 label = 1;
1108 } else {
1109 next_random = next_random * (unsigned long long)25214903917 + 11;
1110 if(word_to_group != NULL && word_to_group[word] != -1){
1111 target = word;
1112 while(target == word) {
1113 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1114 next_random = next_random * (unsigned long long)25214903917 + 11;
1115 }
1116 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1117 }
1118 else{
1119 target = table[(next_random >> 16) % table_size];
1120 }
1121 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1122 if (target == word) continue;
1123 label = 0;
1124 }
1125 l2 = target * layer1_size;
1126 f = 0;
1127 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
1128 if (f > MAX_EXP) g = (label - 1) * alpha;
1129 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1130 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
1131 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
1132 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
1133 }
1134 // hidden -> in
1135 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1136 c = sentence_position - window + a;
1137 if (c < 0) continue;
1138 if (c >= sentence_length) continue;
1139 last_word = sen[c];
1140 if (last_word == -1) continue;
1141 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
1142 }
1143 }
1144 } else if(type==1) { //train skip-gram
1145 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1146 c = sentence_position - window + a;
1147 if (c < 0) continue;
1148 if (c >= sentence_length) continue;
1149 last_word = sen[c];
1150 if (last_word == -1) continue;
1151 l1 = last_word * layer1_size;
1152 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
1153 // HIERARCHICAL SOFTMAX
1154 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1155 f = 0;
1156 l2 = vocab[word].point[d] * layer1_size;
1157 // Propagate hidden -> output
1158 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
1159 if (f <= -MAX_EXP) continue;
1160 else if (f >= MAX_EXP) continue;
1161 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1162 // 'g' is the gradient multiplied by the learning rate
1163 g = (1 - vocab[word].code[d] - f) * alpha;
1164 // Propagate errors output -> hidden
1165 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
1166 // Learn weights hidden -> output
1167 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
1168 }
1169 // NEGATIVE SAMPLING
1170 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1171 if (d == 0) {
1172 target = word;
1173 label = 1;
1174 } else {
1175 next_random = next_random * (unsigned long long)25214903917 + 11;
1176 if(word_to_group != NULL && word_to_group[word] != -1){
1177 target = word;
1178 while(target == word) {
1179 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1180 next_random = next_random * (unsigned long long)25214903917 + 11;
1181 }
1182 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1183 }
1184 else{
1185 target = table[(next_random >> 16) % table_size];
1186 }
1187 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1188 if (target == word) continue;
1189 label = 0;
1190 }
1191 l2 = target * layer1_size;
1192 f = 0;
1193 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
1194 if (f > MAX_EXP) g = (label - 1) * alpha;
1195 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1196 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
1197 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
1198 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
1199 }
1200 // Learn weights input -> hidden
1201 for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
1202 }
1203 }
1204 else if(type == 2){ //train the cwindow architecture
1205 // in -> hidden
1206 cw = 0;
1207 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1208 c = sentence_position - window + a;
1209 if (c < 0) continue;
1210 if (c >= sentence_length) continue;
1211 last_word = sen[c];
1212 if (last_word == -1) continue;
1213 window_offset = a*layer1_size;
1214 if (a > window) window_offset-=layer1_size;
1215 for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
1216 cw++;
1217 }
1218 if (cw) {
1219 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1220 f = 0;
1221 l2 = vocab[word].point[d] * window_layer_size;
1222 // Propagate hidden -> output
1223 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1_window[c + l2];
1224 if (f <= -MAX_EXP) continue;
1225 else if (f >= MAX_EXP) continue;
1226 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1227 // 'g' is the gradient multiplied by the learning rate
1228 g = (1 - vocab[word].code[d] - f) * alpha;
1229 // Propagate errors output -> hidden
1230 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1_window[c + l2];
1231 // Learn weights hidden -> output
1232 for (c = 0; c < window_layer_size; c++) syn1_window[c + l2] += g * neu1[c];
1233 }
1234 // NEGATIVE SAMPLING
1235 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1236 if (d == 0) {
1237 target = word;
1238 label = 1;
1239 } else {
1240 next_random = next_random * (unsigned long long)25214903917 + 11;
1241 if(word_to_group != NULL && word_to_group[word] != -1){
1242 target = word;
1243 while(target == word) {
1244 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1245 next_random = next_random * (unsigned long long)25214903917 + 11;
1246 }
1247 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1248 }
1249 else{
1250 target = table[(next_random >> 16) % table_size];
1251 }
1252 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1253 if (target == word) continue;
1254 label = 0;
1255 }
1256 l2 = target * window_layer_size;
1257 f = 0;
1258 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1neg_window[c + l2];
1259 if (f > MAX_EXP) g = (label - 1) * alpha;
1260 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1261 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
1262 acc_g+=g;
1263 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1neg_window[c + l2];
1264 for (c = 0; c < window_layer_size; c++) syn1neg_window[c + l2] += g * neu1[c];
1265 }
1266 // hidden -> in
1267 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1268 c = sentence_position - window + a;
1269 if (c < 0) continue;
1270 if (c >= sentence_length) continue;
1271 last_word = sen[c];
1272 if (last_word == -1) continue;
1273 window_offset = a * layer1_size;
1274 if(a > window) window_offset -= layer1_size;
1275 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
1276 }
1277 }
1278 }
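// Structured skip-gram: the input vector neu1 for the current word comes from the
// plain lookup table (rep 0), from the character LSTM (rep 1), or from a cached copy
// of the LSTM output that is recomputed only when the word has been marked for
// refresh (rep 2, the "short term memory" variant).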
1279 else if (type == 3){ //train structured skip-gram
1280 char* c_word = &c_sen[sentence_position*MAX_STRING];
1281 if(rep == 1){
1282 lstmForward(c_word, strlen(c_word),neu1, f_states, b_states, chars);
1283 }
1284 else if(rep == 2){
1285 l1 = word * layer1_size;
1286 if(syn0_in_memory[word]==-1){
1287 syn0_in_memory[word]=0;
1288 lstmForward(c_word, strlen(c_word),&syn0_initial[l1], f_states, b_states, chars);
1289 for (c = 0; c < layer1_size; c++) {syn0[c + l1] = syn0_initial[c + l1];neu1[c] += syn0[c + l1];}
1290 in_mem = 1;
1291 }
1292 else{
1293 for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + l1];
1294 in_mem = 0;
1295 }
1296 }
1297 else{
1298 l1 = word * layer1_size;
1299 for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + l1];
1300 }
1301
1302 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1303 c = sentence_position - window + a;
1304 if (c < 0) continue;
1305 if (c >= sentence_length) continue;
1306 last_word = sen[c];
1307 if (last_word == -1) continue;
1308
1309
1310 window_offset = a * layer1_size;
1311 if(a > window) window_offset -= layer1_size;
1312 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
1313 // HIERARCHICAL SOFTMAX
1314 if (hs) for (d = 0; d < vocab[last_word].codelen; d++) {
1315 f = 0;
1316 l2 = vocab[last_word].point[d] * window_layer_size;
1317 // Propagate hidden -> output
1318 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1_window[c + l2 + window_offset];
1319 if (f <= -MAX_EXP) continue;
1320 else if (f >= MAX_EXP) continue;
1321 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1322 // 'g' is the gradient multiplied by the learning rate
1323 g = (1 - vocab[last_word].code[d] - f) * alpha;
1324 // Propagate errors output -> hidden
1325 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1_window[c + l2 + window_offset];
1326 // Learn weights hidden -> output
1327 for (c = 0; c < layer1_size; c++) syn1_window[c + l2 + window_offset] += g * neu1[c];
1328 }
1329 // NEGATIVE SAMPLING
1330 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1331 if (d == 0) {
1332 target = last_word;
1333 label = 1;
1334 } else {
1335 next_random = next_random * (unsigned long long)25214903917 + 11;
1336 if(word_to_group != NULL && word_to_group[last_word] != -1){
1337 target = last_word;
1338 while(target == last_word) {
1339 target = group_to_table[word_to_group[last_word]*table_size + (next_random >> 16) % table_size];
1340 next_random = next_random * (unsigned long long)25214903917 + 11;
1341 }
1342 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1343 }
1344 else{
1345 target = table[(next_random >> 16) % table_size];
1346 }
1347 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1348 if (target == last_word) continue;
1349 label = 0;
1350 }
1351 l2 = target * window_layer_size;
1352 f = 0;
1353 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg_window[c + l2 + window_offset];
1354 if (f > MAX_EXP) g = (label - 1) * alpha;
1355 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1356 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
1357 acc_g+=g;
1358
1359 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg_window[c + l2 + window_offset];
1360 for (c = 0; c < layer1_size; c++) syn1neg_window[c + l2 + window_offset] += g * neu1[c];
1361
1362 }
1363
1364 }
1365 // Learn weights input -> hidden
1366
1367 if(rep == 1){
1368 lstmBackward(c_word, strlen(c_word),neu1, f_states, b_states, chars, neu1e,f_states_e, b_states_e, chars_e, lstm_params_e);
1369 }
1370 else if(rep == 2){
1371 g = 0;
1372 l1 = word * layer1_size;
1373 for (c = 0; c < layer1_size; c++) {
1374 syn0[c + l1] += neu1e[c];
1375 f = syn0[c + l1] - syn0_initial[c + l1];
1376 if(f > 0){
1377 g+=f;
1378 }
1379 else{
1380 g-=f;
1381 }
1382 }
1383 syn0_in_memory[word] = g;
1384 if(global_divergence == -1){global_divergence = g;}
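// Stochastic refresh: the cached vector is re-fit by the LSTM with probability
// roughly (log(cn)+1)/cn, so frequent words are refreshed less often. g measures how
// far the cached vector has drifted from its LSTM output; global_divergence keeps an
// exponential moving average of that drift but is not used further here.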
1385 long skip_prob = vocab[word].cn-(log(vocab[word].cn)+1);
1386 next_random = next_random * (unsigned long long)25214903917 + 11;
1387
1388 if(skip_prob < next_random%vocab[word].cn){
1389 non_skip++;
1390 if(in_mem == 0){
1391 lstmFitting(c_word, strlen(c_word),neu1, f_states, b_states, chars,&syn0[l1], neu1e,f_states_e, b_states_e, chars_e, lstm_params_e);
1392 }
1393 else{
1394 lstmBackward(c_word, strlen(c_word),neu1, f_states, b_states, chars, neu1e,f_states_e, b_states_e, chars_e, lstm_params_e);
1395 }
1396 syn0_in_memory[word]=-1;
1397 }
1398 else{
1399 skip++;
1400 }
1401 global_divergence = global_divergence*0.9 + g*0.1;
1402 }
1403 else{
1404 l1 = word * layer1_size;
1405 for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
1406 }
1407 }
1408 else if(type == 4){ //training senna
1409 // in -> hidden
1410 cw = 0;
1411 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1412 c = sentence_position - window + a;
1413 if (c < 0) continue;
1414 if (c >= sentence_length) continue;
1415 last_word = sen[c];
1416 if (last_word == -1) continue;
1417 window_offset = a*layer1_size;
1418 if (a > window) window_offset-=layer1_size;
1419 for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
1420 cw++;
1421 }
1422 if (cw) {
1423 for (a = 0; a < window_hidden_size; a++){
1424 c = a*window_layer_size;
1425 for(b = 0; b < window_layer_size; b++){
1426 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1427 }
1428 }
1429 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1430 f = 0;
1431 l2 = vocab[word].point[d] * window_hidden_size;
1432 // Propagate hidden -> output
1433 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1434 if (f <= -MAX_EXP) continue;
1435 else if (f >= MAX_EXP) continue;
1436 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1437 // 'g' is the gradient multiplied by the learning rate
1438 g = (1 - vocab[word].code[d] - f) * alpha;
1439 // Propagate errors output -> hidden
1440 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word[c + l2];
1441 // Learn weights hidden -> output
1442 for (c = 0; c < window_hidden_size; c++) syn_hidden_word[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1443 }
1444 // NEGATIVE SAMPLING
1445 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1446 if (d == 0) {
1447 target = word;
1448 label = 1;
1449 } else {
1450 next_random = next_random * (unsigned long long)25214903917 + 11;
1451 if(word_to_group != NULL && word_to_group[word] != -1){
1452 target = word;
1453 while(target == word) {
1454 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1455 next_random = next_random * (unsigned long long)25214903917 + 11;
1456 }
1457 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1458 }
1459 else{
1460 target = table[(next_random >> 16) % table_size];
1461 }
1462 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1463 if (target == word) continue;
1464 label = 0;
1465 }
1466 l2 = target * window_hidden_size;
1467 f = 0;
1468 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
1469 if (f > MAX_EXP) g = (label - 1) * alpha / negative;
1470 else if (f < -MAX_EXP) g = (label - 0) * alpha / negative;
1471 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha / negative;
1472 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word_neg[c + l2];
1473 for (c = 0; c < window_hidden_size; c++) syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1474 }
1475 for (a = 0; a < window_hidden_size; a++)
1476 for(b = 0; b < window_layer_size; b++)
1477 neu1e[b] += neu2e[a] * syn_window_hidden[a*window_layer_size + b];
1478 for (a = 0; a < window_hidden_size; a++)
1479 for(b = 0; b < window_layer_size; b++)
1480 syn_window_hidden[a*window_layer_size + b] += neu2e[a] * neu1[b];
1481 // hidden -> in
1482 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1483 c = sentence_position - window + a;
1484 if (c < 0) continue;
1485 if (c >= sentence_length) continue;
1486 last_word = sen[c];
1487 if (last_word == -1) continue;
1488 window_offset = a * layer1_size;
1489 if(a > window) window_offset -= layer1_size;
1490 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
1491 }
1492 }
1493 }
1494 else{
1495 printf("unknown type %i", type);
1496 exit(0);
1497 }
1498 sentence_position++;
1499 if (sentence_position >= sentence_length) {
1500 sentence_length = 0;
1501 continue;
1502 }
1503 }
1504 fclose(fi);
1505 free(neu1);
1506 free(neu1e);
1507 pthread_exit(NULL);
1508}
1509
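// Driver: learns or reads the vocabulary, initializes the network and sampling tables,
// runs num_threads training threads, and then writes either the word vectors (computed
// through the character LSTM when rep is 1 or 2) or K-means classes to output_file.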
1510void TrainModel() {
1511 long a, b, c, d;
1512 FILE *fo;
1513 pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
1514 printf("Starting training using file %s\n", train_file);
1515 starting_alpha = alpha;
1516 if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
1517 if (save_vocab_file[0] != 0) SaveVocab();
1518 if (output_file[0] == 0) return;
1519 InitNet();
1520 if (negative > 0) InitUnigramTable();
1521 if (negative_classes_file[0] != 0) InitClassUnigramTable();
1522 start = clock();
1523 for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
1524 for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
1525 fo = fopen(output_file, "wb");
1526 if (classes == 0) {
1527 // Save the word vectors
1528 real *f_states = (real *)calloc((c_state_size * 7) * (MAX_STRING + 1), sizeof(real));
1529 real *b_states = (real *)calloc((c_state_size * 7) * (MAX_STRING + 1), sizeof(real));
1530 real *chars = (real *)calloc(c_proj_size * MAX_STRING, sizeof(real));
1531 real *neu1 = (real *)calloc(layer1_size * MAX_STRING, sizeof(real));
1532
1533 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1534 for (a = 0; a < vocab_size; a++) {
1535 fprintf(fo, "%s ", vocab[a].word);
1536 if(rep == 1 || rep == 2){
1537 for (b = 0; b < layer1_size; b++) {neu1[b]=0;}
1538 lstmForward(vocab[a].word, strlen(vocab[a].word),neu1, f_states,b_states,chars);
1539 if (binary) for (b = 0; b < layer1_size; b++) fwrite(&neu1[b], sizeof(real), 1, fo);
1540 else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", neu1[b]);
1541 }
1542 else{
1543 if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1544 else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1545 }
1546 fprintf(fo, "\n");
1547 }
1548 } else {
1549 // Run K-means on the word vectors
1550 int clcn = classes, iter = 10, closeid;
1551 int *centcn = (int *)malloc(classes * sizeof(int));
1552 int *cl = (int *)calloc(vocab_size, sizeof(int));
1553 real closev, x;
1554 real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
1555 for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
1556 for (a = 0; a < iter; a++) {
1557 for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
1558 for (b = 0; b < clcn; b++) centcn[b] = 1;
1559 for (c = 0; c < vocab_size; c++) {
1560 for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1561 centcn[cl[c]]++;
1562 }
1563 for (b = 0; b < clcn; b++) {
1564 closev = 0;
1565 for (c = 0; c < layer1_size; c++) {
1566 cent[layer1_size * b + c] /= centcn[b];
1567 closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
1568 }
1569 closev = sqrt(closev);
1570 for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
1571 }
1572 for (c = 0; c < vocab_size; c++) {
1573 closev = -10;
1574 closeid = 0;
1575 for (d = 0; d < clcn; d++) {
1576 x = 0;
1577 for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
1578 if (x > closev) {
1579 closev = x;
1580 closeid = d;
1581 }
1582 }
1583 cl[c] = closeid;
1584 }
1585 }
1586 // Save the K-means classes
1587 for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1588 free(centcn);
1589 free(cent);
1590 free(cl);
1591 }
1592 fclose(fo);
1593}
1594
1595int ArgPos(char *str, int argc, char **argv) {
1596 int a;
1597 for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
1598 if (a == argc - 1) {
1599 printf("Argument missing for %s\n", str);
1600 exit(1);
1601 }
1602 return a;
1603 }
1604 return -1;
1605}
1606
1607int main(int argc, char **argv) {
1608 int i;
1609 if (argc == 1) {
1610 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1611 printf("Options:\n");
1612 printf("Parameters for training:\n");
1613 printf("\t-train <file>\n");
1614 printf("\t\tUse text data from <file> to train the model\n");
1615 printf("\t-output <file>\n");
1616 printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
1617 printf("\t-size <int>\n");
1618 printf("\t\tSet size of word vectors; default is 100\n");
1619 printf("\t-window <int>\n");
1620 printf("\t\tSet max skip length between words; default is 5\n");
1621 printf("\t-sample <float>\n");
1622 printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1623 printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1624 printf("\t-hs <int>\n");
1625 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1626 printf("\t-negative <int>\n");
1627 printf("\t-negative-classes <file>\n");
1628 printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1629 printf("\t-threads <int>\n");
1630 printf("\t\tUse <int> threads (default 12)\n");
1631 printf("\t-iter <int>\n");
1632 printf("\t\tRun more training iterations (default 5)\n");
1633 printf("\t-min-count <int>\n");
1634 printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
1635 printf("\t-alpha <float>\n");
1636 printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1637 printf("\t-classes <int>\n");
1638 printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1639 printf("\t-debug <int>\n");
1640 printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
1641 printf("\t-binary <int>\n");
1642 printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1643 printf("\t-save-vocab <file>\n");
1644 printf("\t\tThe vocabulary will be saved to <file>\n");
1645 printf("\t-read-vocab <file>\n");
1646 printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1647 printf("\t-type <int>\n");
1648 printf("\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1649 printf("\t-rep <int>\n");
1650 printf("\t\tType of word rep (0 for word, 1 for character, 2 for character with short term memory\n");
1651 printf("\t-char-state-dim <int>\n");
1652 printf("\t\tcharacter state size\n");
1653 printf("\t-char-proj-dim <int>\n");
1654 printf("\t\tcharacter projection size\n");
1655 printf("\nExamples:\n");
1656 printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1657 return 0;
1658 }
1659 output_file[0] = 0;
1660 save_vocab_file[0] = 0;
1661 read_vocab_file[0] = 0;
1662 negative_classes_file[0] = 0;
1663 if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
1664 if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
1665 if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
1666 if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
1667 if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
1668 if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
1669 if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
1670 if (type==0 || type==2 || type==4) alpha = 0.05;
1671 if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
1672 if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
1673 if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
1674 if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
1675 if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
1676 if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
1677 if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
1678 if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
1679 if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
1680 if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
1681 if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
1682 if ((i = ArgPos((char *)"-rep", argc, argv)) > 0) rep = atoi(argv[i + 1]);
1683 if ((i = ArgPos((char *)"-char-state-dim", argc, argv)) > 0) {c_state_size = atoi(argv[i + 1]); c_cell_size = c_state_size;}
1684 if ((i = ArgPos((char *)"-char-proj-dim", argc, argv)) > 0) {c_proj_size = atoi(argv[i + 1]);}
1685 vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
1686 vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
1687 expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1688 tanhTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1689 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1690 expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1691 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1692 tanhTable[i] = tanh((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);
1693 }
1694 TrainModel();
1695 return 0;
1696}