Blame - word2vec.c - ids-kl/dereko2vec

blob: 67d98468a9ad160f318619533030fba759394529 [file] [log] [blame]

Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1	// Copyright 2013 Google Inc. All Rights Reserved.
				2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
				15	#include <stdio.h>
				16	#include <stdlib.h>
				17	#include <string.h>
				18	#include <math.h>
				19	#include <pthread.h>
				20
				// Compile-time limits used throughout this file.
				#define MAX_STRING 100            // size of the word and file-name buffers (terminating NUL included)
				#define EXP_TABLE_SIZE 1000       // scale used when turning a dot product into an expTable index
				#define MAX_EXP 6                 // dot products outside (-MAX_EXP, MAX_EXP) do not go through expTable
				#define MAX_SENTENCE_LENGTH 1000  // capacity of the per-thread sentence buffer
				#define MAX_CODE_LENGTH 40        // entries allocated for each word's Huffman code / point arrays

				const int vocab_hash_size = 30000000;  // Maximum 30 * 0.7 = 21M words in the vocabulary

				typedef float real;  // Precision of float numbers
				30
				// One vocabulary entry.
				//   cn      - occurrence count of the word
				//   point   - array of MAX_CODE_LENGTH ints filled by CreateBinaryTree (Huffman tree path)
				//   word    - heap-allocated, NUL-terminated spelling of the word
				//   code    - array of MAX_CODE_LENGTH chars holding the Huffman code bits
				//   codelen - number of valid entries in code
				struct vocab_word {
				  long long cn;
				  int *point;
				  char *word, *code, codelen;
				};
				36
				37	char train_file[MAX_STRING], output_file[MAX_STRING];
				38	char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
				39	struct vocab_word *vocab;
				40	int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
				41	int *vocab_hash;
				42	long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
				43	long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
				44	real alpha = 0.025, starting_alpha, sample = 1e-3;
				45	real syn0, syn1, syn1neg, syn1nce, *expTable;
				46	clock_t start;
				47
				48	real syn1_window, syn1neg_window, *syn1nce_window;
				49	int w_offset, window_layer_size;
				50
				51	int window_hidden_size = 500;
				52	real syn_window_hidden, syn_hidden_word, syn_hidden_word_neg, syn_hidden_word_nce;
				53
				54	int hs = 0, negative = 5;
				55	const int table_size = 1e8;
				56	int *table;
				57
				58	//constrastive negative sampling
				59	char negative_classes_file[MAX_STRING];
				60	int *word_to_group;
				61	int group_to_table; //group_sizetable_size
				62	int class_number;
				63
				64	//nce
				65	real* noise_distribution;
				66	int nce = 0;
				67
				68	//param caps
				69	real CAP_VALUE = 50;
				70	int cap = 0;
				71
				72	void capParam(real* array, int index){
				73	if(array[index] > CAP_VALUE)
				74	array[index] = CAP_VALUE;
				75	else if(array[index] < -CAP_VALUE)
				76	array[index] = -CAP_VALUE;
				77	}
				78
				79	real hardTanh(real x){
				80	if(x>=1){
				81	return 1;
				82	}
				83	else if(x<=-1){
				84	return -1;
				85	}
				86	else{
				87	return x;
				88	}
				89	}
				90
				91	real dHardTanh(real x, real g){
				92	if(x > 1 && g > 0){
				93	return 0;
				94	}
				95	if(x < -1 && g < 0){
				96	return 0;
				97	}
				98	return 1;
				99	}
				100
				101	void InitUnigramTable() {
				102	int a, i;
				103	long long train_words_pow = 0;
				104	real d1, power = 0.75;
				105	table = (int )malloc(table_size sizeof(int));
				106	for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
				107	i = 0;
				108	d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
				109	for (a = 0; a < table_size; a++) {
				110	table[a] = i;
				111	if (a / (real)table_size > d1) {
				112	i++;
				113	d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
				114	}
				115	if (i >= vocab_size) i = vocab_size - 1;
				116	}
				117
				118	noise_distribution = (real *)calloc(vocab_size, sizeof(real));
				119	for (a = 0; a < vocab_size; a++) noise_distribution[a] = pow(vocab[a].cn, power)/(real)train_words_pow;
				120	}
				121
				122	// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
				123	void ReadWord(char word, FILE fin) {
				124	int a = 0, ch;
				125	while (!feof(fin)) {
				126	ch = fgetc(fin);
				127	if (ch == 13) continue;
				128	if ((ch == ' ') \|\| (ch == '\t') \|\| (ch == '\n')) {
				129	if (a > 0) {
				130	if (ch == '\n') ungetc(ch, fin);
				131	break;
				132	}
				133	if (ch == '\n') {
				134	strcpy(word, (char *)"</s>");
				135	return;
				136	} else continue;
				137	}
				138	word[a] = ch;
				139	a++;
				140	if (a >= MAX_STRING - 1) a--; // Truncate too long words
				141	}
				142	word[a] = 0;
				143	}
				144
				145	// Returns hash value of a word
				146	int GetWordHash(char *word) {
				147	unsigned long long a, hash = 0;
				148	for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
				149	hash = hash % vocab_hash_size;
				150	return hash;
				151	}
				152
				153	// Returns position of a word in the vocabulary; if the word is not found, returns -1
				154	int SearchVocab(char *word) {
				155	unsigned int hash = GetWordHash(word);
				156	while (1) {
				157	if (vocab_hash[hash] == -1) return -1;
				158	if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
				159	hash = (hash + 1) % vocab_hash_size;
				160	}
				161	return -1;
				162	}
				163
				164	// Reads a word and returns its index in the vocabulary
				165	int ReadWordIndex(FILE *fin) {
				166	char word[MAX_STRING];
				167	ReadWord(word, fin);
				168	if (feof(fin)) return -1;
				169	return SearchVocab(word);
				170	}
				171
				172	// Adds a word to the vocabulary
				173	int AddWordToVocab(char *word) {
				174	unsigned int hash, length = strlen(word) + 1;
				175	if (length > MAX_STRING) length = MAX_STRING;
				176	vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
				177	strcpy(vocab[vocab_size].word, word);
				178	vocab[vocab_size].cn = 0;
				179	vocab_size++;
				180	// Reallocate memory if needed
				181	if (vocab_size + 2 >= vocab_max_size) {
				182	vocab_max_size += 1000;
				183	vocab = (struct vocab_word )realloc(vocab, vocab_max_size sizeof(struct vocab_word));
				184	}
				185	hash = GetWordHash(word);
				186	while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
				187	vocab_hash[hash] = vocab_size - 1;
				188	return vocab_size - 1;
				189	}
				190
				191	// Used later for sorting by word counts
				192	int VocabCompare(const void a, const void b) {
				193	return ((struct vocab_word )b)->cn - ((struct vocab_word )a)->cn;
				194	}
				195
				196	// Sorts the vocabulary by frequency using word counts
				197	void SortVocab() {
				198	int a, size;
				199	unsigned int hash;
				200	// Sort the vocabulary and keep </s> at the first position
				201	qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
				202	for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
				203	size = vocab_size;
				204	train_words = 0;
				205	for (a = 0; a < size; a++) {
				206	// Words occuring less than min_count times will be discarded from the vocab
				207	if ((vocab[a].cn < min_count) && (a != 0)) {
				208	vocab_size--;
				209	free(vocab[a].word);
				210	} else {
				211	// Hash will be re-computed, as after the sorting it is not actual
				212	hash=GetWordHash(vocab[a].word);
				213	while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
				214	vocab_hash[hash] = a;
				215	train_words += vocab[a].cn;
				216	}
				217	}
				218	vocab = (struct vocab_word )realloc(vocab, (vocab_size + 1) sizeof(struct vocab_word));
				219	// Allocate memory for the binary tree construction
				220	for (a = 0; a < vocab_size; a++) {
				221	vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
				222	vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
				223	}
				224	}
				225
				226	// Reduces the vocabulary by removing infrequent tokens
				227	void ReduceVocab() {
				228	int a, b = 0;
				229	unsigned int hash;
				230	for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
				231	vocab[b].cn = vocab[a].cn;
				232	vocab[b].word = vocab[a].word;
				233	b++;
				234	} else free(vocab[a].word);
				235	vocab_size = b;
				236	for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
				237	for (a = 0; a < vocab_size; a++) {
				238	// Hash will be re-computed, as it is not actual
				239	hash = GetWordHash(vocab[a].word);
				240	while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
				241	vocab_hash[hash] = a;
				242	}
				243	fflush(stdout);
				244	min_reduce++;
				245	}
				246
				247	// Create binary Huffman tree using the word counts
				248	// Frequent words will have short uniqe binary codes
				249	void CreateBinaryTree() {
				250	long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
				251	char code[MAX_CODE_LENGTH];
				252	long long count = (long long )calloc(vocab_size * 2 + 1, sizeof(long long));
				253	long long binary = (long long )calloc(vocab_size * 2 + 1, sizeof(long long));
				254	long long parent_node = (long long )calloc(vocab_size * 2 + 1, sizeof(long long));
				255	for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
				256	for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
				257	pos1 = vocab_size - 1;
				258	pos2 = vocab_size;
				259	// Following algorithm constructs the Huffman tree by adding one node at a time
				260	for (a = 0; a < vocab_size - 1; a++) {
				261	// First, find two smallest nodes 'min1, min2'
				262	if (pos1 >= 0) {
				263	if (count[pos1] < count[pos2]) {
				264	min1i = pos1;
				265	pos1--;
				266	} else {
				267	min1i = pos2;
				268	pos2++;
				269	}
				270	} else {
				271	min1i = pos2;
				272	pos2++;
				273	}
				274	if (pos1 >= 0) {
				275	if (count[pos1] < count[pos2]) {
				276	min2i = pos1;
				277	pos1--;
				278	} else {
				279	min2i = pos2;
				280	pos2++;
				281	}
				282	} else {
				283	min2i = pos2;
				284	pos2++;
				285	}
				286	count[vocab_size + a] = count[min1i] + count[min2i];
				287	parent_node[min1i] = vocab_size + a;
				288	parent_node[min2i] = vocab_size + a;
				289	binary[min2i] = 1;
				290	}
				291	// Now assign binary code to each vocabulary word
				292	for (a = 0; a < vocab_size; a++) {
				293	b = a;
				294	i = 0;
				295	while (1) {
				296	code[i] = binary[b];
				297	point[i] = b;
				298	i++;
				299	b = parent_node[b];
				300	if (b == vocab_size * 2 - 2) break;
				301	}
				302	vocab[a].codelen = i;
				303	vocab[a].point[0] = vocab_size - 2;
				304	for (b = 0; b < i; b++) {
				305	vocab[a].code[i - b - 1] = code[b];
				306	vocab[a].point[i - b] = point[b] - vocab_size;
				307	}
				308	}
				309	free(count);
				310	free(binary);
				311	free(parent_node);
				312	}
				313
				314	void LearnVocabFromTrainFile() {
				315	char word[MAX_STRING];
				316	FILE *fin;
				317	long long a, i;
				318	for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
				319	fin = fopen(train_file, "rb");
				320	if (fin == NULL) {
				321	printf("ERROR: training data file not found!\n");
				322	exit(1);
				323	}
				324	vocab_size = 0;
				325	AddWordToVocab((char *)"</s>");
				326	while (1) {
				327	ReadWord(word, fin);
				328	if (feof(fin)) break;
				329	train_words++;
				330	if ((debug_mode > 1) && (train_words % 100000 == 0)) {
				331	printf("%lldK%c", train_words / 1000, 13);
				332	fflush(stdout);
				333	}
				334	i = SearchVocab(word);
				335	if (i == -1) {
				336	a = AddWordToVocab(word);
				337	vocab[a].cn = 1;
				338	} else vocab[i].cn++;
				339	if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
				340	}
				341	SortVocab();
				342	if (debug_mode > 0) {
				343	printf("Vocab size: %lld\n", vocab_size);
				344	printf("Words in train file: %lld\n", train_words);
				345	}
				346	file_size = ftell(fin);
				347	fclose(fin);
				348	}
				349
				350	void SaveVocab() {
				351	long long i;
				352	FILE *fo = fopen(save_vocab_file, "wb");
				353	for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
				354	fclose(fo);
				355	}
				356
				357	void ReadVocab() {
				358	long long a, i = 0;
				359	char c;
				360	char word[MAX_STRING];
				361	FILE *fin = fopen(read_vocab_file, "rb");
				362	if (fin == NULL) {
				363	printf("Vocabulary file not found\n");
				364	exit(1);
				365	}
				366	for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
				367	vocab_size = 0;
				368	while (1) {
				369	ReadWord(word, fin);
				370	if (feof(fin)) break;
				371	a = AddWordToVocab(word);
				372	fscanf(fin, "%lld%c", &vocab[a].cn, &c);
				373	i++;
				374	}
				375	SortVocab();
				376	if (debug_mode > 0) {
				377	printf("Vocab size: %lld\n", vocab_size);
				378	printf("Words in train file: %lld\n", train_words);
				379	}
				380	fin = fopen(train_file, "rb");
				381	if (fin == NULL) {
				382	printf("ERROR: training data file not found!\n");
				383	exit(1);
				384	}
				385	fseek(fin, 0, SEEK_END);
				386	file_size = ftell(fin);
				387	fclose(fin);
				388	}
				389
				390	void InitClassUnigramTable() {
				391	long long a,c;
				392	printf("loading class unigrams \n");
				393	FILE *fin = fopen(negative_classes_file, "rb");
				394	if (fin == NULL) {
				395	printf("ERROR: class file not found!\n");
				396	exit(1);
				397	}
				398	word_to_group = (int )malloc(vocab_size sizeof(int));
				399	for(a = 0; a < vocab_size; a++) word_to_group[a] = -1;
				400	char class[MAX_STRING];
				401	char prev_class[MAX_STRING];
				402	prev_class[0] = 0;
				403	char word[MAX_STRING];
				404	class_number = -1;
				405	while (1) {
				406	if (feof(fin)) break;
				407	ReadWord(class, fin);
				408	ReadWord(word, fin);
				409	int word_index = SearchVocab(word);
				410	if (word_index != -1){
				411	if(strcmp(class, prev_class) != 0){
				412	class_number++;
				413	strcpy(prev_class, class);
				414	}
				415	word_to_group[word_index] = class_number;
				416	}
				417	ReadWord(word, fin);
				418	}
				419	class_number++;
				420	fclose(fin);
				421
				422	group_to_table = (int )malloc(table_size class_number * sizeof(int));
				423	long long train_words_pow = 0;
				424	real d1, power = 0.75;
				425
				426	for(c = 0; c < class_number; c++){
				427	long long offset = c * table_size;
				428	train_words_pow = 0;
				429	for (a = 0; a < vocab_size; a++) if(word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
				430	int i = 0;
				431	while(word_to_group[i]!=c && i < vocab_size) i++;
				432	d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
				433	for (a = 0; a < table_size; a++) {
				434	//printf("index %lld , word %d\n", a, i);
				435	group_to_table[offset + a] = i;
				436	if (a / (real)table_size > d1) {
				437	i++;
				438	while(word_to_group[i]!=c && i < vocab_size) i++;
				439	d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
				440	}
				441	if (i >= vocab_size) while(word_to_group[i]!=c && i >= 0) i--;
				442	}
				443	}
				444	}
				445
				446	void InitNet() {
				447	long long a, b;
				448	unsigned long long next_random = 1;
				449	window_layer_size = layer1_sizewindow2;
				450	a = posix_memalign((void *)&syn0, 128, (long long)vocab_size layer1_size * sizeof(real));
				451	if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
				452
				453	if (hs) {
				454	a = posix_memalign((void *)&syn1, 128, (long long)vocab_size layer1_size * sizeof(real));
				455	if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
				456	a = posix_memalign((void *)&syn1_window, 128, (long long)vocab_size window_layer_size * sizeof(real));
				457	if (syn1_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
				458	a = posix_memalign((void *)&syn_hidden_word, 128, (long long)vocab_size window_hidden_size * sizeof(real));
				459	if (syn_hidden_word == NULL) {printf("Memory allocation failed\n"); exit(1);}
				460
				461	for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
				462	syn1[a * layer1_size + b] = 0;
				463	for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
				464	syn1_window[a * window_layer_size + b] = 0;
				465	for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
				466	syn_hidden_word[a * window_hidden_size + b] = 0;
				467	}
				468	if (negative>0) {
				469	a = posix_memalign((void *)&syn1neg, 128, (long long)vocab_size layer1_size * sizeof(real));
				470	if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
				471	a = posix_memalign((void *)&syn1neg_window, 128, (long long)vocab_size window_layer_size * sizeof(real));
				472	if (syn1neg_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
				473	a = posix_memalign((void *)&syn_hidden_word_neg, 128, (long long)vocab_size window_hidden_size * sizeof(real));
				474	if (syn_hidden_word_neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
				475
				476	for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
				477	syn1neg[a * layer1_size + b] = 0;
				478	for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
				479	syn1neg_window[a * window_layer_size + b] = 0;
				480	for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
				481	syn_hidden_word_neg[a * window_hidden_size + b] = 0;
				482	}
				483	if (nce>0) {
				484	a = posix_memalign((void *)&syn1nce, 128, (long long)vocab_size layer1_size * sizeof(real));
				485	if (syn1nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
				486	a = posix_memalign((void *)&syn1nce_window, 128, (long long)vocab_size window_layer_size * sizeof(real));
				487	if (syn1nce_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
				488	a = posix_memalign((void *)&syn_hidden_word_nce, 128, (long long)vocab_size window_hidden_size * sizeof(real));
				489	if (syn_hidden_word_nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
				490
				491	for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
				492	syn1nce[a * layer1_size + b] = 0;
				493	for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
				494	syn1nce_window[a * window_layer_size + b] = 0;
				495	for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
				496	syn_hidden_word_nce[a * window_hidden_size + b] = 0;
				497	}
				498	for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
				499	next_random = next_random * (unsigned long long)25214903917 + 11;
				500	syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
				501	}
				502
				503	a = posix_memalign((void *)&syn_window_hidden, 128, window_hidden_size window_layer_size * sizeof(real));
				504	if (syn_window_hidden == NULL) {printf("Memory allocation failed\n"); exit(1);}
				505	for (a = 0; a < window_hidden_size * window_layer_size; a++){
				506	next_random = next_random * (unsigned long long)25214903917 + 11;
				507	syn_window_hidden[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (window_hidden_size*window_layer_size);
				508	}
				509
				510	CreateBinaryTree();
				511	}
				512
				513	void TrainModelThread(void id) {
				514	long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
				515	long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
				516	long long l1, l2, c, target, label, local_iter = iter;
				517	unsigned long long next_random = (long long)id;
				518	real f, g;
				519	clock_t now;
				520	int input_len_1 = layer1_size;
				521	int window_offset = -1;
				522	if(type == 2 \|\| type == 4){
				523	input_len_1=window_layer_size;
				524	}
				525	real neu1 = (real )calloc(input_len_1, sizeof(real));
				526	real neu1e = (real )calloc(input_len_1, sizeof(real));
				527
				528	int input_len_2 = 0;
				529	if(type == 4){
				530	input_len_2 = window_hidden_size;
				531	}
				532	real neu2 = (real )calloc(input_len_2, sizeof(real));
				533	real neu2e = (real )calloc(input_len_2, sizeof(real));
				534
				535	FILE *fi = fopen(train_file, "rb");
				536	fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
				537	while (1) {
				538	if (word_count - last_word_count > 10000) {
				539	word_count_actual += word_count - last_word_count;
				540	last_word_count = word_count;
				541	if ((debug_mode > 1)) {
				542	now=clock();
				543	printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
				544	word_count_actual / (real)(iter * train_words + 1) * 100,
				545	word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
				546	fflush(stdout);
				547	}
				548	alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
				549	if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
				550	}
				551	if (sentence_length == 0) {
				552	while (1) {
				553	word = ReadWordIndex(fi);
				554	if (feof(fi)) break;
				555	if (word == -1) continue;
				556	word_count++;
				557	if (word == 0) break;
				558	// The subsampling randomly discards frequent words while keeping the ranking same
				559	if (sample > 0) {
				560	real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
				561	next_random = next_random * (unsigned long long)25214903917 + 11;
				562	if (ran < (next_random & 0xFFFF) / (real)65536) continue;
				563	}
				564	sen[sentence_length] = word;
				565	sentence_length++;
				566	if (sentence_length >= MAX_SENTENCE_LENGTH) break;
				567	}
				568	sentence_position = 0;
				569	}
				570	if (feof(fi) \|\| (word_count > train_words / num_threads)) {
				571	word_count_actual += word_count - last_word_count;
				572	local_iter--;
				573	if (local_iter == 0) break;
				574	word_count = 0;
				575	last_word_count = 0;
				576	sentence_length = 0;
				577	fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
				578	continue;
				579	}
				580	word = sen[sentence_position];
				581	if (word == -1) continue;
				582	for (c = 0; c < input_len_1; c++) neu1[c] = 0;
				583	for (c = 0; c < input_len_1; c++) neu1e[c] = 0;
				584	for (c = 0; c < input_len_2; c++) neu2[c] = 0;
				585	for (c = 0; c < input_len_2; c++) neu2e[c] = 0;
				586	next_random = next_random * (unsigned long long)25214903917 + 11;
				587	b = next_random % window;
				588	if (type == 0) { //train the cbow architecture
				589	// in -> hidden
				590	cw = 0;
				591	for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
				592	c = sentence_position - window + a;
				593	if (c < 0) continue;
				594	if (c >= sentence_length) continue;
				595	last_word = sen[c];
				596	if (last_word == -1) continue;
				597	for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
				598	cw++;
				599	}
				600	if (cw) {
				601	for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
				602	if (hs) for (d = 0; d < vocab[word].codelen; d++) {
				603	f = 0;
				604	l2 = vocab[word].point[d] * layer1_size;
				605	// Propagate hidden -> output
				606	for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
				607	if (f <= -MAX_EXP) continue;
				608	else if (f >= MAX_EXP) continue;
				609	else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
				610	// 'g' is the gradient multiplied by the learning rate
				611	g = (1 - vocab[word].code[d] - f) * alpha;
				612	// Propagate errors output -> hidden
				613	for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
				614	// Learn weights hidden -> output
				615	for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
				616	if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
				617	}
				618	// NEGATIVE SAMPLING
				619	if (negative > 0) for (d = 0; d < negative + 1; d++) {
				620	if (d == 0) {
				621	target = word;
				622	label = 1;
				623	} else {
				624	next_random = next_random * (unsigned long long)25214903917 + 11;
				625	if(word_to_group != NULL && word_to_group[word] != -1){
				626	target = word;
				627	while(target == word) {
				628	target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
				629	next_random = next_random * (unsigned long long)25214903917 + 11;
				630	}
				631	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				632	}
				633	else{
				634	target = table[(next_random >> 16) % table_size];
				635	}
				636	if (target == 0) target = next_random % (vocab_size - 1) + 1;
				637	if (target == word) continue;
				638	label = 0;
				639	}
				640	l2 = target * layer1_size;
				641	f = 0;
				642	for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
				643	if (f > MAX_EXP) g = (label - 1) * alpha;
				644	else if (f < -MAX_EXP) g = (label - 0) * alpha;
				645	else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
				646	for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
				647	for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
				648	if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
				649	}
				650	// Noise Contrastive Estimation
				651	if (nce > 0) for (d = 0; d < nce + 1; d++) {
				652	if (d == 0) {
				653	target = word;
				654	label = 1;
				655	} else {
				656	next_random = next_random * (unsigned long long)25214903917 + 11;
				657	if(word_to_group != NULL && word_to_group[word] != -1){
				658	target = word;
				659	while(target == word) {
				660	target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
				661	next_random = next_random * (unsigned long long)25214903917 + 11;
				662	}
				663	}
				664	else{
				665	target = table[(next_random >> 16) % table_size];
				666	}
				667	if (target == 0) target = next_random % (vocab_size - 1) + 1;
				668	if (target == word) continue;
				669	label = 0;
				670	}
				671	l2 = target * layer1_size;
				672	f = 0;
				673
				674	for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
				675	if (f > MAX_EXP) g = (label - 1) * alpha;
				676	else if (f < -MAX_EXP) g = (label - 0) * alpha;
				677	else {
				678	f = exp(f);
				679	g = (label - f/(noise_distribution[target]nce + f)) alpha;
				680	}
				681	for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
				682	for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
				683	if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce,c + l2);
				684	}
				685	// hidden -> in
				686	for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
				687	c = sentence_position - window + a;
				688	if (c < 0) continue;
				689	if (c >= sentence_length) continue;
				690	last_word = sen[c];
				691	if (last_word == -1) continue;
				692	for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
				693	}
				694	}
				695	} else if(type==1) { //train skip-gram
				696	for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
				697	c = sentence_position - window + a;
				698	if (c < 0) continue;
				699	if (c >= sentence_length) continue;
				700	last_word = sen[c];
				701	if (last_word == -1) continue;
				702	l1 = last_word * layer1_size;
				703	for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
				704	// HIERARCHICAL SOFTMAX
				705	if (hs) for (d = 0; d < vocab[word].codelen; d++) {
				706	f = 0;
				707	l2 = vocab[word].point[d] * layer1_size;
				708	// Propagate hidden -> output
				709	for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
				710	if (f <= -MAX_EXP) continue;
				711	else if (f >= MAX_EXP) continue;
				712	else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
				713	// 'g' is the gradient multiplied by the learning rate
				714	g = (1 - vocab[word].code[d] - f) * alpha;
				715	// Propagate errors output -> hidden
				716	for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
				717	// Learn weights hidden -> output
				718	for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
				719	if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
				720	}
				721	// NEGATIVE SAMPLING
				722	if (negative > 0) for (d = 0; d < negative + 1; d++) {
				723	if (d == 0) {
				724	target = word;
				725	label = 1;
				726	} else {
				727	next_random = next_random * (unsigned long long)25214903917 + 11;
				728	if(word_to_group != NULL && word_to_group[word] != -1){
				729	target = word;
				730	while(target == word) {
				731	target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
				732	next_random = next_random * (unsigned long long)25214903917 + 11;
				733	}
				734	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				735	}
				736	else{
				737	target = table[(next_random >> 16) % table_size];
				738	}
				739	if (target == 0) target = next_random % (vocab_size - 1) + 1;
				740	if (target == word) continue;
				741	label = 0;
				742	}
				743	l2 = target * layer1_size;
				744	f = 0;
				745	for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
				746	if (f > MAX_EXP) g = (label - 1) * alpha;
				747	else if (f < -MAX_EXP) g = (label - 0) * alpha;
				748	else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
				749	for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
				750	for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
				751	if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
				752	}
				753	//Noise Contrastive Estimation
				754	if (nce > 0) for (d = 0; d < nce + 1; d++) {
				755	if (d == 0) {
				756	target = word;
				757	label = 1;
				758	} else {
				759	next_random = next_random * (unsigned long long)25214903917 + 11;
				760	if(word_to_group != NULL && word_to_group[word] != -1){
				761	target = word;
				762	while(target == word) {
				763	target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
				764	next_random = next_random * (unsigned long long)25214903917 + 11;
				765	}
				766	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				767	}
				768	else{
				769	target = table[(next_random >> 16) % table_size];
				770	}
				771	if (target == 0) target = next_random % (vocab_size - 1) + 1;
				772	if (target == word) continue;
				773	label = 0;
				774	}
				775	l2 = target * layer1_size;
				776	f = 0;
				777	for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce[c + l2];
				778	if (f > MAX_EXP) g = (label - 1) * alpha;
				779	else if (f < -MAX_EXP) g = (label - 0) * alpha;
				780	else {
				781	f = exp(f);
				782	g = (label - f/(noise_distribution[target]nce + f)) alpha;
				783	}
				784	for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
				785	for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * syn0[c + l1];
				786	if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
				787	}
				788	// Learn weights input -> hidden
				789	for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
				790	}
				791	}
				792	else if(type == 2){ //train the cwindow architecture
				793	// in -> hidden
				794	cw = 0;
				795	for (a = 0; a < window * 2 + 1; a++) if (a != window) {
				796	c = sentence_position - window + a;
				797	if (c < 0) continue;
				798	if (c >= sentence_length) continue;
				799	last_word = sen[c];
				800	if (last_word == -1) continue;
				801	window_offset = a*layer1_size;
				802	if (a > window) window_offset-=layer1_size;
				803	for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
				804	cw++;
				805	}
				806	if (cw) {
				807	if (hs) for (d = 0; d < vocab[word].codelen; d++) {
				808	f = 0;
				809	l2 = vocab[word].point[d] * window_layer_size;
				810	// Propagate hidden -> output
				811	for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1_window[c + l2];
				812	if (f <= -MAX_EXP) continue;
				813	else if (f >= MAX_EXP) continue;
				814	else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
				815	// 'g' is the gradient multiplied by the learning rate
				816	g = (1 - vocab[word].code[d] - f) * alpha;
				817	// Propagate errors output -> hidden
				818	for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1_window[c + l2];
				819	// Learn weights hidden -> output
				820	for (c = 0; c < window_layer_size; c++) syn1_window[c + l2] += g * neu1[c];
				821	if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1_window, c + l2);
				822	}
				823	// NEGATIVE SAMPLING
				824	if (negative > 0) for (d = 0; d < negative + 1; d++) {
				825	if (d == 0) {
				826	target = word;
				827	label = 1;
				828	} else {
				829	next_random = next_random * (unsigned long long)25214903917 + 11;
				830	if(word_to_group != NULL && word_to_group[word] != -1){
				831	target = word;
				832	while(target == word) {
				833	target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
				834	next_random = next_random * (unsigned long long)25214903917 + 11;
				835	}
				836	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				837	}
				838	else{
				839	target = table[(next_random >> 16) % table_size];
				840	}
				841	if (target == 0) target = next_random % (vocab_size - 1) + 1;
				842	if (target == word) continue;
				843	label = 0;
				844	}
				845	l2 = target * window_layer_size;
				846	f = 0;
				847	for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1neg_window[c + l2];
				848	if (f > MAX_EXP) g = (label - 1) * alpha;
				849	else if (f < -MAX_EXP) g = (label - 0) * alpha;
				850	else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
				851	for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1neg_window[c + l2];
				852	for (c = 0; c < window_layer_size; c++) syn1neg_window[c + l2] += g * neu1[c];
				853	if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1neg_window, c + l2);
				854	}
				855	// Noise Contrastive Estimation
				856	if (nce > 0) for (d = 0; d < nce + 1; d++) {
				857	if (d == 0) {
				858	target = word;
				859	label = 1;
				860	} else {
				861	next_random = next_random * (unsigned long long)25214903917 + 11;
				862	if(word_to_group != NULL && word_to_group[word] != -1){
				863	target = word;
				864	while(target == word) {
				865	target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
				866	next_random = next_random * (unsigned long long)25214903917 + 11;
				867	}
				868	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				869	}
				870	else{
				871	target = table[(next_random >> 16) % table_size];
				872	}
				873	if (target == 0) target = next_random % (vocab_size - 1) + 1;
				874	if (target == word) continue;
				875	label = 0;
				876	}
				877	l2 = target * window_layer_size;
				878	f = 0;
				879	for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1nce_window[c + l2];
				880	if (f > MAX_EXP) g = (label - 1) * alpha;
				881	else if (f < -MAX_EXP) g = (label - 0) * alpha;
				882	else {
				883	f = exp(f);
				884	g = (label - f/(noise_distribution[target]nce + f)) alpha;
				885	}
				886	for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1nce_window[c + l2];
				887	for (c = 0; c < window_layer_size; c++) syn1nce_window[c + l2] += g * neu1[c];
				888	if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1nce_window, c + l2);
				889	}
				890	// hidden -> in
				891	for (a = 0; a < window * 2 + 1; a++) if (a != window) {
				892	c = sentence_position - window + a;
				893	if (c < 0) continue;
				894	if (c >= sentence_length) continue;
				895	last_word = sen[c];
				896	if (last_word == -1) continue;
				897	window_offset = a * layer1_size;
				898	if(a > window) window_offset -= layer1_size;
				899	for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
				900	}
				901	}
				902	}
				903	else if (type == 3){ //train structured skip-gram
				904	for (a = 0; a < window * 2 + 1; a++) if (a != window) {
				905	c = sentence_position - window + a;
				906	if (c < 0) continue;
				907	if (c >= sentence_length) continue;
				908	last_word = sen[c];
				909	if (last_word == -1) continue;
				910	l1 = last_word * layer1_size;
				911	window_offset = a * layer1_size;
				912	if(a > window) window_offset -= layer1_size;
				913	for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
				914	// HIERARCHICAL SOFTMAX
				915	if (hs) for (d = 0; d < vocab[word].codelen; d++) {
				916	f = 0;
				917	l2 = vocab[word].point[d] * window_layer_size;
				918	// Propagate hidden -> output
				919	for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1_window[c + l2 + window_offset];
				920	if (f <= -MAX_EXP) continue;
				921	else if (f >= MAX_EXP) continue;
				922	else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
				923	// 'g' is the gradient multiplied by the learning rate
				924	g = (1 - vocab[word].code[d] - f) * alpha;
				925	// Propagate errors output -> hidden
				926	for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1_window[c + l2 + window_offset];
				927	// Learn weights hidden -> output
				928	for (c = 0; c < layer1_size; c++) syn1[c + l2 + window_offset] += g * syn0[c + l1];
				929	if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2 + window_offset);
				930	}
				931	// NEGATIVE SAMPLING
				932	if (negative > 0) for (d = 0; d < negative + 1; d++) {
				933	if (d == 0) {
				934	target = word;
				935	label = 1;
				936	} else {
				937	next_random = next_random * (unsigned long long)25214903917 + 11;
				938	if(word_to_group != NULL && word_to_group[word] != -1){
				939	target = word;
				940	while(target == word) {
				941	target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
				942	next_random = next_random * (unsigned long long)25214903917 + 11;
				943	}
				944	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				945	}
				946	else{
				947	target = table[(next_random >> 16) % table_size];
				948	}
				949	if (target == 0) target = next_random % (vocab_size - 1) + 1;
				950	if (target == word) continue;
				951	label = 0;
				952	}
				953	l2 = target * window_layer_size;
				954	f = 0;
				955	for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg_window[c + l2 + window_offset];
				956	if (f > MAX_EXP) g = (label - 1) * alpha;
				957	else if (f < -MAX_EXP) g = (label - 0) * alpha;
				958	else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
				959	for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg_window[c + l2 + window_offset];
				960	for (c = 0; c < layer1_size; c++) syn1neg_window[c + l2 + window_offset] += g * syn0[c + l1];
				961	if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg_window, c + l2 + window_offset);
				962	}
				963	// Noise Contrastive Estimation
				964	if (nce > 0) for (d = 0; d < nce + 1; d++) {
				965	if (d == 0) {
				966	target = word;
				967	label = 1;
				968	} else {
				969	next_random = next_random * (unsigned long long)25214903917 + 11;
				970	if(word_to_group != NULL && word_to_group[word] != -1){
				971	target = word;
				972	while(target == word) {
				973	target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
				974	next_random = next_random * (unsigned long long)25214903917 + 11;
				975	}
				976	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				977	}
				978	else{
				979	target = table[(next_random >> 16) % table_size];
				980	}
				981	if (target == 0) target = next_random % (vocab_size - 1) + 1;
				982	if (target == word) continue;
				983	label = 0;
				984	}
				985	l2 = target * window_layer_size;
				986	f = 0;
				987	for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce_window[c + l2 + window_offset];
				988	if (f > MAX_EXP) g = (label - 1) * alpha;
				989	else if (f < -MAX_EXP) g = (label - 0) * alpha;
				990	else {
				991	f = exp(f);
				992	g = (label - f/(noise_distribution[target]nce + f)) alpha;
				993	}
				994	for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce_window[c + l2 + window_offset];
				995	for (c = 0; c < layer1_size; c++) syn1nce_window[c + l2 + window_offset] += g * syn0[c + l1];
				996	if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce_window, c + l2 + window_offset);
				997	}
				998	// Learn weights input -> hidden
				999	for (c = 0; c < layer1_size; c++) {syn0[c + l1] += neu1e[c]; if(syn0[c + l1] > 50) syn0[c + l1] = 50; if(syn0[c + l1] < -50) syn0[c + l1] = -50;}
				1000	}
				1001	}
				1002	else if(type == 4){ //training senna
				1003	// in -> hidden
				1004	cw = 0;
				1005	for (a = 0; a < window * 2 + 1; a++) if (a != window) {
				1006	c = sentence_position - window + a;
				1007	if (c < 0) continue;
				1008	if (c >= sentence_length) continue;
				1009	last_word = sen[c];
				1010	if (last_word == -1) continue;
				1011	window_offset = a*layer1_size;
				1012	if (a > window) window_offset-=layer1_size;
				1013	for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
				1014	cw++;
				1015	}
				1016	if (cw) {
				1017	for (a = 0; a < window_hidden_size; a++){
				1018	c = a*window_layer_size;
				1019	for(b = 0; b < window_layer_size; b++){
				1020	neu2[a] += syn_window_hidden[c + b] * neu1[b];
				1021	}
				1022	}
				1023	if (hs) for (d = 0; d < vocab[word].codelen; d++) {
				1024	f = 0;
				1025	l2 = vocab[word].point[d] * window_hidden_size;
				1026	// Propagate hidden -> output
				1027	for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
				1028	if (f <= -MAX_EXP) continue;
				1029	else if (f >= MAX_EXP) continue;
				1030	else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1031	// 'g' is the gradient multiplied by the learning rate
				1032	g = (1 - vocab[word].code[d] - f) * alpha;
				1033	// Propagate errors output -> hidden
				1034	for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word[c + l2];
				1035	// Learn weights hidden -> output
				1036	for (c = 0; c < window_hidden_size; c++) syn_hidden_word[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
				1037	}
				1038	// NEGATIVE SAMPLING
				1039	if (negative > 0) for (d = 0; d < negative + 1; d++) {
				1040	if (d == 0) {
				1041	target = word;
				1042	label = 1;
				1043	} else {
				1044	next_random = next_random * (unsigned long long)25214903917 + 11;
				1045	if(word_to_group != NULL && word_to_group[word] != -1){
				1046	target = word;
				1047	while(target == word) {
				1048	target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
				1049	next_random = next_random * (unsigned long long)25214903917 + 11;
				1050	}
				1051	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1052	}
				1053	else{
				1054	target = table[(next_random >> 16) % table_size];
				1055	}
				1056	if (target == 0) target = next_random % (vocab_size - 1) + 1;
				1057	if (target == word) continue;
				1058	label = 0;
				1059	}
				1060	l2 = target * window_hidden_size;
				1061	f = 0;
				1062	for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
				1063	if (f > MAX_EXP) g = (label - 1) * alpha / negative;
				1064	else if (f < -MAX_EXP) g = (label - 0) * alpha / negative;
				1065	else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha / negative;
				1066	for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word_neg[c + l2];
				1067	for (c = 0; c < window_hidden_size; c++) syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
				1068	}
				1069	for (a = 0; a < window_hidden_size; a++)
				1070	for(b = 0; b < window_layer_size; b++)
				1071	neu1e[b] += neu2e[a] * syn_window_hidden[a*window_layer_size + b];
				1072	for (a = 0; a < window_hidden_size; a++)
				1073	for(b = 0; b < window_layer_size; b++)
				1074	syn_window_hidden[awindow_layer_size + b] += neu2e[a] neu1[b];
				1075	// hidden -> in
				1076	for (a = 0; a < window * 2 + 1; a++) if (a != window) {
				1077	c = sentence_position - window + a;
				1078	if (c < 0) continue;
				1079	if (c >= sentence_length) continue;
				1080	last_word = sen[c];
				1081	if (last_word == -1) continue;
				1082	window_offset = a * layer1_size;
				1083	if(a > window) window_offset -= layer1_size;
				1084	for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
				1085	}
				1086	}
				1087	}
				1088	else{
				1089	printf("unknown type %i", type);
				1090	exit(0);
				1091	}
				1092	sentence_position++;
				1093	if (sentence_position >= sentence_length) {
				1094	sentence_length = 0;
				1095	continue;
				1096	}
				1097	}
				1098	fclose(fi);
				1099	free(neu1);
				1100	free(neu1e);
				1101	pthread_exit(NULL);
				1102	}
				1103
				1104	void TrainModel() {
				1105	long a, b, c, d;
				1106	FILE *fo;
				1107	pthread_t pt = (pthread_t )malloc(num_threads * sizeof(pthread_t));
				1108	printf("Starting training using file %s\n", train_file);
				1109	starting_alpha = alpha;
				1110	if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
				1111	if (save_vocab_file[0] != 0) SaveVocab();
				1112	if (output_file[0] == 0) return;
				1113	InitNet();
				1114	if (negative > 0 \|\| nce > 0) InitUnigramTable();
				1115	if (negative_classes_file[0] != 0) InitClassUnigramTable();
				1116	start = clock();
				1117	for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
				1118	for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
				1119	fo = fopen(output_file, "wb");
				1120	if (classes == 0) {
				1121	// Save the word vectors
				1122	fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
				1123	for (a = 0; a < vocab_size; a++) {
				1124	fprintf(fo, "%s ", vocab[a].word);
				1125	if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
				1126	else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
				1127	fprintf(fo, "\n");
				1128	}
				1129	} else {
				1130	// Run K-means on the word vectors
				1131	int clcn = classes, iter = 10, closeid;
				1132	int centcn = (int )malloc(classes * sizeof(int));
				1133	int cl = (int )calloc(vocab_size, sizeof(int));
				1134	real closev, x;
				1135	real cent = (real )calloc(classes * layer1_size, sizeof(real));
				1136	for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
				1137	for (a = 0; a < iter; a++) {
				1138	for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
				1139	for (b = 0; b < clcn; b++) centcn[b] = 1;
				1140	for (c = 0; c < vocab_size; c++) {
				1141	for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
				1142	centcn[cl[c]]++;
				1143	}
				1144	for (b = 0; b < clcn; b++) {
				1145	closev = 0;
				1146	for (c = 0; c < layer1_size; c++) {
				1147	cent[layer1_size * b + c] /= centcn[b];
				1148	closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
				1149	}
				1150	closev = sqrt(closev);
				1151	for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
				1152	}
				1153	for (c = 0; c < vocab_size; c++) {
				1154	closev = -10;
				1155	closeid = 0;
				1156	for (d = 0; d < clcn; d++) {
				1157	x = 0;
				1158	for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
				1159	if (x > closev) {
				1160	closev = x;
				1161	closeid = d;
				1162	}
				1163	}
				1164	cl[c] = closeid;
				1165	}
				1166	}
				1167	// Save the K-means classes
				1168	for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
				1169	free(centcn);
				1170	free(cent);
				1171	free(cl);
				1172	}
				1173	fclose(fo);
				1174	}
				1175
				// Searches argv[1] .. argv[argc - 1] for the option string `str`.
				//
				// Returns the index of the first exact match, or -1 when the option is not
				// present. When the match is the very last argument, so that no value can
				// follow it, prints "Argument missing for <str>" and exits with status 1.
				int ArgPos(char *str, int argc, char **argv) {
				  int a;
				  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
				    if (a == argc - 1) {
				      printf("Argument missing for %s\n", str);
				      exit(1);
				    }
				    return a;
				  }
				  return -1;
				}
				1187
				1188	int main(int argc, char **argv) {
				1189	int i;
				1190	if (argc == 1) {
				1191	printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
				1192	printf("Options:\n");
				1193	printf("Parameters for training:\n");
				1194	printf("\t-train <file>\n");
				1195	printf("\t\tUse text data from <file> to train the model\n");
				1196	printf("\t-output <file>\n");
				1197	printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
				1198	printf("\t-size <int>\n");
				1199	printf("\t\tSet size of word vectors; default is 100\n");
				1200	printf("\t-window <int>\n");
				1201	printf("\t\tSet max skip length between words; default is 5\n");
				1202	printf("\t-sample <float>\n");
				1203	printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
				1204	printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
				1205	printf("\t-hs <int>\n");
				1206	printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
				1207	printf("\t-negative <int>\n");
				1208	printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
				1209	printf("\t-negative-classes <file>\n");
				1210	printf("\t\tNegative classes to sample from\n");
				1211	printf("\t-nce <int>\n");
				1212	printf("\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
				1213	printf("\t-threads <int>\n");
				1214	printf("\t\tUse <int> threads (default 12)\n");
				1215	printf("\t-iter <int>\n");
				1216	printf("\t\tRun more training iterations (default 5)\n");
				1217	printf("\t-min-count <int>\n");
				1218	printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
				1219	printf("\t-alpha <float>\n");
				1220	printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
				1221	printf("\t-classes <int>\n");
				1222	printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
				1223	printf("\t-debug <int>\n");
				1224	printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
				1225	printf("\t-binary <int>\n");
				1226	printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
				1227	printf("\t-save-vocab <file>\n");
				1228	printf("\t\tThe vocabulary will be saved to <file>\n");
				1229	printf("\t-read-vocab <file>\n");
				1230	printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
				1231	printf("\t-type <int>\n");
				1232	printf("\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
				1233	printf("\t-cap <int>\n");
				1234	printf("\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
				1235	printf("\nExamples:\n");
				1236	printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
				1237	return 0;
				1238	}
				1239	output_file[0] = 0;
				1240	save_vocab_file[0] = 0;
				1241	read_vocab_file[0] = 0;
				1242	negative_classes_file[0] = 0;
				1243	if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
				1244	if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
				1245	if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
				1246	if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
				1247	if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
				1248	if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
				1249	if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
				1250	if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
				1251	if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
				1252	if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
				1253	if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
				1254	if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
				1255	if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
				1256	if ((i = ArgPos((char *)"-nce", argc, argv)) > 0) nce = atoi(argv[i + 1]);
				1257	if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
				1258	if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
				1259	if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
				1260	if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
				1261	if ((i = ArgPos((char *)"-cap", argc, argv)) > 0) cap = atoi(argv[i + 1]);
				1262	if (type==0 \|\| type==2 \|\| type==4) alpha = 0.05;
				1263	if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
				1264	vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
				1265	vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
				1266	expTable = (real )malloc((EXP_TABLE_SIZE + 1) sizeof(real));
				1267	for (i = 0; i < EXP_TABLE_SIZE; i++) {
				1268	expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
				1269	expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
				1270	}
				1271	TrainModel();
				1272	return 0;
				1273	}
				1274