// Copyright 2013 Google Inc. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>   // for clock(), clock_t and CLOCKS_PER_SEC used below

#define MAX_STRING 100
#define EXP_TABLE_SIZE 1000
#define MAX_EXP 6
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CODE_LENGTH 40

const int vocab_hash_size = 30000000;  // Maximum 30M * 0.7 = 21M words in the vocabulary

typedef float real;                    // Precision of float numbers

struct vocab_word {
  long long cn;                        // word count
  int *point;                          // Huffman tree path (indices of inner nodes)
  char *word, *code, codelen;          // word string, Huffman code and its length
};

char train_file[MAX_STRING], output_file[MAX_STRING];
char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
struct vocab_word *vocab;
int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
int *vocab_hash;
long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
real alpha = 0.025, starting_alpha, sample = 1e-3;
real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
clock_t start;

real *syn1_window, *syn1neg_window, *syn1nce_window;
int w_offset, window_layer_size;

int window_hidden_size = 500;
real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg, *syn_hidden_word_nce;

int hs = 0, negative = 5;
const int table_size = 1e8;
int *table;

// contrastive negative sampling
char negative_classes_file[MAX_STRING];
int *word_to_group;
int *group_to_table; // group_size * table_size
int class_number;

// nce
real *noise_distribution;
int nce = 0;

// param caps
real CAP_VALUE = 50;
int cap = 0;

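// Parameter capping: when the -cap flag is set (cap == 1), capParam() clamps a
// single weight to the range [-CAP_VALUE, CAP_VALUE] after each update, so one
// large gradient step cannot blow up an output-layer parameter.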
void capParam(real *array, int index) {
  if (array[index] > CAP_VALUE)
    array[index] = CAP_VALUE;
  else if (array[index] < -CAP_VALUE)
    array[index] = -CAP_VALUE;
}

real hardTanh(real x) {
  if (x >= 1) {
    return 1;
  } else if (x <= -1) {
    return -1;
  } else {
    return x;
  }
}

real dHardTanh(real x, real g) {
  if (x > 1 && g > 0) {
    return 0;
  }
  if (x < -1 && g < 0) {
    return 0;
  }
  return 1;
}

int isEndOfSentence(char *word) {
  return strcmp("</s>", word) == 0;
}

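// The negative-sampling table below is filled so that word w occupies a share of
// the table proportional to count(w)^0.75, the usual word2vec smoothing of the
// unigram distribution; the same powered distribution is also stored in
// noise_distribution for use by NCE.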
void InitUnigramTable() {
  int a, i;
  long long train_words_pow = 0;
  real d1, power = 0.75;
  table = (int *)malloc(table_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
  i = 0;
  d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
  for (a = 0; a < table_size; a++) {
    table[a] = i;
    if (a / (real)table_size > d1) {
      i++;
      d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
    }
    if (i >= vocab_size) i = vocab_size - 1;
  }

  noise_distribution = (real *)calloc(vocab_size, sizeof(real));
  for (a = 0; a < vocab_size; a++) noise_distribution[a] = pow(vocab[a].cn, power) / (real)train_words_pow;
}

// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
void ReadWord(char *word, FILE *fin) {
  int a = 0, ch;
  while (!feof(fin)) {
    ch = fgetc(fin);
    if (ch == 13) continue;
    if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
      if (a > 0) {
        if (ch == '\n') ungetc(ch, fin);
        break;
      }
      if (ch == '\n') {
        strcpy(word, (char *)"</s>");
        return;
      } else continue;
    }
    word[a] = ch;
    a++;
    if (a >= MAX_STRING - 1) a--;   // Truncate too long words
  }
  word[a] = 0;
}

// Returns hash value of a word
int GetWordHash(char *word) {
  unsigned long long a, hash = 0;
  for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
  hash = hash % vocab_hash_size;
  return hash;
}

// Returns position of a word in the vocabulary; if the word is not found, returns -1
int SearchVocab(char *word) {
  unsigned int hash = GetWordHash(word);
  while (1) {
    if (vocab_hash[hash] == -1) return -1;
    if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
    hash = (hash + 1) % vocab_hash_size;
  }
  return -1;
}

// Reads a word and returns its index in the vocabulary
int ReadWordIndex(FILE *fin) {
  char word[MAX_STRING];
  ReadWord(word, fin);
  if (feof(fin)) return -1;
  return SearchVocab(word);
}

// Adds a word to the vocabulary
int AddWordToVocab(char *word) {
  unsigned int hash, length = strlen(word) + 1;
  if (length > MAX_STRING) length = MAX_STRING;
  vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
  strcpy(vocab[vocab_size].word, word);
  vocab[vocab_size].cn = 0;
  vocab_size++;
  // Reallocate memory if needed
  if (vocab_size + 2 >= vocab_max_size) {
    vocab_max_size += 1000;
    vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
  }
  hash = GetWordHash(word);
  while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
  vocab_hash[hash] = vocab_size - 1;
  return vocab_size - 1;
}

// Used later for sorting by word counts
int VocabCompare(const void *a, const void *b) {
  return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
}

// Sorts the vocabulary by frequency using word counts
void SortVocab() {
  int a, size;
  unsigned int hash;
  // Sort the vocabulary and keep </s> at the first position
  qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  size = vocab_size;
  train_words = 0;
  for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
    if ((vocab[a].cn < min_count) && (a != 0)) {
      vocab_size--;
      free(vocab[a].word);
    } else {
      // Hash has to be re-computed, as it is no longer valid after the sorting
      hash = GetWordHash(vocab[a].word);
      while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
      vocab_hash[hash] = a;
      train_words += vocab[a].cn;
    }
  }
  vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
  // Allocate memory for the binary tree construction
  for (a = 0; a < vocab_size; a++) {
    vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
    vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
  }
}

// Reduces the vocabulary by removing infrequent tokens
void ReduceVocab() {
  int a, b = 0;
  unsigned int hash;
  for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
    vocab[b].cn = vocab[a].cn;
    vocab[b].word = vocab[a].word;
    b++;
  } else free(vocab[a].word);
  vocab_size = b;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  for (a = 0; a < vocab_size; a++) {
    // Hash has to be re-computed, as it is no longer valid
    hash = GetWordHash(vocab[a].word);
    while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
    vocab_hash[hash] = a;
  }
  fflush(stdout);
  min_reduce++;
}

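// Hierarchical softmax uses the Huffman tree built below: every vocabulary word
// gets a binary code (its path of left/right decisions) and a list of inner-node
// indices in 'point'. Frequent words end up with short codes, so the average
// number of output-layer updates per training word stays small.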
// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
void CreateBinaryTree() {
  long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
  char code[MAX_CODE_LENGTH];
  long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
  for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
  for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
  pos1 = vocab_size - 1;
  pos2 = vocab_size;
  // Following algorithm constructs the Huffman tree by adding one node at a time
  for (a = 0; a < vocab_size - 1; a++) {
    // First, find two smallest nodes 'min1, min2'
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min1i = pos1;
        pos1--;
      } else {
        min1i = pos2;
        pos2++;
      }
    } else {
      min1i = pos2;
      pos2++;
    }
    if (pos1 >= 0) {
      if (count[pos1] < count[pos2]) {
        min2i = pos1;
        pos1--;
      } else {
        min2i = pos2;
        pos2++;
      }
    } else {
      min2i = pos2;
      pos2++;
    }
    count[vocab_size + a] = count[min1i] + count[min2i];
    parent_node[min1i] = vocab_size + a;
    parent_node[min2i] = vocab_size + a;
    binary[min2i] = 1;
  }
  // Now assign binary code to each vocabulary word
  for (a = 0; a < vocab_size; a++) {
    b = a;
    i = 0;
    while (1) {
      code[i] = binary[b];
      point[i] = b;
      i++;
      b = parent_node[b];
      if (b == vocab_size * 2 - 2) break;
    }
    vocab[a].codelen = i;
    vocab[a].point[0] = vocab_size - 2;
    for (b = 0; b < i; b++) {
      vocab[a].code[i - b - 1] = code[b];
      vocab[a].point[i - b] = point[b] - vocab_size;
    }
  }
  free(count);
  free(binary);
  free(parent_node);
}

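// Training-file format (as this code reads it): every line is one sentence whose
// FIRST token is a numeric weight for that sentence, followed by the words, e.g.
//
//   1.5 this sentence counts one and a half times
//   0.2 this one is down-weighted
//
// LearnVocabFromTrainFile() skips the leading weight token when counting words,
// and TrainModelThread() parses it with atof() into currentWeight, which scales
// the learning rate for every update made from that sentence. (The example
// weights above are only illustrative.)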
void LearnVocabFromTrainFile() {
  char word[MAX_STRING];
  FILE *fin;
  long long a, i;
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  vocab_size = 0;
  AddWordToVocab((char *)"</s>");
  int startOfLine = 1;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    if (startOfLine) {
      // The first token of each line is the sentence weight; read past it
      ReadWord(word, fin);
      startOfLine = 0;
    }
    if (isEndOfSentence(word)) {
      startOfLine = 1;
    }
    train_words++;
    if ((debug_mode > 1) && (train_words % 100000 == 0)) {
      printf("%lldK%c", train_words / 1000, 13);
      fflush(stdout);
    }
    i = SearchVocab(word);
    if (i == -1) {
      a = AddWordToVocab(word);
      vocab[a].cn = 1;
    } else vocab[i].cn++;
    if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  file_size = ftell(fin);
  fclose(fin);
}

void SaveVocab() {
  long long i;
  FILE *fo = fopen(save_vocab_file, "wb");
  for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
  fclose(fo);
}

void ReadVocab() {
  long long a, i = 0;
  char c;
  char word[MAX_STRING];
  FILE *fin = fopen(read_vocab_file, "rb");
  if (fin == NULL) {
    printf("Vocabulary file not found\n");
    exit(1);
  }
  for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
  vocab_size = 0;
  while (1) {
    ReadWord(word, fin);
    if (feof(fin)) break;
    a = AddWordToVocab(word);
    fscanf(fin, "%lld%c", &vocab[a].cn, &c);
    i++;
  }
  SortVocab();
  if (debug_mode > 0) {
    printf("Vocab size: %lld\n", vocab_size);
    printf("Words in train file: %lld\n", train_words);
  }
  fin = fopen(train_file, "rb");
  if (fin == NULL) {
    printf("ERROR: training data file not found!\n");
    exit(1);
  }
  fseek(fin, 0, SEEK_END);
  file_size = ftell(fin);
  fclose(fin);
}

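// The -negative-classes file is read as repeating triples of whitespace-separated
// tokens: a class label, a word, and one further token that is read but ignored
// here (presumably a count). Entries with the same class label are expected to be
// grouped together, since a new class id is only assigned when the label changes.
// For every class, a separate unigram^0.75 sampling table is then built, so
// negative samples for a word are drawn from that word's own class.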
void InitClassUnigramTable() {
  long long a, c;
  printf("loading class unigrams \n");
  FILE *fin = fopen(negative_classes_file, "rb");
  if (fin == NULL) {
    printf("ERROR: class file not found!\n");
    exit(1);
  }
  word_to_group = (int *)malloc(vocab_size * sizeof(int));
  for (a = 0; a < vocab_size; a++) word_to_group[a] = -1;
  char class[MAX_STRING];
  char prev_class[MAX_STRING];
  prev_class[0] = 0;
  char word[MAX_STRING];
  class_number = -1;
  while (1) {
    if (feof(fin)) break;
    ReadWord(class, fin);
    ReadWord(word, fin);
    int word_index = SearchVocab(word);
    if (word_index != -1) {
      if (strcmp(class, prev_class) != 0) {
        class_number++;
        strcpy(prev_class, class);
      }
      word_to_group[word_index] = class_number;
    }
    ReadWord(word, fin);
  }
  class_number++;
  fclose(fin);

  group_to_table = (int *)malloc(table_size * class_number * sizeof(int));
  long long train_words_pow = 0;
  real d1, power = 0.75;

  for (c = 0; c < class_number; c++) {
    long long offset = c * table_size;
    train_words_pow = 0;
    for (a = 0; a < vocab_size; a++) if (word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
    int i = 0;
    // Check the index bound before reading word_to_group[i]
    while (i < vocab_size && word_to_group[i] != c) i++;
    d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
    for (a = 0; a < table_size; a++) {
      //printf("index %lld , word %d\n", a, i);
      group_to_table[offset + a] = i;
      if (a / (real)table_size > d1) {
        i++;
        while (i < vocab_size && word_to_group[i] != c) i++;
        d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
      }
      if (i >= vocab_size) {
        // Fall back to the last word of this class instead of reading past the array
        i = vocab_size - 1;
        while (i >= 0 && word_to_group[i] != c) i--;
      }
    }
  }
}

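// InitNet() allocates the network: syn0 (the input word embeddings) is initialised
// with small random values, while the output-side matrices are zero-initialised.
// Which output matrices exist depends on the objective (syn1* for hierarchical
// softmax, syn1neg* for negative sampling, syn1nce* for NCE) and on the
// architecture (the *_window variants hold one block of layer1_size weights per
// context position; the senna-type model additionally uses a window->hidden layer).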
void InitNet() {
  long long a, b;
  unsigned long long next_random = 1;
  window_layer_size = layer1_size * window * 2;
  a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
  if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}

  if (hs) {
    a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn1_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
    if (syn1_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn_hidden_word, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word == NULL) {printf("Memory allocation failed\n"); exit(1);}

    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
      syn1_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
      syn_hidden_word[a * window_hidden_size + b] = 0;
  }
  if (negative > 0) {
    a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn1neg_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
    if (syn1neg_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn_hidden_word_neg, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word_neg == NULL) {printf("Memory allocation failed\n"); exit(1);}

    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1neg[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
      syn1neg_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
      syn_hidden_word_neg[a * window_hidden_size + b] = 0;
  }
  if (nce > 0) {
    a = posix_memalign((void **)&syn1nce, 128, (long long)vocab_size * layer1_size * sizeof(real));
    if (syn1nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn1nce_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
    if (syn1nce_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
    a = posix_memalign((void **)&syn_hidden_word_nce, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
    if (syn_hidden_word_nce == NULL) {printf("Memory allocation failed\n"); exit(1);}

    for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
      syn1nce[a * layer1_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
      syn1nce_window[a * window_layer_size + b] = 0;
    for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
      syn_hidden_word_nce[a * window_hidden_size + b] = 0;
  }
  for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
  }

  a = posix_memalign((void **)&syn_window_hidden, 128, window_hidden_size * window_layer_size * sizeof(real));
  if (syn_window_hidden == NULL) {printf("Memory allocation failed\n"); exit(1);}
  for (a = 0; a < window_hidden_size * window_layer_size; a++) {
    next_random = next_random * (unsigned long long)25214903917 + 11;
    syn_window_hidden[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (window_hidden_size * window_layer_size);
  }

  CreateBinaryTree();
}

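// Each training thread starts reading at roughly (file_size / num_threads) * id;
// findStartOfLine() walks backwards from that byte offset until reading a word
// there yields "</s>" (i.e. it has hit the newline ending the previous sentence)
// and returns the position just after it, so every thread begins at the start of
// a weighted sentence line.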
long long findStartOfLine(char *file, long long start) {
  char word[MAX_STRING];
  if (start == 0) return 0;
  while (start != 0) {
    FILE *fi = fopen(file, "rb");
    fseek(fi, start, SEEK_SET);
    ReadWord(word, fi);
    if (isEndOfSentence(word)) {
      fclose(fi);
      return start + 1;
    }
    fclose(fi);
    start--;
  }
  return 0;
}

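// TrainModelThread() implements all five architectures selected by -type:
//   0 = CBOW, 1 = skip-gram, 2 = cwindow (one layer1_size block per context position),
//   3 = structured skip-gram (position-dependent output blocks),
//   4 = senna-type model with an extra hard-tanh hidden layer.
// Every sentence read from the training file carries a per-sentence weight
// (currentWeight); all gradient steps below are scaled by alpha * currentWeight.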
void *TrainModelThread(void *id) {
  char word_str[MAX_STRING];
  long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
  long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
  long long l1, l2, c, target, label, local_iter = iter;
  unsigned long long next_random = (long long)id;
  real f, g;
  clock_t now;
  int input_len_1 = layer1_size;
  int window_offset = -1;
  float currentWeight = 0;
  if (type == 2 || type == 4) {
    input_len_1 = window_layer_size;
  }
  real *neu1 = (real *)calloc(input_len_1, sizeof(real));
  real *neu1e = (real *)calloc(input_len_1, sizeof(real));

  int input_len_2 = 0;
  if (type == 4) {
    input_len_2 = window_hidden_size;
  }
  real *neu2 = (real *)calloc(input_len_2, sizeof(real));
  real *neu2e = (real *)calloc(input_len_2, sizeof(real));

  long long start_pos = findStartOfLine(train_file, file_size / (long long)num_threads * (long long)id);
  FILE *fi = fopen(train_file, "rb");
  fseek(fi, start_pos, SEEK_SET);
  int startOfSentence = 1;
  int startEndOfLineIndex = SearchVocab("</s>");
  while (1) {
    if (word_count - last_word_count > 10000) {
      word_count_actual += word_count - last_word_count;
      last_word_count = word_count;
      if ((debug_mode > 1)) {
        now = clock();
        printf("%cAlpha: %f Weight: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha, currentWeight,
          word_count_actual / (real)(iter * train_words + 1) * 100,
          word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
        fflush(stdout);
      }
      alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
      if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
    }
    if (sentence_length == 0) {
      while (1) {
        if (startOfSentence) {
          // First token of the line: parse the per-sentence weight
          ReadWord(word_str, fi);
          currentWeight = atof(word_str);
          startOfSentence = 0;
          continue;
        }
        word = ReadWordIndex(fi);
        if (word == startEndOfLineIndex) {
          startOfSentence = 1;
        }
        if (feof(fi)) break;
        if (word == -1) continue;
        word_count++;
        if (word == 0) break;
        // The subsampling randomly discards frequent words while keeping the ranking same
        if (sample > 0) {
          real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
          next_random = next_random * (unsigned long long)25214903917 + 11;
          if (ran < (next_random & 0xFFFF) / (real)65536) continue;
        }
        sen[sentence_length] = word;
        sentence_length++;
        if (sentence_length >= MAX_SENTENCE_LENGTH) break;
      }
      sentence_position = 0;
    }
    if (feof(fi) || (word_count > train_words / num_threads)) {
      word_count_actual += word_count - last_word_count;
      local_iter--;
      if (local_iter == 0) break;
      word_count = 0;
      last_word_count = 0;
      sentence_length = 0;
      startOfSentence = 1;   // start_pos points at the beginning of a line again
      fseek(fi, start_pos, SEEK_SET);
      continue;
    }
    word = sen[sentence_position];
    if (word == -1) continue;
    for (c = 0; c < input_len_1; c++) neu1[c] = 0;
    for (c = 0; c < input_len_1; c++) neu1e[c] = 0;
    for (c = 0; c < input_len_2; c++) neu2[c] = 0;
    for (c = 0; c < input_len_2; c++) neu2e[c] = 0;
    next_random = next_random * (unsigned long long)25214903917 + 11;
    b = next_random % window;
    if (type == 0) {  // train the cbow architecture
      // in -> hidden
      cw = 0;
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;

          for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha * currentWeight;
          }
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
          for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
        }
        // hidden -> in
        for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
        }
      }
    } else if (type == 1) {  // train skip-gram
      for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * layer1_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
          for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * layer1_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha * currentWeight;
          }
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
          for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
        }
        // Learn weights input -> hidden
        for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
      }
    }
    else if (type == 2) {  // train the cwindow architecture
      // in -> hidden
      cw = 0;
      for (a = 0; a < window * 2 + 1; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        window_offset = a * layer1_size;
        if (a > window) window_offset -= layer1_size;
        for (c = 0; c < layer1_size; c++) neu1[c + window_offset] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * window_layer_size;
          // Propagate hidden -> output
          for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1_window[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1_window[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < window_layer_size; c++) syn1_window[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1_window, c + l2);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1neg_window[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight;
          for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1neg_window[c + l2];
          for (c = 0; c < window_layer_size; c++) syn1neg_window[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1neg_window, c + l2);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1nce_window[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha * currentWeight;
          }
          for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1nce_window[c + l2];
          for (c = 0; c < window_layer_size; c++) syn1nce_window[c + l2] += g * neu1[c];
          if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1nce_window, c + l2);
        }
        // hidden -> in
        for (a = 0; a < window * 2 + 1; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          window_offset = a * layer1_size;
          if (a > window) window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
        }
      }
    }
    else if (type == 3) {  // train structured skip-gram
      for (a = 0; a < window * 2 + 1; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        l1 = last_word * layer1_size;
        window_offset = a * layer1_size;
        if (a > window) window_offset -= layer1_size;
        for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
        // HIERARCHICAL SOFTMAX
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * window_layer_size;
          // Propagate hidden -> output
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1_window[c + l2 + window_offset];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1_window[c + l2 + window_offset];
          // Learn weights hidden -> output (updates syn1_window; the forward pass above
          // also reads syn1_window, and l2 is scaled by window_layer_size, so updating
          // syn1 here instead would index out of bounds)
          for (c = 0; c < layer1_size; c++) syn1_window[c + l2 + window_offset] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1_window, c + l2 + window_offset);
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg_window[c + l2 + window_offset];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight;
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg_window[c + l2 + window_offset];
          for (c = 0; c < layer1_size; c++) syn1neg_window[c + l2 + window_offset] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg_window, c + l2 + window_offset);
        }
        // Noise Contrastive Estimation
        if (nce > 0) for (d = 0; d < nce + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_layer_size;
          f = 0;
          for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce_window[c + l2 + window_offset];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight;
          else {
            f = exp(f);
            g = (label - f / (noise_distribution[target] * nce + f)) * alpha * currentWeight;
          }
          for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce_window[c + l2 + window_offset];
          for (c = 0; c < layer1_size; c++) syn1nce_window[c + l2 + window_offset] += g * syn0[c + l1];
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce_window, c + l2 + window_offset);
        }
        // Learn weights input -> hidden, clamping each entry to [-50, 50]
        for (c = 0; c < layer1_size; c++) {syn0[c + l1] += neu1e[c]; if (syn0[c + l1] > 50) syn0[c + l1] = 50; if (syn0[c + l1] < -50) syn0[c + l1] = -50;}
      }
    }
    else if (type == 4) {  // train the senna-type model
      // in -> hidden
      cw = 0;
      for (a = 0; a < window * 2 + 1; a++) if (a != window) {
        c = sentence_position - window + a;
        if (c < 0) continue;
        if (c >= sentence_length) continue;
        last_word = sen[c];
        if (last_word == -1) continue;
        window_offset = a * layer1_size;
        if (a > window) window_offset -= layer1_size;
        for (c = 0; c < layer1_size; c++) neu1[c + window_offset] += syn0[c + last_word * layer1_size];
        cw++;
      }
      if (cw) {
        for (a = 0; a < window_hidden_size; a++) {
          c = a * window_layer_size;
          for (b = 0; b < window_layer_size; b++) {
            neu2[a] += syn_window_hidden[c + b] * neu1[b];
          }
        }
        if (hs) for (d = 0; d < vocab[word].codelen; d++) {
          f = 0;
          l2 = vocab[word].point[d] * window_hidden_size;
          // Propagate hidden -> output
          for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
          if (f <= -MAX_EXP) continue;
          else if (f >= MAX_EXP) continue;
          else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
          // 'g' is the gradient multiplied by the learning rate
          g = (1 - vocab[word].code[d] - f) * alpha * currentWeight;
          // Propagate errors output -> hidden
          for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c], g) * g * syn_hidden_word[c + l2];
          // Learn weights hidden -> output
          for (c = 0; c < window_hidden_size; c++) syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g * neu2[c];
        }
        // NEGATIVE SAMPLING
        if (negative > 0) for (d = 0; d < negative + 1; d++) {
          if (d == 0) {
            target = word;
            label = 1;
          } else {
            next_random = next_random * (unsigned long long)25214903917 + 11;
            if (word_to_group != NULL && word_to_group[word] != -1) {
              target = word;
              while (target == word) {
                target = group_to_table[word_to_group[word] * table_size + (next_random >> 16) % table_size];
                next_random = next_random * (unsigned long long)25214903917 + 11;
              }
              //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
            } else {
              target = table[(next_random >> 16) % table_size];
            }
            if (target == 0) target = next_random % (vocab_size - 1) + 1;
            if (target == word) continue;
            label = 0;
          }
          l2 = target * window_hidden_size;
          f = 0;
          for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
          if (f > MAX_EXP) g = (label - 1) * alpha * currentWeight / negative;
          else if (f < -MAX_EXP) g = (label - 0) * alpha * currentWeight / negative;
          else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha * currentWeight / negative;
          for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c], g) * g * syn_hidden_word_neg[c + l2];
          for (c = 0; c < window_hidden_size; c++) syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g) * g * neu2[c];
        }
        for (a = 0; a < window_hidden_size; a++)
          for (b = 0; b < window_layer_size; b++)
            neu1e[b] += neu2e[a] * syn_window_hidden[a * window_layer_size + b];
        for (a = 0; a < window_hidden_size; a++)
          for (b = 0; b < window_layer_size; b++)
            syn_window_hidden[a * window_layer_size + b] += neu2e[a] * neu1[b];
        // hidden -> in
        for (a = 0; a < window * 2 + 1; a++) if (a != window) {
          c = sentence_position - window + a;
          if (c < 0) continue;
          if (c >= sentence_length) continue;
          last_word = sen[c];
          if (last_word == -1) continue;
          window_offset = a * layer1_size;
          if (a > window) window_offset -= layer1_size;
          for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
        }
      }
    }
    else {
      printf("unknown type %d\n", type);
      exit(0);
    }
    sentence_position++;
    if (sentence_position >= sentence_length) {
      sentence_length = 0;
      continue;
    }
  }
  fclose(fi);
  free(neu1);
  free(neu1e);
  free(neu2);
  free(neu2e);
  pthread_exit(NULL);
}

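// TrainModel() drives the whole run: build or load the vocabulary, allocate the
// network, build the sampling tables, train with num_threads pthreads, and then
// either write the word vectors (text or binary, depending on -binary) or, when
// -classes > 0, run K-means on the vectors and write word classes instead.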
void TrainModel() {
  long a, b, c, d;
  FILE *fo;
  pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
  printf("Starting training using file %s\n", train_file);
  starting_alpha = alpha;
  if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
  if (save_vocab_file[0] != 0) SaveVocab();
  if (output_file[0] == 0) return;
  InitNet();
  if (negative > 0 || nce > 0) InitUnigramTable();
  if (negative_classes_file[0] != 0) InitClassUnigramTable();
  start = clock();
  for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
  for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
  fo = fopen(output_file, "wb");
  if (classes == 0) {
    // Save the word vectors
    fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
    for (a = 0; a < vocab_size; a++) {
      fprintf(fo, "%s ", vocab[a].word);
      if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
      else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
      fprintf(fo, "\n");
    }
  } else {
    // Run K-means on the word vectors
    int clcn = classes, iter = 10, closeid;
    int *centcn = (int *)malloc(classes * sizeof(int));
    int *cl = (int *)calloc(vocab_size, sizeof(int));
    real closev, x;
    real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
    for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
    for (a = 0; a < iter; a++) {
      for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
      for (b = 0; b < clcn; b++) centcn[b] = 1;
      for (c = 0; c < vocab_size; c++) {
        for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
        centcn[cl[c]]++;
      }
      for (b = 0; b < clcn; b++) {
        closev = 0;
        for (c = 0; c < layer1_size; c++) {
          cent[layer1_size * b + c] /= centcn[b];
          closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
        }
        closev = sqrt(closev);
        for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
      }
      for (c = 0; c < vocab_size; c++) {
        closev = -10;
        closeid = 0;
        for (d = 0; d < clcn; d++) {
          x = 0;
          for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
          if (x > closev) {
            closev = x;
            closeid = d;
          }
        }
        cl[c] = closeid;
      }
    }
    // Save the K-means classes
    for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
    free(centcn);
    free(cent);
    free(cl);
  }
  fclose(fo);
}

int ArgPos(char *str, int argc, char **argv) {
  int a;
  for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
    if (a == argc - 1) {
      printf("Argument missing for %s\n", str);
      exit(1);
    }
    return a;
  }
  return -1;
}

int main(int argc, char **argv) {
  int i;
  if (argc == 1) {
    printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
    printf("Options:\n");
    printf("Parameters for training:\n");
    printf("\t-train <file>\n");
    printf("\t\tUse text data from <file> to train the model\n");
    printf("\t-output <file>\n");
    printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
    printf("\t-size <int>\n");
    printf("\t\tSet size of word vectors; default is 100\n");
    printf("\t-window <int>\n");
    printf("\t\tSet max skip length between words; default is 5\n");
    printf("\t-sample <float>\n");
    printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
    printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
    printf("\t-hs <int>\n");
    printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
    printf("\t-negative <int>\n");
    printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
    printf("\t-negative-classes <file>\n");
    printf("\t\tNegative classes to sample from\n");
    printf("\t-nce <int>\n");
    printf("\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
    printf("\t-threads <int>\n");
    printf("\t\tUse <int> threads (default 12)\n");
    printf("\t-iter <int>\n");
    printf("\t\tRun more training iterations (default 5)\n");
    printf("\t-min-count <int>\n");
    printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
    printf("\t-alpha <float>\n");
    printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
    printf("\t-classes <int>\n");
    printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
    printf("\t-debug <int>\n");
    printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
    printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
    printf("\t-save-vocab <file>\n");
    printf("\t\tThe vocabulary will be saved to <file>\n");
    printf("\t-read-vocab <file>\n");
    printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
    printf("\t-type <int>\n");
    printf("\t\tType of embeddings (0 for cbow, 1 for skipgram, 2 for cwindow, 3 for structured skipgram, 4 for senna type)\n");
    printf("\t-cap <int>\n");
    printf("\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
    printf("\nExamples:\n");
    printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
    return 0;
  }
  output_file[0] = 0;
  save_vocab_file[0] = 0;
  read_vocab_file[0] = 0;
  negative_classes_file[0] = 0;
  if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
  if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
  if ((i = ArgPos((char *)"-nce", argc, argv)) > 0) nce = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
  if ((i = ArgPos((char *)"-cap", argc, argv)) > 0) cap = atoi(argv[i + 1]);
  if (type == 0 || type == 2 || type == 4) alpha = 0.05;
  if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
  vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
  vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
  expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
  for (i = 0; i < EXP_TABLE_SIZE; i++) {
    expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP);   // Precompute the exp() table
    expTable[i] = expTable[i] / (expTable[i] + 1);                     // Precompute f(x) = x / (x + 1)
  }
  TrainModel();
  return 0;
}