1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <locale.h>
16#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
19#include <unistd.h>
20#include <math.h>
21#include <pthread.h>
22#include <collocatordb.h>
23
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
28#define MAX_CC 100
29#define MAX_CODE_LENGTH 40
30
31const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
32
33typedef float real; // Precision of float numbers
34
35struct vocab_word {
36 long long cn;
37 int *point;
38 char *word, *code, codelen;
39};
40
41char train_file[MAX_STRING], output_file[MAX_STRING];
42char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
43char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
44char magic_stop_file[MAX_STRING];
45
46struct vocab_word *vocab;
47int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
48 num_threads = 12, min_reduce = 1;
49int *vocab_hash;
50long long *threadPos;
51int *threadIters;
52long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
53long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
54 classes = 0;
55real alpha = 0.025, starting_alpha, sample = 1e-3;
56real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
57real avgWordLength=0;
58clock_t start, start_clock;
59
60real *syn1_window, *syn1neg_window, *syn1nce_window;
61int w_offset, window_layer_size;
62
63int window_hidden_size = 500;
64real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
65 *syn_hidden_word_nce;
66
67int hs = 0, negative = 5;
68const int table_size = 1e8;
69int *table;
70
71long cc = 0; // if > 0, ShowCollocations() prints collocations for words with vocabulary index >= cc
72long tc = 1; // if > 0, ReadVocab() recounts word frequencies on the current training file
73
74// contrastive negative sampling
75char negative_classes_file[MAX_STRING];
76int *word_to_group;
77int *group_to_table; //group_size*table_size
78int class_number;
79
80//nce
81real* noise_distribution;
82int nce = 0;
83
84//param caps
85real CAP_VALUE = 50;
86int cap = 0;
87
88COLLOCATORDB *cdb = NULL;
89
90void capParam(real* array, int index) {
91 if (array[index] > CAP_VALUE)
92 array[index] = CAP_VALUE;
93 else if (array[index] < -CAP_VALUE)
94 array[index] = -CAP_VALUE;
95}
96
97real hardTanh(real x) {
98 if (x >= 1) {
99 return 1;
100 } else if (x <= -1) {
101 return -1;
102 } else {
103 return x;
104 }
105}
106
107real dHardTanh(real x, real g) {
108 if (x > 1 && g > 0) {
109 return 0;
110 }
111 if (x < -1 && g < 0) {
112 return 0;
113 }
114 return 1;
115}
116
117void InitUnigramTable() {
118 int a, i;
119 long long train_words_pow = 0;
120 real d1, power = 0.75;
121 table = (int *) malloc(table_size * sizeof(int));
122 for (a = 0; a < vocab_size; a++)
123 train_words_pow += pow(vocab[a].cn, power);
124 i = 0;
125 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
126 for (a = 0; a < table_size; a++) {
127 table[a] = i;
128 if (a / (real) table_size > d1) {
129 i++;
130 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
131 }
132 if (i >= vocab_size)
133 i = vocab_size - 1;
134 }
135
136 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
137 for (a = 0; a < vocab_size; a++)
138 noise_distribution[a] = pow(vocab[a].cn, power)
139 / (real) train_words_pow;
140}
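// Note on the table above: it realizes the usual word2vec negative-sampling
// distribution P(w) ~ cn(w)^0.75. Each word occupies a contiguous stretch of
// the table_size (1e8) int slots proportional to its smoothed count, so the
// draw
//   target = table[(next_random >> 16) % table_size];
// used in the training branches below picks frequent words more often, but
// less aggressively than the raw unigram distribution would.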
141
142// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
143void ReadWord(char *word, FILE *fin) {
144 int a = 0, ch;
145 while (!feof(fin)) {
146 ch = fgetc(fin);
147 if (ch == 13)
148 continue;
149 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
150 if (a > 0) {
151 if (ch == '\n')
152 ungetc(ch, fin);
153 break;
154 }
155 if (ch == '\n') {
156 strcpy(word, (char *) "</s>");
157 return;
158 } else
159 continue;
160 }
161 word[a] = ch;
162 a++;
163 if (a >= MAX_STRING - 1)
164 a--; // Truncate too long words
165 }
166 word[a] = 0;
167}
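// Note: a newline is returned as the special token "</s>", so sentence/line
// boundaries become vocabulary index 0 (LearnVocabFromTrainFile() adds it
// first and SortVocab() keeps it in first position; the training loop treats
// word index 0 as end of sentence). Words longer than MAX_STRING-1
// characters are silently truncated.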
168
169// Returns hash value of a word
170int GetWordHash(char *word) {
171 unsigned long long a, hash = 0;
172 for (a = 0; a < strlen(word); a++)
173 hash = hash * 257 + word[a];
174 hash = hash % vocab_hash_size;
175 return hash;
176}
177
178// Returns position of a word in the vocabulary; if the word is not found, returns -1
179int SearchVocab(char *word) {
180 unsigned int hash = GetWordHash(word);
181 while (1) {
182 if (vocab_hash[hash] == -1)
183 return -1;
184 if (!strcmp(word, vocab[vocab_hash[hash]].word))
185 return vocab_hash[hash];
186 hash = (hash + 1) % vocab_hash_size;
187 }
188 return -1;
189}
190
191// Reads a word and returns its index in the vocabulary
192int ReadWordIndex(FILE *fin) {
193 char word[MAX_STRING];
194 ReadWord(word, fin);
195 if (feof(fin))
196 return -1;
197 return SearchVocab(word);
198}
199
200// Adds a word to the vocabulary
201int AddWordToVocab(char *word) {
202 unsigned int hash, length = strlen(word) + 1;
203 if (length > MAX_STRING)
204 length = MAX_STRING;
205 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
206 strcpy(vocab[vocab_size].word, word);
207 vocab[vocab_size].cn = 0;
208 vocab_size++;
209 // Reallocate memory if needed
210 if (vocab_size + 2 >= vocab_max_size) {
211 vocab_max_size += 1000;
212 vocab = (struct vocab_word *) realloc(vocab,
213 vocab_max_size * sizeof(struct vocab_word));
214 }
215 hash = GetWordHash(word);
216 while (vocab_hash[hash] != -1)
217 hash = (hash + 1) % vocab_hash_size;
218 vocab_hash[hash] = vocab_size - 1;
219 return vocab_size - 1;
220}
221
222// Used later for sorting by word counts
223int VocabCompare(const void *a, const void *b) {
224 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
225}
226
227// Sorts the vocabulary by frequency using word counts
228void SortVocab() {
229 int a, size;
230 unsigned int hash;
231 // Sort the vocabulary and keep </s> at the first position
232 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
233 for (a = 0; a < vocab_hash_size; a++)
234 vocab_hash[a] = -1;
235 size = vocab_size;
236 train_words = 0;
237 for (a = 0; a < size; a++) {
238 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
239 // Words occurring less than min_count times will be discarded from the vocab
240 if ((vocab[a].cn < min_count) && (a != 0)) {
241 vocab_size--;
242 free(vocab[a].word);
243 } else {
244 // Hash will be re-computed, as it is no longer valid after the sorting
245 hash = GetWordHash(vocab[a].word);
246 while (vocab_hash[hash] != -1)
247 hash = (hash + 1) % vocab_hash_size;
248 vocab_hash[hash] = a;
249 train_words += vocab[a].cn;
250 }
251 }
252 avgWordLength /= train_words;
253 vocab = (struct vocab_word *) realloc(vocab,
254 (vocab_size + 1) * sizeof(struct vocab_word));
255 // Allocate memory for the binary tree construction
256 for (a = 0; a < vocab_size; a++) {
257 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
258 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
259 }
260}
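// Note: avgWordLength is the average number of characters per token
// (including the separating whitespace), summed over the counts of all words
// still in the vocabulary but divided by the number of kept tokens
// (train_words). ReadVocab() later divides file_size by it to estimate how
// many trainable tokens the training file contains.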
261
262// Reduces the vocabulary by removing infrequent tokens
263void ReduceVocab() {
264 int a, b = 0;
265 unsigned int hash;
266 for (a = 0; a < vocab_size; a++)
267 if (vocab[a].cn > min_reduce) {
268 vocab[b].cn = vocab[a].cn;
269 vocab[b].word = vocab[a].word;
270 b++;
271 } else
272 free(vocab[a].word);
273 vocab_size = b;
274 for (a = 0; a < vocab_hash_size; a++)
275 vocab_hash[a] = -1;
276 for (a = 0; a < vocab_size; a++) {
277 // Hash will be re-computed, as it is no longer valid
278 hash = GetWordHash(vocab[a].word);
279 while (vocab_hash[hash] != -1)
280 hash = (hash + 1) % vocab_hash_size;
281 vocab_hash[hash] = a;
282 }
283 fflush(stdout);
284 min_reduce++;
285}
286
287// Create binary Huffman tree using the word counts
288// Frequent words will have short unique binary codes
289void CreateBinaryTree() {
290 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
291 char code[MAX_CODE_LENGTH];
292 long long *count = (long long *) calloc(vocab_size * 2 + 1,
293 sizeof(long long));
294 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
295 sizeof(long long));
296 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
297 sizeof(long long));
298 // todo: this needs to operate on a sorted copy of vocab[a].cn if we use local counts
299 for (a = 0; a < vocab_size; a++)
300 count[a] = vocab[a].cn;
301 for (a = vocab_size; a < vocab_size * 2; a++)
302 count[a] = 1e15;
303 pos1 = vocab_size - 1;
304 pos2 = vocab_size;
305 // Following algorithm constructs the Huffman tree by adding one node at a time
306 for (a = 0; a < vocab_size - 1; a++) {
307 // First, find two smallest nodes 'min1, min2'
308 if (pos1 >= 0) {
309 if (count[pos1] < count[pos2]) {
310 min1i = pos1;
311 pos1--;
312 } else {
313 min1i = pos2;
314 pos2++;
315 }
316 } else {
317 min1i = pos2;
318 pos2++;
319 }
320 if (pos1 >= 0) {
321 if (count[pos1] < count[pos2]) {
322 min2i = pos1;
323 pos1--;
324 } else {
325 min2i = pos2;
326 pos2++;
327 }
328 } else {
329 min2i = pos2;
330 pos2++;
331 }
332 count[vocab_size + a] = count[min1i] + count[min2i];
333 parent_node[min1i] = vocab_size + a;
334 parent_node[min2i] = vocab_size + a;
335 binary[min2i] = 1;
336 }
337 // Now assign binary code to each vocabulary word
338 for (a = 0; a < vocab_size; a++) {
339 b = a;
340 i = 0;
341 while (1) {
342 code[i] = binary[b];
343 point[i] = b;
344 i++;
345 b = parent_node[b];
346 if (b == vocab_size * 2 - 2)
347 break;
348 }
349 vocab[a].codelen = i;
350 vocab[a].point[0] = vocab_size - 2;
351 for (b = 0; b < i; b++) {
352 vocab[a].code[i - b - 1] = code[b];
353 vocab[a].point[i - b] = point[b] - vocab_size;
354 }
355 }
356 free(count);
357 free(binary);
358 free(parent_node);
359}
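// Note: count[0..vocab_size-1] holds the sorted word counts and
// count[vocab_size..] the internal Huffman nodes; the two-pointer scan over
// pos1/pos2 works because both halves stay in non-decreasing order. The
// per-word code is collected leaf-to-root and then reversed, and point[]
// stores the internal-node indices (shifted by vocab_size) along the path,
// which the hierarchical-softmax branches below use as row offsets into
// syn1 / syn1_window.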
360
361void LearnVocabFromTrainFile() {
362 char word[MAX_STRING];
363 FILE *fin;
364 long long a, i;
365 for (a = 0; a < vocab_hash_size; a++)
366 vocab_hash[a] = -1;
367 fin = fopen(train_file, "rb");
368 if (fin == NULL) {
369 printf("ERROR: training data file not found!\n");
370 exit(1);
371 }
372 vocab_size = 0;
373 AddWordToVocab((char *) "</s>");
374 while (1) {
375 ReadWord(word, fin);
376 if (feof(fin))
377 break;
378 train_words++;
379 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
380 printf("%lldK%c", train_words / 1000, 13);
381 fflush(stdout);
382 }
383 i = SearchVocab(word);
384 if (i == -1) {
385 a = AddWordToVocab(word);
386 vocab[a].cn = 1;
387 } else
388 vocab[i].cn++;
389 if (vocab_size > vocab_hash_size * 0.7)
390 ReduceVocab();
391 }
392 SortVocab();
393 if (debug_mode > 0) {
394 printf("Vocab size: %lld\n", vocab_size);
395 printf("Words in train file: %lld\n", train_words);
396 }
397 file_size = ftell(fin);
398 fclose(fin);
399}
400
401void SaveVocab() {
402 long long i;
403 FILE *fo = fopen(save_vocab_file, "wb");
404 for (i = 0; i < vocab_size; i++)
405 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
406 fclose(fo);
407}
408
409void ReadVocab() {
410 long long a, i = 0;
411 char c;
412 char word[MAX_STRING];
413 FILE *fin = fopen(read_vocab_file, "rb");
414 if (fin == NULL) {
415 printf("Vocabulary file not found\n");
416 exit(1);
417 }
418 for (a = 0; a < vocab_hash_size; a++)
419 vocab_hash[a] = -1;
420 vocab_size = 0;
421 while (1) {
422 ReadWord(word, fin);
423 if (feof(fin))
424 break;
425 a = AddWordToVocab(word);
426 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
427 i++;
428 }
429 fclose(fin);
430 // this is just for determining train_words by avgWordLength
431 fin = fopen(train_file, "rb");
432 if (fin == NULL) {
433 printf("ERROR: training data file not found!\n");
434 exit(1);
435 }
436 fseek(fin, 0, SEEK_END);
437 file_size = ftell(fin);
438 fclose(fin);
439 SortVocab();
440 train_words = file_size / avgWordLength;
441 if(debug_mode > 0)
442 printf("Estimated words in train file: %'lld\n", train_words);
443 if (tc > 0) {
444 // recalculate counts for the current corpus
445 // adapted from LearnVocabFromTrainFile()
446 // note that we don't sort or rehash the vocabulary again, we only adapt vocab[.].cn.
447 fin = fopen(train_file, "rb");
448 if (fin == NULL) {
449 printf("ERROR: training data file not found!\n");
450 exit(1);
451 }
452 // reset vocabulary counts
453 for (a = 0; a < vocab_size; a++)
454 vocab[a].cn = 0;
455 long long train_words1 = 0;
456 while (1) {
457 ReadWord(word, fin);
458 if (feof(fin))
459 break;
460 if ((debug_mode > 1) && (train_words1 % 100000 == 0)) {
461 printf("%lldK%c", train_words1 / 1000, 13);
462 fflush(stdout);
463 }
464 i = SearchVocab(word);
465 // the word must be in the vocabulary but we don't issue a warning,
466 // because it may have been cut off due to min_count.
467 if (i >= 0) {
468 vocab[i].cn++;
469 train_words1++;
470 }
471 }
472 // we cannot have 0 counts.
473 for (a = 0; a < vocab_size; a++) {
474 if(vocab[a].cn == 0) {
475 vocab[a].cn = 1;
476 train_words1++;
477 }
478 }
479 if (debug_mode > 0) {
480 printf("Vocab size: %lld\n", vocab_size);
481 printf("Words in current train file: %'lld\n", train_words1);
482 }
483 fseek(fin, 0, SEEK_END);
484 file_size = ftell(fin);
485 fclose(fin);
486 }
487}
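// Note on the tc > 0 branch above: the vocabulary itself (and its hashing and
// sorting) is taken as-is from the vocab file; only the per-word counts are
// replaced by counts from the current training file, and words that do not
// occur there keep a count of 1 so that the subsampling and unigram-table
// code never sees a zero count.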
488
489void InitClassUnigramTable() {
490 // TODO: this probably needs to be adapted for dealing with subcorpus adjusted vocabulary counts
491 long long a, c;
492 printf("loading class unigrams \n");
493 FILE *fin = fopen(negative_classes_file, "rb");
494 if (fin == NULL) {
495 printf("ERROR: class file not found!\n");
496 exit(1);
497 }
498 word_to_group = (int *) malloc(vocab_size * sizeof(int));
499 for (a = 0; a < vocab_size; a++)
500 word_to_group[a] = -1;
501 char class[MAX_STRING];
502 char prev_class[MAX_STRING];
503 prev_class[0] = 0;
504 char word[MAX_STRING];
505 class_number = -1;
506 while (1) {
507 if (feof(fin))
508 break;
509 ReadWord(class, fin);
510 ReadWord(word, fin);
511 int word_index = SearchVocab(word);
512 if (word_index != -1) {
513 if (strcmp(class, prev_class) != 0) {
514 class_number++;
515 strcpy(prev_class, class);
516 }
517 word_to_group[word_index] = class_number;
518 }
519 ReadWord(word, fin);
520 }
521 class_number++;
522 fclose(fin);
523
524 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
525 long long train_words_pow = 0;
526 real d1, power = 0.75;
527
528 for (c = 0; c < class_number; c++) {
529 long long offset = c * table_size;
530 train_words_pow = 0;
531 for (a = 0; a < vocab_size; a++)
532 if (word_to_group[a] == c)
533 train_words_pow += pow(vocab[a].cn, power);
534 int i = 0;
535 while (word_to_group[i] != c && i < vocab_size)
536 i++;
537 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
538 for (a = 0; a < table_size; a++) {
539 //printf("index %lld , word %d\n", a, i);
540 group_to_table[offset + a] = i;
541 if (a / (real) table_size > d1) {
542 i++;
543 while (word_to_group[i] != c && i < vocab_size)
544 i++;
545 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
546 }
547 if (i >= vocab_size)
548 while (word_to_group[i] != c && i >= 0)
549 i--;
550 }
551 }
552}
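// Note: group_to_table holds one table_size-long sampling table per class read
// from -negative-classes, built the same way as the global unigram table but
// restricted to the words of that class; during training, negatives for a word
// that belongs to a class are drawn from its own class table.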
553
554void SaveArgs(int argc, char **argv) {
555 unsigned int i;
556 char args_file[MAX_STRING];
557 strcpy(args_file, output_file);
558 strcat(args_file, ".args");
559 FILE *fargs = fopen(args_file, "w");
560 if (fargs == NULL) {
561 printf("Cannot save args to %s.\n", args_file);
562 return;
563 }
564
565 for(i=1; i<argc; i++)
566 fprintf(fargs, "%s ", argv[i]);
567
568 fprintf(fargs, "\n");
569 fclose(fargs);
570
571 return;
572}
573
574void SaveNet() {
575 if (type == 4 || negative <= 0) {
576 fprintf(stderr,
577 "save-net only supported for type 0,1,2,3 with negative sampling\n");
578 return;
579 }
580
581 FILE *fnet = fopen(save_net_file, "wb");
582 if (fnet == NULL) {
583 printf("Cannot open net parameter file for writing\n");
584 exit(1);
585 }
586 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
587 if (type == 0 || type == 1) {
588 fwrite(syn1neg, sizeof(real), vocab_size * layer1_size, fnet);
589 }
590 if (type == 2 || type == 3) {
591 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
592 }
593 fclose(fnet);
594}
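// Note on the net file format written above: a raw dump with no header, just
// syn0 (vocab_size * layer1_size reals) followed by the output weights
// (syn1neg for types 0/1, syn1neg_window for types 2/3). A minimal sketch of
// reading it back outside this program, assuming the same vocab/layer sizes
// and float precision (this mirrors what InitNet() does with -read-net):
//
//   FILE *f = fopen("model.net", "rb");   /* hypothetical file name */
//   float *in  = malloc((size_t) vocab_size * layer1_size * sizeof(float));
//   float *out = malloc((size_t) vocab_size * layer1_size * sizeof(float));
//   fread(in,  sizeof(float), (size_t) vocab_size * layer1_size, f);
//   fread(out, sizeof(float), (size_t) vocab_size * layer1_size, f);  /* use window_layer_size for types 2/3 */
//   fclose(f);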
595
596void InitNet() {
597 long long a, b;
598 unsigned long long next_random = 1;
599 long long read;
600
601 window_layer_size = layer1_size * window * 2;
602 a = posix_memalign((void **) &syn0, 128,
603 (long long) vocab_size * layer1_size * sizeof(real));
604 if (syn0 == NULL) {
605 printf("Memory allocation failed\n");
606 exit(1);
607 }
608
609 if (hs) {
610 a = posix_memalign((void **) &syn1, 128,
611 (long long) vocab_size * layer1_size * sizeof(real));
612 if (syn1 == NULL) {
613 printf("Memory allocation failed\n");
614 exit(1);
615 }
616 a = posix_memalign((void **) &syn1_window, 128,
617 (long long) vocab_size * window_layer_size * sizeof(real));
618 if (syn1_window == NULL) {
619 printf("Memory allocation failed\n");
620 exit(1);
621 }
622 a = posix_memalign((void **) &syn_hidden_word, 128,
623 (long long) vocab_size * window_hidden_size * sizeof(real));
624 if (syn_hidden_word == NULL) {
625 printf("Memory allocation failed\n");
626 exit(1);
627 }
628
629 for (a = 0; a < vocab_size; a++)
630 for (b = 0; b < layer1_size; b++)
631 syn1[a * layer1_size + b] = 0;
632 for (a = 0; a < vocab_size; a++)
633 for (b = 0; b < window_layer_size; b++)
634 syn1_window[a * window_layer_size + b] = 0;
635 for (a = 0; a < vocab_size; a++)
636 for (b = 0; b < window_hidden_size; b++)
637 syn_hidden_word[a * window_hidden_size + b] = 0;
638 }
639 if (negative > 0) {
640 if (type == 0 || type == 1) {
641 a = posix_memalign((void **) &syn1neg, 128,
642 (long long) vocab_size * layer1_size * sizeof(real));
643 if (syn1neg == NULL) {
644 printf("Memory allocation failed\n");
645 exit(1);
646 }
647 for (a = 0; a < vocab_size; a++)
648 for (b = 0; b < layer1_size; b++)
649 syn1neg[a * layer1_size + b] = 0;
650 } else if (type == 2 || type == 3) {
651 a = posix_memalign((void **) &syn1neg_window, 128,
652 (long long) vocab_size * window_layer_size * sizeof(real));
653 if (syn1neg_window == NULL) {
654 printf("Memory allocation failed\n");
655 exit(1);
656 }
657 for (a = 0; a < vocab_size; a++)
658 for (b = 0; b < window_layer_size; b++)
659 syn1neg_window[a * window_layer_size + b] = 0;
660 } else if (type == 4) {
661 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
662 (long long) vocab_size * window_hidden_size * sizeof(real));
663 if (syn_hidden_word_neg == NULL) {
664 printf("Memory allocation failed\n");
665 exit(1);
666 }
667 for (a = 0; a < vocab_size; a++)
668 for (b = 0; b < window_hidden_size; b++)
669 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
670 }
671 }
672 if (nce > 0) {
673 a = posix_memalign((void **) &syn1nce, 128,
674 (long long) vocab_size * layer1_size * sizeof(real));
675 if (syn1nce == NULL) {
676 printf("Memory allocation failed\n");
677 exit(1);
678 }
679 a = posix_memalign((void **) &syn1nce_window, 128,
680 (long long) vocab_size * window_layer_size * sizeof(real));
681 if (syn1nce_window == NULL) {
682 printf("Memory allocation failed\n");
683 exit(1);
684 }
685 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
686 (long long) vocab_size * window_hidden_size * sizeof(real));
687 if (syn_hidden_word_nce == NULL) {
688 printf("Memory allocation failed\n");
689 exit(1);
690 }
691
692 for (a = 0; a < vocab_size; a++)
693 for (b = 0; b < layer1_size; b++)
694 syn1nce[a * layer1_size + b] = 0;
695 for (a = 0; a < vocab_size; a++)
696 for (b = 0; b < window_layer_size; b++)
697 syn1nce_window[a * window_layer_size + b] = 0;
698 for (a = 0; a < vocab_size; a++)
699 for (b = 0; b < window_hidden_size; b++)
700 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
701 }
702
703 if (type == 4) {
704 a = posix_memalign((void **) &syn_window_hidden, 128,
705 window_hidden_size * window_layer_size * sizeof(real));
706 if (syn_window_hidden == NULL) {
707 printf("Memory allocation failed\n");
708 exit(1);
709 }
710 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
711 next_random = next_random * (unsigned long long) 25214903917 + 11;
712 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
713 - 0.5) / (window_hidden_size * window_layer_size);
714 }
715 }
716
717 if (read_net_file[0] == 0) {
718 for (a = 0; a < vocab_size; a++)
719 for (b = 0; b < layer1_size; b++) {
720 next_random = next_random * (unsigned long long) 25214903917
721 + 11;
722 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
723 / (real) 65536) - 0.5) / layer1_size;
724 }
725 } else if ((type == 0 || type == 1) && negative > 0) {
726 FILE *fnet = fopen(read_net_file, "rb");
727 if (fnet == NULL) {
728 printf("Net parameter file not found\n");
729 exit(1);
730 }
731 printf("vocab-size: %lld, layer1_size: %lld\n",
732 vocab_size, layer1_size);
733 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
734 if (read != vocab_size * layer1_size) {
735 fprintf(stderr, "read-net failed %lld\n", read);
736 exit(-1);
737 }
738 read = fread(syn1neg, sizeof(real),
739 vocab_size * layer1_size, fnet);
740 if (read != (long long) vocab_size * layer1_size) {
741 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n",
742 read,
743 (long long) vocab_size * layer1_size);
744 exit(-1);
745 }
746 fgetc(fnet);
747 if (!feof(fnet)) {
748 fprintf(stderr,
749 "Remaining bytes in net-file after read-net. File position: %ld\n",
750 ftell(fnet));
751 exit(-1);
752 }
753 fclose(fnet);
754 } else if ((type == 2 || type == 3) && negative > 0) {
755 FILE *fnet = fopen(read_net_file, "rb");
756 if (fnet == NULL) {
757 printf("Net parameter file not found\n");
758 exit(1);
759 }
760 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n",
761 vocab_size, layer1_size, window_layer_size);
762 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
763 if (read != vocab_size * layer1_size) {
764 fprintf(stderr, "read-net failed %lld\n", read);
765 exit(-1);
766 }
767 read = fread(syn1neg_window, sizeof(real),
768 vocab_size * window_layer_size, fnet);
769 if (read != (long long) vocab_size * window_layer_size) {
770 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n",
771 read,
772 (long long) vocab_size * window_layer_size);
773 exit(-1);
774 }
775 fgetc(fnet);
776 if (!feof(fnet)) {
777 fprintf(stderr,
778 "Remaining bytes in net-file after read-net. File position: %ld\n",
779 ftell(fnet));
780 exit(-1);
781 }
782 fclose(fnet);
783 } else {
784 fprintf(stderr,
785 "read-net only supported for types 0, 1, 2, 3 with negative sampling\n");
786 exit(-1);
787 }
788
789 CreateBinaryTree();
790}
791
792char *currentDateTime(char *buf, real offset) {
793 time_t t;
794 time(&t);
795 t += (long) offset;
796 struct tm tstruct;
797 tstruct = *localtime(&t);
798 strftime(buf, 80, "%c", &tstruct);
799 return buf;
800}
801
802void *MonitorThread(void *id) {
803 char *timebuf = malloc(80);
804 int i, n=num_threads;
805 long long sum;
806 sleep(1);
807 while(n > 0) {
808 sleep(1);
809 sum = n = 0;
810 for(i=0; i < num_threads; i++) {
811 if(threadPos[i] >= 0) {
812 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
813 n++;
814 } else {
815 sum += iter * file_size / num_threads;
816 }
817 }
818 if(n == 0)
819 break;
820 real finished_portion = (real) sum / (float) (file_size * iter);
821 long long now = time(NULL);
822 long long elapsed = (now - start);
823 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
824
825 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
826 alpha,
827 finished_portion * 100,
828 (float) sum / elapsed / 1000,
829 elapsed,
830 ttg,
831 currentDateTime(timebuf, ttg)
832 );
833 fflush(stdout);
834 }
835 pthread_exit(NULL);
836}
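// Note: the monitor thread estimates progress from the byte positions the
// worker threads publish in threadPos[] and the iterations they still have to
// run (threadIters[]); finished_portion = processed_bytes / (file_size * iter),
// and TTG/ETA are a simple linear extrapolation of the elapsed wall-clock time.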
837
838void *TrainModelThread(void *id) {
839 long long a, b, d, cw, word, last_word, sentence_length = 0,
840 sentence_position = 0;
841 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
842 long long l1, l2, c, target, label, local_iter = iter;
843 unsigned long long next_random = (long long) id;
844 real f, g;
845 int input_len_1 = layer1_size;
846 int window_offset = -1;
847 if (type == 2 || type == 4) {
848 input_len_1 = window_layer_size;
849 }
850 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
851 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
852 threadIters[(long) id] = iter;
853
854 int input_len_2 = 0;
855 if (type == 4) {
856 input_len_2 = window_hidden_size;
857 }
858 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
859 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
860
861 FILE *fi = fopen(train_file, "rb");
862 long long start_pos = file_size / (long long) num_threads * (long long) id;
863 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
864 long long current_pos = start_pos;
865 long long last_pos = start_pos;
866 fseek(fi, start_pos, SEEK_SET);
867 while (1) {
868 if (word_count - last_word_count > 10000) {
869 // if ((current_pos - last_pos > 100000)) {
870 // PF: changed back, because it seems that alpha is not correctly adjusted otherwise.
871 word_count_actual += word_count - last_word_count;
872 last_pos = current_pos;
873 last_word_count = word_count;
874 alpha = starting_alpha
875 * (1 - word_count_actual / (real) (iter * train_words + 1));
876 if (alpha < starting_alpha * 0.0001)
877 alpha = starting_alpha * 0.0001;
878 }
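// Note: the learning rate decays linearly with the global progress
// word_count_actual / (iter * train_words), which is shared across all
// threads, and is floored at 0.0001 * starting_alpha, so updates late in
// training are proportionally smaller.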
879 if (sentence_length == 0) {
880 while (1) {
881 word = ReadWordIndex(fi);
882 if (feof(fi))
883 break;
884 if (word == -1)
885 continue;
886 word_count++;
887 if (word == 0)
888 break;
889 // The subsampling randomly discards frequent words while keeping the ranking same
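// Equivalently: with t = sample and f = vocab[word].cn / train_words, the word
// is kept with probability min(1, sqrt(t/f) + t/f) against a uniform draw from
// (next_random & 0xFFFF) / 65536, so words rarer than roughly 2.6 * sample are
// never discarded.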
890 if (sample > 0) {
891 real ran = (sqrt(vocab[word].cn / (sample * train_words))
892 + 1) * (sample * train_words) / vocab[word].cn;
893 next_random = next_random * (unsigned long long) 25214903917
894 + 11;
895 if (ran < (next_random & 0xFFFF) / (real) 65536) {
896 if (type == 3) // in structured skipgrams
897 word = -2; // keep the window position correct
898 else
899 continue;
900 }
901 }
902 sen[sentence_length] = word;
903 sentence_length++;
904 if (sentence_length >= MAX_SENTENCE_LENGTH)
905 break;
906 }
907 sentence_position = 0;
908 }
909 current_pos = threadPos[(long) id] = ftell(fi);
910 if (feof(fi) || current_pos >= end_pos ) {
911 word_count_actual += word_count - last_word_count;
912 threadIters[(long) id]--;
913 local_iter--;
914 if (local_iter == 0)
915 break;
916 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
917 printf("Magic stop file %s found. Stopping training ...\n", magic_stop_file);
918 break;
919 }
920 word_count = 0;
921 last_word_count = 0;
922 sentence_length = 0;
923 fseek(fi, file_size / (long long) num_threads * (long long) id,
924 SEEK_SET);
925 continue;
926 }
927 word = sen[sentence_position];
928 while (word == -2 && sentence_position<sentence_length)
929 word = sen[++sentence_position];
930 if (sentence_position>=sentence_length) {
931 sentence_length=0;
932 continue;
933 }
934 if (word < 0)
935 continue;
936 for (c = 0; c < input_len_1; c++)
937 neu1[c] = 0;
938 for (c = 0; c < input_len_1; c++)
939 neu1e[c] = 0;
940 for (c = 0; c < input_len_2; c++)
941 neu2[c] = 0;
942 for (c = 0; c < input_len_2; c++)
943 neu2e[c] = 0;
944 next_random = next_random * (unsigned long long) 25214903917 + 11;
945 b = next_random % window;
946 if (type == 0) { //train the cbow architecture
947 // in -> hidden
948 cw = 0;
949 for (a = b; a < window * 2 + 1 - b; a++)
950 if (a != window) {
951 c = sentence_position - window + a;
952 if (c < 0)
953 continue;
954 if (c >= sentence_length)
955 continue;
956 last_word = sen[c];
957 if (last_word == -1)
958 continue;
959 for (c = 0; c < layer1_size; c++)
960 neu1[c] += syn0[c + last_word * layer1_size];
961 cw++;
962 }
963 if (cw) {
964 for (c = 0; c < layer1_size; c++)
965 neu1[c] /= cw;
966 if (hs)
967 for (d = 0; d < vocab[word].codelen; d++) {
968 f = 0;
969 l2 = vocab[word].point[d] * layer1_size;
970 // Propagate hidden -> output
971 for (c = 0; c < layer1_size; c++)
972 f += neu1[c] * syn1[c + l2];
973 if (f <= -MAX_EXP)
974 continue;
975 else if (f >= MAX_EXP)
976 continue;
977 else
978 f = expTable[(int) ((f + MAX_EXP)
979 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
980 // 'g' is the gradient multiplied by the learning rate
981 g = (1 - vocab[word].code[d] - f) * alpha;
982 // Propagate errors output -> hidden
983 for (c = 0; c < layer1_size; c++)
984 neu1e[c] += g * syn1[c + l2];
985 // Learn weights hidden -> output
986 for (c = 0; c < layer1_size; c++)
987 syn1[c + l2] += g * neu1[c];
988 if (cap == 1)
989 for (c = 0; c < layer1_size; c++)
990 capParam(syn1, c + l2);
991 }
992 // NEGATIVE SAMPLING
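// d == 0 is the positive (observed) example; the remaining `negative` draws
// come from the unigram table (or the word's class table). The update below is
// g = (label - sigmoid(neu1 . syn1neg[target])) * alpha, with the sigmoid read
// from expTable and clipped outside [-MAX_EXP, MAX_EXP].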
993 if (negative > 0)
994 for (d = 0; d < negative + 1; d++) {
995 if (d == 0) {
996 target = word;
997 label = 1;
998 } else {
999 next_random = next_random
1000 * (unsigned long long) 25214903917 + 11;
1001 if (word_to_group != NULL
1002 && word_to_group[word] != -1) {
1003 target = word;
1004 while (target == word) {
1005 target = group_to_table[word_to_group[word]
1006 * table_size
1007 + (next_random >> 16) % table_size];
1008 next_random = next_random
1009 * (unsigned long long) 25214903917
1010 + 11;
1011 }
1012 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1013 } else {
1014 target =
1015 table[(next_random >> 16) % table_size];
1016 }
1017 if (target == 0)
1018 target = next_random % (vocab_size - 1) + 1;
1019 if (target == word)
1020 continue;
1021 label = 0;
1022 }
1023 l2 = target * layer1_size;
1024 f = 0;
1025 for (c = 0; c < layer1_size; c++)
1026 f += neu1[c] * syn1neg[c + l2];
1027 if (f > MAX_EXP)
1028 g = (label - 1) * alpha;
1029 else if (f < -MAX_EXP)
1030 g = (label - 0) * alpha;
1031 else
1032 g = (label
1033 - expTable[(int) ((f + MAX_EXP)
1034 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1035 * alpha;
1036 for (c = 0; c < layer1_size; c++)
1037 neu1e[c] += g * syn1neg[c + l2];
1038 for (c = 0; c < layer1_size; c++)
1039 syn1neg[c + l2] += g * neu1[c];
1040 if (cap == 1)
1041 for (c = 0; c < layer1_size; c++)
1042 capParam(syn1neg, c + l2);
1043 }
1044 // Noise Contrastive Estimation
1045 if (nce > 0)
1046 for (d = 0; d < nce + 1; d++) {
1047 if (d == 0) {
1048 target = word;
1049 label = 1;
1050 } else {
1051 next_random = next_random
1052 * (unsigned long long) 25214903917 + 11;
1053 if (word_to_group != NULL
1054 && word_to_group[word] != -1) {
1055 target = word;
1056 while (target == word) {
1057 target = group_to_table[word_to_group[word]
1058 * table_size
1059 + (next_random >> 16) % table_size];
1060 next_random = next_random
1061 * (unsigned long long) 25214903917
1062 + 11;
1063 }
1064 } else {
1065 target =
1066 table[(next_random >> 16) % table_size];
1067 }
1068 if (target == 0)
1069 target = next_random % (vocab_size - 1) + 1;
1070 if (target == word)
1071 continue;
1072 label = 0;
1073 }
1074 l2 = target * layer1_size;
1075 f = 0;
1076
1077 for (c = 0; c < layer1_size; c++)
1078 f += neu1[c] * syn1nce[c + l2];
1079 if (f > MAX_EXP)
1080 g = (label - 1) * alpha;
1081 else if (f < -MAX_EXP)
1082 g = (label - 0) * alpha;
1083 else {
1084 f = exp(f);
1085 g =
1086 (label
1087 - f
1088 / (noise_distribution[target]
1089 * nce + f)) * alpha;
1090 }
1091 for (c = 0; c < layer1_size; c++)
1092 neu1e[c] += g * syn1nce[c + l2];
1093 for (c = 0; c < layer1_size; c++)
1094 syn1nce[c + l2] += g * neu1[c];
1095 if (cap == 1)
1096 for (c = 0; c < layer1_size; c++)
1097 capParam(syn1nce, c + l2);
1098 }
1099 // hidden -> in
1100 for (a = b; a < window * 2 + 1 - b; a++)
1101 if (a != window) {
1102 c = sentence_position - window + a;
1103 if (c < 0)
1104 continue;
1105 if (c >= sentence_length)
1106 continue;
1107 last_word = sen[c];
1108 if (last_word == -1)
1109 continue;
1110 for (c = 0; c < layer1_size; c++)
1111 syn0[c + last_word * layer1_size] += neu1e[c];
1112 }
1113 }
1114 } else if (type == 1) { //train skip-gram
1115 for (a = b; a < window * 2 + 1 - b; a++)
1116 if (a != window) {
1117 c = sentence_position - window + a;
1118 if (c < 0)
1119 continue;
1120 if (c >= sentence_length)
1121 continue;
1122 last_word = sen[c];
1123 if (last_word == -1)
1124 continue;
1125 l1 = last_word * layer1_size;
1126 for (c = 0; c < layer1_size; c++)
1127 neu1e[c] = 0;
1128 // HIERARCHICAL SOFTMAX
1129 if (hs)
1130 for (d = 0; d < vocab[word].codelen; d++) {
1131 f = 0;
1132 l2 = vocab[word].point[d] * layer1_size;
1133 // Propagate hidden -> output
1134 for (c = 0; c < layer1_size; c++)
1135 f += syn0[c + l1] * syn1[c + l2];
1136 if (f <= -MAX_EXP)
1137 continue;
1138 else if (f >= MAX_EXP)
1139 continue;
1140 else
1141 f = expTable[(int) ((f + MAX_EXP)
1142 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1143 // 'g' is the gradient multiplied by the learning rate
1144 g = (1 - vocab[word].code[d] - f) * alpha;
1145 // Propagate errors output -> hidden
1146 for (c = 0; c < layer1_size; c++)
1147 neu1e[c] += g * syn1[c + l2];
1148 // Learn weights hidden -> output
1149 for (c = 0; c < layer1_size; c++)
1150 syn1[c + l2] += g * syn0[c + l1];
1151 if (cap == 1)
1152 for (c = 0; c < layer1_size; c++)
1153 capParam(syn1, c + l2);
1154 }
1155 // NEGATIVE SAMPLING
1156 if (negative > 0)
1157 for (d = 0; d < negative + 1; d++) {
1158 if (d == 0) {
1159 target = word;
1160 label = 1;
1161 } else {
1162 next_random = next_random
1163 * (unsigned long long) 25214903917 + 11;
1164 if (word_to_group != NULL
1165 && word_to_group[word] != -1) {
1166 target = word;
1167 while (target == word) {
1168 target =
1169 group_to_table[word_to_group[word]
1170 * table_size
1171 + (next_random >> 16)
1172 % table_size];
1173 next_random =
1174 next_random
1175 * (unsigned long long) 25214903917
1176 + 11;
1177 }
1178 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1179 } else {
1180 target = table[(next_random >> 16)
1181 % table_size];
1182 }
1183 if (target == 0)
1184 target = next_random % (vocab_size - 1) + 1;
1185 if (target == word)
1186 continue;
1187 label = 0;
1188 }
1189 l2 = target * layer1_size;
1190 f = 0;
1191 for (c = 0; c < layer1_size; c++)
1192 f += syn0[c + l1] * syn1neg[c + l2];
1193 if (f > MAX_EXP)
1194 g = (label - 1) * alpha;
1195 else if (f < -MAX_EXP)
1196 g = (label - 0) * alpha;
1197 else
1198 g =
1199 (label
1200 - expTable[(int) ((f + MAX_EXP)
1201 * (EXP_TABLE_SIZE
1202 / MAX_EXP / 2))])
1203 * alpha;
1204 for (c = 0; c < layer1_size; c++)
1205 neu1e[c] += g * syn1neg[c + l2];
1206 for (c = 0; c < layer1_size; c++)
1207 syn1neg[c + l2] += g * syn0[c + l1];
1208 if (cap == 1)
1209 for (c = 0; c < layer1_size; c++)
1210 capParam(syn1neg, c + l2);
1211 }
1212 //Noise Contrastive Estimation
1213 if (nce > 0)
1214 for (d = 0; d < nce + 1; d++) {
1215 if (d == 0) {
1216 target = word;
1217 label = 1;
1218 } else {
1219 next_random = next_random
1220 * (unsigned long long) 25214903917 + 11;
1221 if (word_to_group != NULL
1222 && word_to_group[word] != -1) {
1223 target = word;
1224 while (target == word) {
1225 target =
1226 group_to_table[word_to_group[word]
1227 * table_size
1228 + (next_random >> 16)
1229 % table_size];
1230 next_random =
1231 next_random
1232 * (unsigned long long) 25214903917
1233 + 11;
1234 }
1235 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1236 } else {
1237 target = table[(next_random >> 16)
1238 % table_size];
1239 }
1240 if (target == 0)
1241 target = next_random % (vocab_size - 1) + 1;
1242 if (target == word)
1243 continue;
1244 label = 0;
1245 }
1246 l2 = target * layer1_size;
1247 f = 0;
1248 for (c = 0; c < layer1_size; c++)
1249 f += syn0[c + l1] * syn1nce[c + l2];
1250 if (f > MAX_EXP)
1251 g = (label - 1) * alpha;
1252 else if (f < -MAX_EXP)
1253 g = (label - 0) * alpha;
1254 else {
1255 f = exp(f);
1256 g = (label
1257 - f
1258 / (noise_distribution[target]
1259 * nce + f)) * alpha;
1260 }
1261 for (c = 0; c < layer1_size; c++)
1262 neu1e[c] += g * syn1nce[c + l2];
1263 for (c = 0; c < layer1_size; c++)
1264 syn1nce[c + l2] += g * syn0[c + l1];
1265 if (cap == 1)
1266 for (c = 0; c < layer1_size; c++)
1267 capParam(syn1nce, c + l2);
1268 }
1269 // Learn weights input -> hidden
1270 for (c = 0; c < layer1_size; c++)
1271 syn0[c + l1] += neu1e[c];
1272 }
1273 } else if (type == 2) { //train the cwindow architecture
1274 // in -> hidden
1275 cw = 0;
1276 for (a = 0; a < window * 2 + 1; a++)
1277 if (a != window) {
1278 c = sentence_position - window + a;
1279 if (c < 0)
1280 continue;
1281 if (c >= sentence_length)
1282 continue;
1283 last_word = sen[c];
1284 if (last_word == -1)
1285 continue;
1286 window_offset = a * layer1_size;
1287 if (a > window)
1288 window_offset -= layer1_size;
1289 for (c = 0; c < layer1_size; c++)
1290 neu1[c + window_offset] += syn0[c
1291 + last_word * layer1_size];
1292 cw++;
1293 }
1294 if (cw) {
1295 if (hs)
1296 for (d = 0; d < vocab[word].codelen; d++) {
1297 f = 0;
1298 l2 = vocab[word].point[d] * window_layer_size;
1299 // Propagate hidden -> output
1300 for (c = 0; c < window_layer_size; c++)
1301 f += neu1[c] * syn1_window[c + l2];
1302 if (f <= -MAX_EXP)
1303 continue;
1304 else if (f >= MAX_EXP)
1305 continue;
1306 else
1307 f = expTable[(int) ((f + MAX_EXP)
1308 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1309 // 'g' is the gradient multiplied by the learning rate
1310 g = (1 - vocab[word].code[d] - f) * alpha;
1311 // Propagate errors output -> hidden
1312 for (c = 0; c < window_layer_size; c++)
1313 neu1e[c] += g * syn1_window[c + l2];
1314 // Learn weights hidden -> output
1315 for (c = 0; c < window_layer_size; c++)
1316 syn1_window[c + l2] += g * neu1[c];
1317 if (cap == 1)
1318 for (c = 0; c < window_layer_size; c++)
1319 capParam(syn1_window, c + l2);
1320 }
1321 // NEGATIVE SAMPLING
1322 if (negative > 0)
1323 for (d = 0; d < negative + 1; d++) {
1324 if (d == 0) {
1325 target = word;
1326 label = 1;
1327 } else {
1328 next_random = next_random
1329 * (unsigned long long) 25214903917 + 11;
1330 if (word_to_group != NULL
1331 && word_to_group[word] != -1) {
1332 target = word;
1333 while (target == word) {
1334 target = group_to_table[word_to_group[word]
1335 * table_size
1336 + (next_random >> 16) % table_size];
1337 next_random = next_random
1338 * (unsigned long long) 25214903917
1339 + 11;
1340 }
1341 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1342 } else {
1343 target =
1344 table[(next_random >> 16) % table_size];
1345 }
1346 if (target == 0)
1347 target = next_random % (vocab_size - 1) + 1;
1348 if (target == word)
1349 continue;
1350 label = 0;
1351 }
1352 l2 = target * window_layer_size;
1353 f = 0;
1354 for (c = 0; c < window_layer_size; c++)
1355 f += neu1[c] * syn1neg_window[c + l2];
1356 if (f > MAX_EXP)
1357 g = (label - 1) * alpha;
1358 else if (f < -MAX_EXP)
1359 g = (label - 0) * alpha;
1360 else
1361 g = (label
1362 - expTable[(int) ((f + MAX_EXP)
1363 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1364 * alpha;
1365 for (c = 0; c < window_layer_size; c++)
1366 neu1e[c] += g * syn1neg_window[c + l2];
1367 for (c = 0; c < window_layer_size; c++)
1368 syn1neg_window[c + l2] += g * neu1[c];
1369 if (cap == 1)
1370 for (c = 0; c < window_layer_size; c++)
1371 capParam(syn1neg_window, c + l2);
1372 }
1373 // Noise Contrastive Estimation
1374 if (nce > 0)
1375 for (d = 0; d < nce + 1; d++) {
1376 if (d == 0) {
1377 target = word;
1378 label = 1;
1379 } else {
1380 next_random = next_random
1381 * (unsigned long long) 25214903917 + 11;
1382 if (word_to_group != NULL
1383 && word_to_group[word] != -1) {
1384 target = word;
1385 while (target == word) {
1386 target = group_to_table[word_to_group[word]
1387 * table_size
1388 + (next_random >> 16) % table_size];
1389 next_random = next_random
1390 * (unsigned long long) 25214903917
1391 + 11;
1392 }
1393 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1394 } else {
1395 target =
1396 table[(next_random >> 16) % table_size];
1397 }
1398 if (target == 0)
1399 target = next_random % (vocab_size - 1) + 1;
1400 if (target == word)
1401 continue;
1402 label = 0;
1403 }
1404 l2 = target * window_layer_size;
1405 f = 0;
1406 for (c = 0; c < window_layer_size; c++)
1407 f += neu1[c] * syn1nce_window[c + l2];
1408 if (f > MAX_EXP)
1409 g = (label - 1) * alpha;
1410 else if (f < -MAX_EXP)
1411 g = (label - 0) * alpha;
1412 else {
1413 f = exp(f);
1414 g =
1415 (label
1416 - f
1417 / (noise_distribution[target]
1418 * nce + f)) * alpha;
1419 }
1420 for (c = 0; c < window_layer_size; c++)
1421 neu1e[c] += g * syn1nce_window[c + l2];
1422 for (c = 0; c < window_layer_size; c++)
1423 syn1nce_window[c + l2] += g * neu1[c];
1424 if (cap == 1)
1425 for (c = 0; c < window_layer_size; c++)
1426 capParam(syn1nce_window, c + l2);
1427 }
1428 // hidden -> in
1429 for (a = 0; a < window * 2 + 1; a++)
1430 if (a != window) {
1431 c = sentence_position - window + a;
1432 if (c < 0)
1433 continue;
1434 if (c >= sentence_length)
1435 continue;
1436 last_word = sen[c];
1437 if (last_word == -1)
1438 continue;
1439 window_offset = a * layer1_size;
1440 if (a > window)
1441 window_offset -= layer1_size;
1442 for (c = 0; c < layer1_size; c++)
1443 syn0[c + last_word * layer1_size] += neu1e[c
1444 + window_offset];
1445 }
1446 }
1447 } else if (type == 3) { //train structured skip-gram
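// In the structured variant each relative window position keeps its own output
// vectors: syn1neg_window has window_layer_size = layer1_size * window * 2
// columns per word, and window_offset (a * layer1_size, shifted once past the
// center) selects the block belonging to the current position, so word order
// is distinguished.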
1448 for (a = 0; a < window * 2 + 1; a++)
1449 if (a != window) {
1450 c = sentence_position - window + a;
1451 if (c < 0)
1452 continue;
1453 if (c >= sentence_length)
1454 continue;
1455 last_word = sen[c];
1456 if (last_word < 0)
1457 continue;
1458 l1 = last_word * layer1_size;
1459 window_offset = a * layer1_size;
1460 if (a > window)
1461 window_offset -= layer1_size;
1462 for (c = 0; c < layer1_size; c++)
1463 neu1e[c] = 0;
1464 // HIERARCHICAL SOFTMAX
1465 if (hs)
1466 for (d = 0; d < vocab[word].codelen; d++) {
1467 f = 0;
1468 l2 = vocab[word].point[d] * window_layer_size;
1469 // Propagate hidden -> output
1470 for (c = 0; c < layer1_size; c++)
1471 f += syn0[c + l1]
1472 * syn1_window[c + l2 + window_offset];
1473 if (f <= -MAX_EXP)
1474 continue;
1475 else if (f >= MAX_EXP)
1476 continue;
1477 else
1478 f = expTable[(int) ((f + MAX_EXP)
1479 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1480 // 'g' is the gradient multiplied by the learning rate
1481 g = (1 - vocab[word].code[d] - f) * alpha;
1482 // Propagate errors output -> hidden
1483 for (c = 0; c < layer1_size; c++)
1484 neu1e[c] += g
1485 * syn1_window[c + l2 + window_offset];
1486 // Learn weights hidden -> output
1487 for (c = 0; c < layer1_size; c++)
1488 syn1_window[c + l2 + window_offset] += g
1489 * syn0[c + l1];
1490 if (cap == 1)
1491 for (c = 0; c < layer1_size; c++)
1492 capParam(syn1_window, c + l2 + window_offset);
1493 }
1494 // NEGATIVE SAMPLING
1495 if (negative > 0)
1496 for (d = 0; d < negative + 1; d++) {
1497 if (d == 0) {
1498 target = word;
1499 label = 1;
1500 } else {
1501 next_random = next_random
1502 * (unsigned long long) 25214903917 + 11;
1503 if (word_to_group != NULL
1504 && word_to_group[word] != -1) {
1505 target = word;
1506 while (target == word) {
1507 target =
1508 group_to_table[word_to_group[word]
1509 * table_size
1510 + (next_random >> 16)
1511 % table_size];
1512 next_random =
1513 next_random
1514 * (unsigned long long) 25214903917
1515 + 11;
1516 }
1517 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1518 } else {
1519 target = table[(next_random >> 16)
1520 % table_size];
1521 }
1522 if (target == 0)
1523 target = next_random % (vocab_size - 1) + 1;
1524 if (target == word)
1525 continue;
1526 label = 0;
1527 }
1528 l2 = target * window_layer_size;
1529 f = 0;
1530 for (c = 0; c < layer1_size; c++)
1531 f +=
1532 syn0[c + l1]
1533 * syn1neg_window[c + l2
1534 + window_offset];
1535 if (f > MAX_EXP)
1536 g = (label - 1) * alpha;
1537 else if (f < -MAX_EXP)
1538 g = (label - 0) * alpha;
1539 else
1540 g =
1541 (label
1542 - expTable[(int) ((f + MAX_EXP)
1543 * (EXP_TABLE_SIZE
1544 / MAX_EXP / 2))])
1545 * alpha;
1546 if(debug_mode > 2 && ((long long) id) == 0) {
1547 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1548 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1549 }
1550 for (c = 0; c < layer1_size; c++)
1551 neu1e[c] +=
1552 g
1553 * syn1neg_window[c + l2
1554 + window_offset];
1555 for (c = 0; c < layer1_size; c++)
1556 syn1neg_window[c + l2 + window_offset] += g
1557 * syn0[c + l1];
1558 if (cap == 1)
1559 for (c = 0; c < layer1_size; c++)
1560 capParam(syn1neg_window,
1561 c + l2 + window_offset);
1562 }
1563 // Noise Contrastive Estimation
1564 if (nce > 0)
1565 for (d = 0; d < nce + 1; d++) {
1566 if (d == 0) {
1567 target = word;
1568 label = 1;
1569 } else {
1570 next_random = next_random
1571 * (unsigned long long) 25214903917 + 11;
1572 if (word_to_group != NULL
1573 && word_to_group[word] != -1) {
1574 target = word;
1575 while (target == word) {
1576 target =
1577 group_to_table[word_to_group[word]
1578 * table_size
1579 + (next_random >> 16)
1580 % table_size];
1581 next_random =
1582 next_random
1583 * (unsigned long long) 25214903917
1584 + 11;
1585 }
1586 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1587 } else {
1588 target = table[(next_random >> 16)
1589 % table_size];
1590 }
1591 if (target == 0)
1592 target = next_random % (vocab_size - 1) + 1;
1593 if (target == word)
1594 continue;
1595 label = 0;
1596 }
1597 l2 = target * window_layer_size;
1598 f = 0;
1599 for (c = 0; c < layer1_size; c++)
1600 f +=
1601 syn0[c + l1]
1602 * syn1nce_window[c + l2
1603 + window_offset];
1604 if (f > MAX_EXP)
1605 g = (label - 1) * alpha;
1606 else if (f < -MAX_EXP)
1607 g = (label - 0) * alpha;
1608 else {
1609 f = exp(f);
1610 g = (label
1611 - f
1612 / (noise_distribution[target]
1613 * nce + f)) * alpha;
1614 }
1615 for (c = 0; c < layer1_size; c++)
1616 neu1e[c] +=
1617 g
1618 * syn1nce_window[c + l2
1619 + window_offset];
1620 for (c = 0; c < layer1_size; c++)
1621 syn1nce_window[c + l2 + window_offset] += g
1622 * syn0[c + l1];
1623 if (cap == 1)
1624 for (c = 0; c < layer1_size; c++)
1625 capParam(syn1nce_window,
1626 c + l2 + window_offset);
1627 }
1628 // Learn weights input -> hidden
1629 for (c = 0; c < layer1_size; c++) {
1630 syn0[c + l1] += neu1e[c];
1631 if (syn0[c + l1] > 50)
1632 syn0[c + l1] = 50;
1633 if (syn0[c + l1] < -50)
1634 syn0[c + l1] = -50;
1635 }
1636 }
1637 } else if (type == 4) { //training senna
1638 // in -> hidden
1639 cw = 0;
1640 for (a = 0; a < window * 2 + 1; a++)
1641 if (a != window) {
1642 c = sentence_position - window + a;
1643 if (c < 0)
1644 continue;
1645 if (c >= sentence_length)
1646 continue;
1647 last_word = sen[c];
1648 if (last_word == -1)
1649 continue;
1650 window_offset = a * layer1_size;
1651 if (a > window)
1652 window_offset -= layer1_size;
1653 for (c = 0; c < layer1_size; c++)
1654 neu1[c + window_offset] += syn0[c
1655 + last_word * layer1_size];
1656 cw++;
1657 }
1658 if (cw) {
1659 for (a = 0; a < window_hidden_size; a++) {
1660 c = a * window_layer_size;
1661 for (b = 0; b < window_layer_size; b++) {
1662 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1663 }
1664 }
1665 if (hs)
1666 for (d = 0; d < vocab[word].codelen; d++) {
1667 f = 0;
1668 l2 = vocab[word].point[d] * window_hidden_size;
1669 // Propagate hidden -> output
1670 for (c = 0; c < window_hidden_size; c++)
1671 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1672 if (f <= -MAX_EXP)
1673 continue;
1674 else if (f >= MAX_EXP)
1675 continue;
1676 else
1677 f = expTable[(int) ((f + MAX_EXP)
1678 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1679 // 'g' is the gradient multiplied by the learning rate
1680 g = (1 - vocab[word].code[d] - f) * alpha;
1681 // Propagate errors output -> hidden
1682 for (c = 0; c < window_hidden_size; c++)
1683 neu2e[c] += dHardTanh(neu2[c], g) * g
1684 * syn_hidden_word[c + l2];
1685 // Learn weights hidden -> output
1686 for (c = 0; c < window_hidden_size; c++)
1687 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1688 * neu2[c];
1689 }
1690 // NEGATIVE SAMPLING
1691 if (negative > 0)
1692 for (d = 0; d < negative + 1; d++) {
1693 if (d == 0) {
1694 target = word;
1695 label = 1;
1696 } else {
1697 next_random = next_random
1698 * (unsigned long long) 25214903917 + 11;
1699 if (word_to_group != NULL
1700 && word_to_group[word] != -1) {
1701 target = word;
1702 while (target == word) {
1703 target = group_to_table[word_to_group[word]
1704 * table_size
1705 + (next_random >> 16) % table_size];
1706 next_random = next_random
1707 * (unsigned long long) 25214903917
1708 + 11;
1709 }
1710 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1711 } else {
1712 target =
1713 table[(next_random >> 16) % table_size];
1714 }
1715 if (target == 0)
1716 target = next_random % (vocab_size - 1) + 1;
1717 if (target == word)
1718 continue;
1719 label = 0;
1720 }
1721 l2 = target * window_hidden_size;
1722 f = 0;
1723 for (c = 0; c < window_hidden_size; c++)
1724 f += hardTanh(neu2[c])
1725 * syn_hidden_word_neg[c + l2];
1726 if (f > MAX_EXP)
1727 g = (label - 1) * alpha / negative;
1728 else if (f < -MAX_EXP)
1729 g = (label - 0) * alpha / negative;
1730 else
1731 g = (label
1732 - expTable[(int) ((f + MAX_EXP)
1733 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1734 * alpha / negative;
1735 for (c = 0; c < window_hidden_size; c++)
1736 neu2e[c] += dHardTanh(neu2[c], g) * g
1737 * syn_hidden_word_neg[c + l2];
1738 for (c = 0; c < window_hidden_size; c++)
1739 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1740 * g * neu2[c];
1741 }
1742 for (a = 0; a < window_hidden_size; a++)
1743 for (b = 0; b < window_layer_size; b++)
1744 neu1e[b] += neu2e[a]
1745 * syn_window_hidden[a * window_layer_size + b];
1746 for (a = 0; a < window_hidden_size; a++)
1747 for (b = 0; b < window_layer_size; b++)
1748 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1749 * neu1[b];
1750 // hidden -> in
1751 for (a = 0; a < window * 2 + 1; a++)
1752 if (a != window) {
1753 c = sentence_position - window + a;
1754 if (c < 0)
1755 continue;
1756 if (c >= sentence_length)
1757 continue;
1758 last_word = sen[c];
1759 if (last_word == -1)
1760 continue;
1761 window_offset = a * layer1_size;
1762 if (a > window)
1763 window_offset -= layer1_size;
1764 for (c = 0; c < layer1_size; c++)
1765 syn0[c + last_word * layer1_size] += neu1e[c
1766 + window_offset];
1767 }
1768 }
1769 } else if(type == 5) {
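// type 5 does no gradient training at all: for every co-occurrence it only
// calls inc_collocator(cdb, word, last_word, a - window), i.e. it records
// position-aware co-occurrence counts in the external collocator database
// (see collocatordb.h); presumably cdb is opened elsewhere before training.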
1770 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1771 c = sentence_position - window + a;
1772 if (c < 0) continue;
1773 if (c >= sentence_length) continue;
1774 last_word = sen[c];
1775 if (last_word == -1) continue;
1776 inc_collocator(cdb, word, last_word, a - window);
1777 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1778 // cw++;
1779 }
1780 } else {
1781 printf("unknown type %i\n", type);
1782 exit(0);
1783 }
1784 sentence_position++;
1785 if (sentence_position >= sentence_length) {
1786 sentence_length = 0;
1787 continue;
1788 }
1789 }
1790 fclose(fi);
1791 free(neu1);
1792 free(neu1e);
1793 threadPos[(long) id] = -1;
1794 pthread_exit(NULL);
1795}
1796
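// ShowCollocations(): for every word with index >= cc and for every window
// position, print the context word whose position-specific output vector in
// syn1neg_window responds most strongly to the word's input vector (sigmoid of
// the dot product), plus a noisy-or style sum over positions
// (target_sums[t] += (1 - target_sums[t]) * f) and the N = 10 best
// (word, activation, position) triples overall.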
1797void ShowCollocations() {
1798 long a, b, c, d, e, window_offset, target, max_target = 0, maxmax_target;
1799 real f, max_f, maxmax_f;
1800 real *target_sums, bestf[MAX_CC], worstbest;
1801 long besti[MAX_CC];
1802 int N = 10, bestp[MAX_CC];
1803 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1804
1805 for (d = cc; d < vocab_size; d++) {
1806 for (b = 0; b < vocab_size; b++)
1807 target_sums[b] = 0;
1808 for (b = 0; b < N; b++)
1809 bestf[b] = -1;
1810 worstbest = -1;
1811
1812 maxmax_f = -1;
1813 maxmax_target = 0;
1814 for (a = window * 2 + 1; a >=0; a--) {
1815 if (a != window) {
1816 max_f = -1;
1817 window_offset = a * layer1_size;
1818 if (a > window)
1819 window_offset -= layer1_size;
1820 for(target = 0; target < vocab_size; target ++) {
1821 if(target == d)
1822 continue;
1823 f = 0;
1824 for (c = 0; c < layer1_size; c++)
1825 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1826 if (f < -MAX_EXP)
1827 continue;
1828 else if (f > MAX_EXP)
1829 continue;
1830 else
1831 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1832 if(f > max_f) {
1833 max_f = f;
1834 max_target = target;
1835 }
1836 target_sums[target] += (1-target_sums[target]) * f;
1837 if(f > worstbest) {
1838 for (b = 0; b < N; b++) {
1839 if (f > bestf[b]) {
1840 for (e = N - 1; e > b; e--) {
1841 bestf[e] = bestf[e - 1];
1842 besti[e] = besti[e - 1];
1843 bestp[e] = bestp[e - 1];
1844 }
1845 bestf[b] = f;
1846 besti[b] = target;
1847 bestp[b] = window-a;
1848 break;
1849 }
1850 }
1851 worstbest = bestf[N - 1];
1852 }
1853 }
1854 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1855 if (max_f > maxmax_f) {
1856 maxmax_f = max_f;
1857 maxmax_target = max_target;
1858 }
1859 } else {
1860 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1861 }
1862 }
1863 max_f = -1;
1864 for (b = 0; b < vocab_size; b++) {
1865 if (target_sums[b] > max_f) {
1866 max_f = target_sums[b];
1867 max_target = b;
1868 }
1869 }
1870 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1871 vocab[max_target].word, max_f, vocab[maxmax_target].word,
1872 maxmax_f);
1873 for (b = 0; b < N && bestf[b] > -1; b++)
1874 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1875 printf("\n");
1876 }
1877}
1878
1879void TrainModel() {
1880 long a, b, c, d;
1881 FILE *fo;
1882 pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // +1 slot for the monitor thread
1883 threadPos = malloc(num_threads * sizeof(long long));
1884 threadIters = malloc(num_threads * sizeof(int));
1885 char *timebuf = malloc(80);
1886 printf("Starting training using file %s\n", train_file);
1887 starting_alpha = alpha;
1888 if (read_vocab_file[0] != 0)
1889 ReadVocab();
1890 else
1891 LearnVocabFromTrainFile();
1892 if (save_vocab_file[0] != 0)
1893 SaveVocab();
1894 if (output_file[0] == 0)
1895 return;
1896 InitNet();
1897 if (cc > 0)
1898 ShowCollocations();
1899 if (negative > 0 || nce > 0)
1900 InitUnigramTable();
1901 if (negative_classes_file[0] != 0)
1902 InitClassUnigramTable();
1903 start = time(NULL);
1904 start_clock = clock();
1905 for (a = 0; a < num_threads; a++)
1906 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1907 if(debug_mode > 1)
1908 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
1909 for (a = 0; a < num_threads; a++)
1910 pthread_join(pt[a], NULL);
1911 if(debug_mode > 1) {
1912 pthread_join(pt[num_threads], NULL);
1913 clock_t now = time(NULL);
1914 clock_t now_clock = clock();
1915 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
1916 if(type == 5) // don't save vectors for classic collocators
1917 return;
1918 printf("Saving vectors to %s ...", output_file);
1919 fflush(stdout);
1920 }
1921 fo = fopen(output_file, "wb");
1922 if (classes == 0) {
1923 // Save the word vectors
1924 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1925 for (a = 0; a < vocab_size; a++) {
1926 fprintf(fo, "%s ", vocab[a].word);
1927 if (binary)
1928 for (b = 0; b < layer1_size; b++)
1929 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1930 else
1931 for (b = 0; b < layer1_size; b++)
1932 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1933 fprintf(fo, "\n");
1934 }
1935 if(debug_mode > 1)
1936 fprintf(stderr, "\n");
1937 } else {
1938 // Run K-means on the word vectors
1939 int clcn = classes, iter = 10, closeid;
1940 int *centcn = (int *) malloc(classes * sizeof(int));
1941 int *cl = (int *) calloc(vocab_size, sizeof(int));
1942 real closev, x;
1943 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1944 for (a = 0; a < vocab_size; a++)
1945 cl[a] = a % clcn;
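		// K-means iterations: recompute centroids from the current assignments, then reassign words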
1946 for (a = 0; a < iter; a++) {
1947 for (b = 0; b < clcn * layer1_size; b++)
1948 cent[b] = 0;
1949 for (b = 0; b < clcn; b++)
1950 centcn[b] = 1;
1951 for (c = 0; c < vocab_size; c++) {
1952 for (d = 0; d < layer1_size; d++)
1953 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1954 centcn[cl[c]]++;
1955 }
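			// average each centroid and normalize it to unit length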
1956 for (b = 0; b < clcn; b++) {
1957 closev = 0;
1958 for (c = 0; c < layer1_size; c++) {
1959 cent[layer1_size * b + c] /= centcn[b];
1960 closev += cent[layer1_size * b + c]
1961 * cent[layer1_size * b + c];
1962 }
1963 closev = sqrt(closev);
1964 for (c = 0; c < layer1_size; c++)
1965 cent[layer1_size * b + c] /= closev;
1966 }
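			// reassign every word to the centroid with the largest dot product (cosine similarity)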
1967 for (c = 0; c < vocab_size; c++) {
1968 closev = -10;
1969 closeid = 0;
1970 for (d = 0; d < clcn; d++) {
1971 x = 0;
1972 for (b = 0; b < layer1_size; b++)
1973 x += cent[layer1_size * d + b]
1974 * syn0[c * layer1_size + b];
1975 if (x > closev) {
1976 closev = x;
1977 closeid = d;
1978 }
1979 }
1980 cl[c] = closeid;
1981 }
1982 }
1983 // Save the K-means classes
1984 for (a = 0; a < vocab_size; a++)
1985 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1986 free(centcn);
1987 free(cent);
1988 free(cl);
1989 }
1990 fclose(fo);
1991 if (save_net_file[0] != 0)
1992 SaveNet();
1993}
1994
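// Return the position of option str in argv, or -1 if it is absent;
// exit with an error if the option is the last argument and thus has no value.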
1995int ArgPos(char *str, int argc, char **argv) {
1996 int a;
1997 for (a = 1; a < argc; a++)
1998 if (!strcmp(str, argv[a])) {
1999 if (a == argc - 1) {
2000 printf("Argument missing for %s\n", str);
2001 exit(1);
2002 }
2003 return a;
2004 }
2005 return -1;
2006}
2007
2008void print_help() {
2009 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
2010 printf("Options:\n");
2011 printf("Parameters for training:\n");
2012 printf("\t-train <file>\n");
2013 printf("\t\tUse text data from <file> to train the model\n");
2014 printf("\t-output <file>\n");
2015 printf(
2016 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
2017 printf("\t-size <int>\n");
2018 printf("\t\tSet size of word vectors; default is 100\n");
2019 printf("\t-window <int>\n");
2020 printf("\t\tSet max skip length between words; default is 5\n");
2021 printf("\t-sample <float>\n");
2022 printf(
2023 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
2024 printf(
2025 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
2026 printf("\t-hs <int>\n");
2027 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
2028 printf("\t-negative <int>\n");
2029 printf(
2030 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
2031 printf("\t-negative-classes <file>\n");
2032 printf("\t\tNegative classes to sample from\n");
2033 printf("\t-nce <int>\n");
2034 printf(
2035 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
2036 printf("\t-threads <int>\n");
2037 printf("\t\tUse <int> threads (default 12)\n");
2038 printf("\t-iter <int>\n");
2039 printf("\t\tRun more training iterations (default 5)\n");
2040 printf("\t-min-count <int>\n");
2041 printf(
2042 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
2043 printf("\t-alpha <float>\n");
2044 printf(
2045 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
2046 printf("\t-classes <int>\n");
2047 printf(
2048 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
2049 printf("\t-debug <int>\n");
2050 printf(
2051 "\t\tSet the debug mode (default = 2 = more info during training)\n");
2052 printf("\t-binary <int>\n");
2053 printf(
2054			"\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
2055 printf("\t-save-vocab <file>\n");
2056 printf("\t\tThe vocabulary will be saved to <file>\n");
2057 printf("\t-read-vocab <file>\n");
2058 printf(
2059 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
2060 printf("\t-train-counts <int>\n");
2061 printf(
2062 "\t\tUse word counts of actual corpus rather than vocabulary counts; default is 1 (on)\n");
2063 printf("\t-read-net <file>\n");
2064 printf(
2065 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
2066 printf("\t-save-net <file>\n");
2067 printf("\t\tThe net parameters will be saved to <file>\n");
2068 printf("\t-magic-stop-file <file>\n");
2069	printf("\t\tIf the magic file <file> exists, training will stop after the current cycle.\n");
2070 printf("\t-show-cc <int>\n");
2071 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
2072 printf("\t-type <int>\n");
2073 printf(
2074			"\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type, 5 for storing positional bigrams)\n");
2075 printf("\t-cap <int>\n");
2076 printf(
2077			"\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
2078 printf("\nExamples:\n");
2079 printf(
2080 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
2081}
2082
2083int main(int argc, char **argv) {
2084 int i;
2085 setlocale(LC_ALL, "");
2086 if (argc == 1) {
2087 print_help();
2088 return 0;
2089 }
2090 output_file[0] = 0;
2091 save_vocab_file[0] = 0;
2092 read_vocab_file[0] = 0;
2093 save_net_file[0] = 0;
2094 read_net_file[0] = 0;
2095 negative_classes_file[0] = 0;
2096 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2097 print_help();
2098 return(0);
2099 }
2100 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2101 print_help();
2102 return(0);
2103 }
2104 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2105 layer1_size = atoi(argv[i + 1]);
2106 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2107 strcpy(train_file, argv[i + 1]);
2108 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2109 strcpy(save_vocab_file, argv[i + 1]);
2110 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2111 strcpy(read_vocab_file, argv[i + 1]);
2112 if ((i = ArgPos((char *) "-train-counts", argc, argv)) > 0)
2113 tc = atoi(argv[i + 1]);
2114 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2115 strcpy(save_net_file, argv[i + 1]);
2116 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2117 strcpy(read_net_file, argv[i + 1]);
2118 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2119 strcpy(magic_stop_file, argv[i + 1]);
2120 if (access(magic_stop_file, F_OK ) != -1) {
2121 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2122 exit(1);
2123 }
2124 }
2125 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2126 debug_mode = atoi(argv[i + 1]);
2127 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2128 binary = atoi(argv[i + 1]);
2129 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2130 cc = atoi(argv[i + 1]);
2131 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2132 type = atoi(argv[i + 1]);
2133 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2134 strcpy(output_file, argv[i + 1]);
2135 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2136 window = atoi(argv[i + 1]);
2137 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2138 sample = atof(argv[i + 1]);
2139 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2140 hs = atoi(argv[i + 1]);
2141 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2142 negative = atoi(argv[i + 1]);
2143 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2144 strcpy(negative_classes_file, argv[i + 1]);
2145 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2146 nce = atoi(argv[i + 1]);
2147 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2148 num_threads = atoi(argv[i + 1]);
2149 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2150 iter = atoi(argv[i + 1]);
2151 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2152 min_count = atoi(argv[i + 1]);
2153 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2154 classes = atoi(argv[i + 1]);
2155 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2156 cap = atoi(argv[i + 1]);
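	// cbow, cwindow and senna-type models use a larger default learning rate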
2157 if (type == 0 || type == 2 || type == 4)
2158 alpha = 0.05;
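	// type 5 (positional bigrams): disable subsampling and open the collocator database for writing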
2159 if (type==5) {
2160 sample = 0;
2161 cdb = open_collocatordb_for_write(output_file);
2162 }
2163 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2164 alpha = atof(argv[i + 1]);
2165 vocab = (struct vocab_word *) calloc(vocab_max_size,
2166 sizeof(struct vocab_word));
2167 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2168 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2169 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2170 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2171		expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute sigmoid: f(x) = e^x / (e^x + 1)
2172 }
2173 SaveArgs(argc, argv);
2174 TrainModel();
2175 return 0;
2176}
2177