// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
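// Build note (illustrative, not part of the original sources): this file typically compiles with
// something like: gcc word2vec_cngram.c -o word2vec_cngram -O2 -lm -pthread
// The file and binary names are placeholders, but -lm and -pthread are needed for math.h and pthreads.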
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <math.h>
#include <pthread.h>
#include <time.h>   // clock(), clock_t and CLOCKS_PER_SEC, used for progress reporting
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
27const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39struct vocab_word *vocab;
40int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
41int *vocab_hash;
42long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
43long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
44real alpha = 0.025, starting_alpha, sample = 1e-3;
45real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
46clock_t start;
47
48real *syn1_window, *syn1neg_window, *syn1nce_window;
49int w_offset, window_layer_size;
50
51int window_hidden_size = 500;
52real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg, *syn_hidden_word_nce;
53
54int hs = 0, negative = 5;
55const int table_size = 1e8;
56int *table;
57
// contrastive negative sampling
59char negative_classes_file[MAX_STRING];
60int *word_to_group;
61int *group_to_table; //group_size*table_size
62int class_number;
63
64//nce
65real* noise_distribution;
66int nce = 10;
67
68//param caps
69real CAP_VALUE = 50;
70int cap = 0;
71
72// char models
73char boundToken = 'Z';
74char *unkNgramToken = "ZZZ";
75int cngram_size = 6;
76real *syn0_cngram;
77long long cngram_vocab_size = 0;
78struct vocab_word *cngram_vocab;
79int *cngram_vocab_hash;
80long long cngram_vocab_max_size = 1000;
81char extra_vocab_file[MAX_STRING];
82long long maxNgramSize = 1000000;
83
84// Returns hash value of a word
85int GetWordHash(char *word) {
86 unsigned long long a, hash = 0;
87 for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
88 hash = hash % vocab_hash_size;
89 return hash;
90}
91
// Searches the character n-gram vocabulary; returns the n-gram's position, or -1 if it is not found
93int SearchCNgramVocab(char *ngram) {
94 unsigned int hash = GetWordHash(ngram);
95 while (1) {
96 if (cngram_vocab_hash[hash] == -1) return -1;
97 if (!strcmp(ngram, cngram_vocab[cngram_vocab_hash[hash]].word)) return cngram_vocab_hash[hash];
98 hash = (hash + 1) % vocab_hash_size;
99 }
100 return -1;
101}
102
103// char functions
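// Adds the embedding of the character n-gram 'ngram' to 'output'; unknown n-grams fall back to unkNgramToken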
104void ForwardCNgramWordNgram(real *output, char *ngram){
105 long long a;
106 int index = SearchCNgramVocab(ngram);
107 if (index == -1) {index = SearchCNgramVocab(unkNgramToken);}
108 long long startIndex = layer1_size * index;
109 for (a = 0; a < layer1_size; a++){
110 output[a] += syn0_cngram[startIndex + a];
111 }
112}
113
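// Composes a representation of 'word' into 'output' from its character n-grams: every internal
// n-gram of length cngram_size, plus one begin-of-word and one end-of-word n-gram padded with
// boundToken, and finally divides 'output' by the number of contributing n-grams.
// E.g. with cngram_size = 6, "learning" contributes "learni", "earnin", "arning",
// the begin-of-word n-gram "Zlearn" and the end-of-word n-gram "rningZ".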
114void ForwardCNgramWordRepresentation(real *output, char *word){
115 int length = strlen(word);
116 int start;
117 int cur_len;
118 char *ngram;
119 char tmp[cngram_size+1];
120 tmp[cngram_size] = '\0';
121 int ngrams = 0;
122 for(start = 0; start < length-cngram_size+1; start++){
123 ngram = word + start;
124 strncpy(tmp, ngram, cngram_size);
125 ForwardCNgramWordNgram(output, tmp);
126 ngrams++;
127 }
128 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cur_len] = boundToken;
129 strncpy(tmp+1, word, cur_len);
130 ForwardCNgramWordNgram(output, tmp);
131 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cngram_size-cur_len-1] = boundToken;
132 cur_len = cngram_size - 1;
133 if(length < cur_len){
134 cur_len = length;
135 }
136 ngram = word + length - cur_len;
137 strncpy(tmp, ngram, cur_len);
138 tmp[cur_len] = 'Z';
139 tmp[cur_len + 1] = '\0';
140 ForwardCNgramWordNgram(output, tmp);
141 for(start = 0; start < layer1_size; start++){
142 output[start] /= ngrams+2;
143 }
144}
145
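// Adds the error vector 'output_err' to the embedding of a single character n-gram (unknown n-grams fall back to unkNgramToken)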
146void BackwardCNgramWordNgram(real *output, char *ngram, real *output_err){
147 long long a;
148 int index = SearchCNgramVocab(ngram);
149 if (index == -1) index = SearchCNgramVocab(unkNgramToken);
150 long long startIndex = layer1_size * index;
151 for (a = 0; a < layer1_size; a++){
152 syn0_cngram[startIndex + a] += output_err[a];
153 }
154}
155
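// Adds the error vector 'output_err' to the embeddings of all character n-grams of 'word',
// using the same n-gram decomposition as ForwardCNgramWordRepresentation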
156void BackwardCNgramWordRepresentation(real *output, char *word, real *output_err){
157 int length = strlen(word);
158 int start;
159 int cur_len;
160 char *ngram;
161 char tmp[cngram_size+1];
162 tmp[cngram_size] = '\0';
163 for(start = 0; start < length-cngram_size+1; start++){
164 ngram = word + start;
165 strncpy(tmp, ngram, cngram_size);
166 BackwardCNgramWordNgram(output, tmp, output_err);
167 }
168 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cur_len] = boundToken;
169 strncpy(tmp+1, word, cur_len);
170 BackwardCNgramWordNgram(output, tmp, output_err);
171 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cngram_size-cur_len-1] = boundToken;
172 cur_len = cngram_size - 1;
173 if(length < cur_len){
174 cur_len = length;
175 }
176 ngram = word + length - cur_len;
177 strncpy(tmp, ngram, cur_len);
178 tmp[cur_len] = 'Z';
179 tmp[cur_len + 1] = '\0';
180 BackwardCNgramWordNgram(output, tmp, output_err);
181}
182
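// Adds a character n-gram to the n-gram vocabulary, or increments its count if it is already present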
183void AddWordNgramToVocab(char *ngram, int count){
184 int index = SearchCNgramVocab(ngram);
185 if(index != -1){
186 cngram_vocab[index].cn+=count;
187 return;
188 }
189 unsigned int hash, length = strlen(ngram) + 1;
190 if (length > MAX_STRING) length = MAX_STRING;
191 cngram_vocab[cngram_vocab_size].word = (char *)calloc(length, sizeof(char));
192 strcpy(cngram_vocab[cngram_vocab_size].word, ngram);
193 cngram_vocab[cngram_vocab_size].cn = count;
194 cngram_vocab_size++;
195 // Reallocate memory if needed
196 if (cngram_vocab_size + 2 >= cngram_vocab_max_size) {
197 cngram_vocab_max_size += 1000;
198 cngram_vocab = (struct vocab_word *)realloc(cngram_vocab, cngram_vocab_max_size * sizeof(struct vocab_word));
199 }
200 hash = GetWordHash(ngram);
201 while (cngram_vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
202 cngram_vocab_hash[hash] = cngram_vocab_size - 1;
203}
204
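// Registers all internal, begin-of-word and end-of-word character n-grams of 'word' in the n-gram vocabulary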
205void AddAllWordNgramToVocab(char *word, int count){
206 int length = strlen(word);
207 int start;
208 int cur_len;
209 char *ngram;
210 char tmp[cngram_size+1];
211 tmp[cngram_size] = '\0';
212 for(start = 0; start < length-cngram_size+1; start++){
213 ngram = word + start;
214 strncpy(tmp, ngram, cngram_size);
215 AddWordNgramToVocab(tmp, count);
216 }
217 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cur_len] = boundToken;
218 strncpy(tmp+1, word, cur_len);
219 AddWordNgramToVocab(tmp, count);
220 for(cur_len = 0; cur_len < cngram_size-1; cur_len++) tmp[cngram_size-cur_len-1] = boundToken;
221 cur_len = cngram_size - 1;
222 if(length < cur_len){
223 cur_len = length;
224 }
225 ngram = word + length - cur_len;
226 strncpy(tmp, ngram, cur_len);
227 tmp[cur_len] = 'Z';
228 tmp[cur_len + 1] = '\0';
229 AddWordNgramToVocab(tmp, count);
230}
231
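// Clips a single parameter to the range [-CAP_VALUE, CAP_VALUE]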
232void capParam(real* array, int index){
233 if(array[index] > CAP_VALUE)
234 array[index] = CAP_VALUE;
235 else if(array[index] < -CAP_VALUE)
236 array[index] = -CAP_VALUE;
237}
238
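// Hard tanh activation used by the senna-style architecture (type 4)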
239real hardTanh(real x){
240 if(x>=1){
241 return 1;
242 }
243 else if(x<=-1){
244 return -1;
245 }
246 else{
247 return x;
248 }
249}
250
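// Gradient gate for hardTanh: 0 when the unit is saturated in the direction of the gradient, 1 otherwise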
251real dHardTanh(real x, real g){
252 if(x > 1 && g > 0){
253 return 0;
254 }
255 if(x < -1 && g < 0){
256 return 0;
257 }
258 return 1;
259}
260
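// Builds the unigram table used for negative sampling (word counts raised to the power 0.75)
// and the noise distribution used for NCE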
261void InitUnigramTable() {
262 int a, i;
263 long long train_words_pow = 0;
264 real d1, power = 0.75;
265 table = (int *)malloc(table_size * sizeof(int));
266 for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
267 i = 0;
268 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
269 for (a = 0; a < table_size; a++) {
270 table[a] = i;
271 if (a / (real)table_size > d1) {
272 i++;
273 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
274 }
275 if (i >= vocab_size) i = vocab_size - 1;
276 }
277
278 noise_distribution = (real *)calloc(vocab_size, sizeof(real));
279 for (a = 0; a < vocab_size; a++) noise_distribution[a] = pow(vocab[a].cn, power)/(real)train_words_pow;
280}
281
282// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
283void ReadWord(char *word, FILE *fin) {
284 int a = 0, ch;
285 while (!feof(fin)) {
286 ch = fgetc(fin);
287 if (ch == 13) continue;
288 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
289 if (a > 0) {
290 if (ch == '\n') ungetc(ch, fin);
291 break;
292 }
293 if (ch == '\n') {
294 strcpy(word, (char *)"</s>");
295 return;
296 } else continue;
297 }
298 word[a] = ch;
299 a++;
300 if (a >= MAX_STRING - 1) a--; // Truncate too long words
301 }
302 word[a] = 0;
303}
304
305// Returns position of a word in the vocabulary; if the word is not found, returns -1
306int SearchVocab(char *word) {
307 unsigned int hash = GetWordHash(word);
308 while (1) {
309 if (vocab_hash[hash] == -1) return -1;
310 if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
311 hash = (hash + 1) % vocab_hash_size;
312 }
313 return -1;
314}
315
316// Reads a word and returns its index in the vocabulary
317int ReadWordIndex(FILE *fin) {
318 char word[MAX_STRING];
319 ReadWord(word, fin);
320 if (feof(fin)) return -1;
321 return SearchVocab(word);
322}
323
324// Adds a word to the vocabulary
325int AddWordToVocab(char *word) {
326 unsigned int hash, length = strlen(word) + 1;
327 if (length > MAX_STRING) length = MAX_STRING;
328 vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
329 strcpy(vocab[vocab_size].word, word);
330 vocab[vocab_size].cn = 0;
331 vocab_size++;
332 // Reallocate memory if needed
333 if (vocab_size + 2 >= vocab_max_size) {
334 vocab_max_size += 1000;
335 vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
336 }
337 hash = GetWordHash(word);
338 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
339 vocab_hash[hash] = vocab_size - 1;
340 return vocab_size - 1;
341}
342
343// Used later for sorting by word counts
344int VocabCompare(const void *a, const void *b) {
345 return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
346}
347
348// Sorts the vocabulary by frequency using word counts
349void SortVocab() {
350 int a, size;
351 unsigned int hash;
352 // Sort the vocabulary and keep </s> at the first position
353 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
354 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
355 size = vocab_size;
356 train_words = 0;
357 for (a = 0; a < size; a++) {
    // Words occurring less than min_count times will be discarded from the vocab
359 if ((vocab[a].cn < min_count) && (a != 0)) {
360 vocab_size--;
361 free(vocab[a].word);
362 } else {
      // Hash will be re-computed, as it is no longer valid after the sorting
364 hash=GetWordHash(vocab[a].word);
365 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
366 vocab_hash[hash] = a;
367 train_words += vocab[a].cn;
368 }
369 }
370 vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
371 // Allocate memory for the binary tree construction
372 for (a = 0; a < vocab_size; a++) {
373 vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
374 vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
375 }
376}
377
378// Reduces the vocabulary by removing infrequent tokens
379void ReduceVocab() {
380 int a, b = 0;
381 unsigned int hash;
382 for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
383 vocab[b].cn = vocab[a].cn;
384 vocab[b].word = vocab[a].word;
385 b++;
386 } else free(vocab[a].word);
387 vocab_size = b;
388 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
389 for (a = 0; a < vocab_size; a++) {
    // Hash will be re-computed, as it is no longer valid
391 hash = GetWordHash(vocab[a].word);
392 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
393 vocab_hash[hash] = a;
394 }
395 fflush(stdout);
396 min_reduce++;
397}
398
399// Create binary Huffman tree using the word counts
// Frequent words will have short unique binary codes
401void CreateBinaryTree() {
402 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
403 char code[MAX_CODE_LENGTH];
404 long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
405 long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
406 long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
407 for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
408 for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
409 pos1 = vocab_size - 1;
410 pos2 = vocab_size;
411 // Following algorithm constructs the Huffman tree by adding one node at a time
412 for (a = 0; a < vocab_size - 1; a++) {
413 // First, find two smallest nodes 'min1, min2'
414 if (pos1 >= 0) {
415 if (count[pos1] < count[pos2]) {
416 min1i = pos1;
417 pos1--;
418 } else {
419 min1i = pos2;
420 pos2++;
421 }
422 } else {
423 min1i = pos2;
424 pos2++;
425 }
426 if (pos1 >= 0) {
427 if (count[pos1] < count[pos2]) {
428 min2i = pos1;
429 pos1--;
430 } else {
431 min2i = pos2;
432 pos2++;
433 }
434 } else {
435 min2i = pos2;
436 pos2++;
437 }
438 count[vocab_size + a] = count[min1i] + count[min2i];
439 parent_node[min1i] = vocab_size + a;
440 parent_node[min2i] = vocab_size + a;
441 binary[min2i] = 1;
442 }
443 // Now assign binary code to each vocabulary word
444 for (a = 0; a < vocab_size; a++) {
445 b = a;
446 i = 0;
447 while (1) {
448 code[i] = binary[b];
449 point[i] = b;
450 i++;
451 b = parent_node[b];
452 if (b == vocab_size * 2 - 2) break;
453 }
454 vocab[a].codelen = i;
455 vocab[a].point[0] = vocab_size - 2;
456 for (b = 0; b < i; b++) {
457 vocab[a].code[i - b - 1] = code[b];
458 vocab[a].point[i - b] = point[b] - vocab_size;
459 }
460 }
461 free(count);
462 free(binary);
463 free(parent_node);
464}
465
466void LearnVocabFromTrainFile() {
467 char word[MAX_STRING];
468 FILE *fin;
469 long long a, i;
470 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
471 for (a = 0; a < vocab_hash_size; a++) cngram_vocab_hash[a] = -1;
472 fin = fopen(train_file, "rb");
473 if (fin == NULL) {
474 printf("ERROR: training data file not found!\n");
475 exit(1);
476 }
477 vocab_size = 0;
478 AddWordToVocab((char *)"</s>");
479 AddWordNgramToVocab(unkNgramToken,1000000);
480 while (1) {
481 ReadWord(word, fin);
482 if (feof(fin)) break;
483 train_words++;
484 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
485 printf("%lldK%c", train_words / 1000, 13);
486 fflush(stdout);
487 }
488 i = SearchVocab(word);
489 if (i == -1) {
490 a = AddWordToVocab(word);
491 vocab[a].cn = 1;
492 } else vocab[i].cn++;
493 if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
494 }
495 SortVocab();
496 for (a = 0; a < vocab_size; a++){
497 AddAllWordNgramToVocab(vocab[a].word, vocab[a].cn);
498 }
499 if (debug_mode > 0) {
500 printf("Vocab size: %lld\n", vocab_size);
501 printf("Ngrams size: %lld\n", cngram_vocab_size);
502 printf("Words in train file: %lld\n", train_words);
503 }
504 file_size = ftell(fin);
505 fclose(fin);
506}
507
508void SaveVocab() {
509 long long i;
510 FILE *fo = fopen(save_vocab_file, "wb");
511 for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
512 fclose(fo);
513}
514
515void ReadVocab() {
516 long long a, i = 0;
517 char c;
518 char word[MAX_STRING];
519 FILE *fin = fopen(read_vocab_file, "rb");
520 if (fin == NULL) {
521 printf("Vocabulary file not found\n");
522 exit(1);
523 }
524 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
525 vocab_size = 0;
526 while (1) {
527 ReadWord(word, fin);
528 if (feof(fin)) break;
529 a = AddWordToVocab(word);
530 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
531 i++;
532 }
533 SortVocab();
534 if (debug_mode > 0) {
535 printf("Vocab size: %lld\n", vocab_size);
536 printf("Words in train file: %lld\n", train_words);
537 }
538 fin = fopen(train_file, "rb");
539 if (fin == NULL) {
540 printf("ERROR: training data file not found!\n");
541 exit(1);
542 }
543 fseek(fin, 0, SEEK_END);
544 file_size = ftell(fin);
545 fclose(fin);
546}
547
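// Reads the word -> class mapping from negative_classes_file and builds one unigram sampling table per class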
548void InitClassUnigramTable() {
549 long long a,c;
550 printf("loading class unigrams \n");
551 FILE *fin = fopen(negative_classes_file, "rb");
552 if (fin == NULL) {
553 printf("ERROR: class file not found!\n");
554 exit(1);
555 }
556 word_to_group = (int *)malloc(vocab_size * sizeof(int));
557 for(a = 0; a < vocab_size; a++) word_to_group[a] = -1;
558 char class[MAX_STRING];
559 char prev_class[MAX_STRING];
560 prev_class[0] = 0;
561 char word[MAX_STRING];
562 class_number = -1;
563 while (1) {
564 if (feof(fin)) break;
565 ReadWord(class, fin);
566 ReadWord(word, fin);
567 int word_index = SearchVocab(word);
568 if (word_index != -1){
569 if(strcmp(class, prev_class) != 0){
570 class_number++;
571 strcpy(prev_class, class);
572 }
573 word_to_group[word_index] = class_number;
574 }
575 ReadWord(word, fin);
576 }
577 class_number++;
578 fclose(fin);
579
580 group_to_table = (int *)malloc(table_size * class_number * sizeof(int));
581 long long train_words_pow = 0;
582 real d1, power = 0.75;
583
584 for(c = 0; c < class_number; c++){
585 long long offset = c * table_size;
586 train_words_pow = 0;
587 for (a = 0; a < vocab_size; a++) if(word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
588 int i = 0;
    while (i < vocab_size && word_to_group[i] != c) i++; // check the bound before reading word_to_group[i]
590 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
591 for (a = 0; a < table_size; a++) {
592 //printf("index %lld , word %d\n", a, i);
593 group_to_table[offset + a] = i;
594 if (a / (real)table_size > d1) {
595 i++;
        while (i < vocab_size && word_to_group[i] != c) i++;
597 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
598 }
      if (i >= vocab_size) { // clamp back to the last word of this class instead of reading past the array
        i = vocab_size - 1;
        while (i >= 0 && word_to_group[i] != c) i--;
      }
600 }
601 }
602}
603
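// Allocates the word and character n-gram embedding matrices and the output layers used by
// hierarchical softmax, negative sampling and NCE, initializes them, and builds the Huffman tree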
604void InitNet() {
605 long long a, b;
606 unsigned long long next_random = 1;
607 window_layer_size = layer1_size*window*2;
608 a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
609 if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
  a = posix_memalign((void **)&syn0_cngram, 128, (long long)cngram_vocab_size * layer1_size * sizeof(real)); // sized by the n-gram vocabulary, which this matrix is indexed by
611 if (syn0_cngram == NULL) {printf("Memory allocation failed\n"); exit(1);}
612
613 if (hs) {
614 a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
615 if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
616 a = posix_memalign((void **)&syn1_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
617 if (syn1_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
618 a = posix_memalign((void **)&syn_hidden_word, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
619 if (syn_hidden_word == NULL) {printf("Memory allocation failed\n"); exit(1);}
620
621 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
622 syn1[a * layer1_size + b] = 0;
623 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
624 syn1_window[a * window_layer_size + b] = 0;
625 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
626 syn_hidden_word[a * window_hidden_size + b] = 0;
627 }
628 if (negative>0) {
629 a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
630 if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
631 a = posix_memalign((void **)&syn1neg_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
632 if (syn1neg_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
633 a = posix_memalign((void **)&syn_hidden_word_neg, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
634 if (syn_hidden_word_neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
635
636 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
637 syn1neg[a * layer1_size + b] = 0;
638 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
639 syn1neg_window[a * window_layer_size + b] = 0;
640 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
641 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
642 }
643 if (nce>0) {
644 a = posix_memalign((void **)&syn1nce, 128, (long long)vocab_size * layer1_size * sizeof(real));
645 if (syn1nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
646 a = posix_memalign((void **)&syn1nce_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
647 if (syn1nce_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
648 a = posix_memalign((void **)&syn_hidden_word_nce, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
649 if (syn_hidden_word_nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
650
651 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
652 syn1nce[a * layer1_size + b] = 0;
653 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
654 syn1nce_window[a * window_layer_size + b] = 0;
655 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
656 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
657 }
658 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
659 next_random = next_random * (unsigned long long)25214903917 + 11;
660 syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
661 }
662
663 for (a = 0; a < cngram_vocab_size; a++) for (b = 0; b < layer1_size; b++){
664 next_random = next_random * (unsigned long long)25214903917 + 11;
665 syn0_cngram[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
666 }
667
668 a = posix_memalign((void **)&syn_window_hidden, 128, window_hidden_size * window_layer_size * sizeof(real));
669 if (syn_window_hidden == NULL) {printf("Memory allocation failed\n"); exit(1);}
670 for (a = 0; a < window_hidden_size * window_layer_size; a++){
671 next_random = next_random * (unsigned long long)25214903917 + 11;
672 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (window_hidden_size*window_layer_size);
673 }
674
675 CreateBinaryTree();
676}
677
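// One training thread: reads its slice of the training file, builds subsampled sentences and
// trains the architecture selected by 'type' (0 cbow, 1 skip-gram, 2 cwindow, 3 structured
// skip-gram, 4 senna); for types 0-3 context words are composed from their character n-grams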
678void *TrainModelThread(void *id) {
679 long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
680 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
681 long long l1, l2, c, target, label, local_iter = iter;
682 unsigned long long next_random = (long long)id;
683 real f, g;
684 clock_t now;
685 int input_len_1 = layer1_size;
686 int window_offset = -1;
687 if(type == 2 || type == 4){
688 input_len_1=window_layer_size;
689 }
690 real *neu1 = (real *)calloc(input_len_1, sizeof(real));
691 real *neu1e = (real *)calloc(input_len_1, sizeof(real));
692
693 int input_len_2 = 0;
694 if(type == 4){
695 input_len_2 = window_hidden_size;
696 }
697 real *neu2 = (real *)calloc(input_len_2, sizeof(real));
698 real *neu2e = (real *)calloc(input_len_2, sizeof(real));
699
700 FILE *fi = fopen(train_file, "rb");
701 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
702 while (1) {
703 if (word_count - last_word_count > 10000) {
704 word_count_actual += word_count - last_word_count;
705 last_word_count = word_count;
706 if ((debug_mode > 1)) {
707 now=clock();
708 printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
709 word_count_actual / (real)(iter * train_words + 1) * 100,
710 word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
711 fflush(stdout);
712 }
713 alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
714 if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
715 }
716 if (sentence_length == 0) {
717 while (1) {
718 word = ReadWordIndex(fi);
719 if (feof(fi)) break;
720 if (word == -1) continue;
721 word_count++;
722 if (word == 0) break;
723 // The subsampling randomly discards frequent words while keeping the ranking same
724 if (sample > 0) {
725 real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
726 next_random = next_random * (unsigned long long)25214903917 + 11;
727 if (ran < (next_random & 0xFFFF) / (real)65536) continue;
728 }
729 sen[sentence_length] = word;
730 sentence_length++;
731 if (sentence_length >= MAX_SENTENCE_LENGTH) break;
732 }
733 sentence_position = 0;
734 }
735 if (feof(fi) || (word_count > train_words / num_threads)) {
736 word_count_actual += word_count - last_word_count;
737 local_iter--;
738 if (local_iter == 0) break;
739 word_count = 0;
740 last_word_count = 0;
741 sentence_length = 0;
742 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
743 continue;
744 }
745 word = sen[sentence_position];
746 if (word == -1) continue;
747 for (c = 0; c < input_len_1; c++) neu1[c] = 0;
748 for (c = 0; c < input_len_1; c++) neu1e[c] = 0;
749 for (c = 0; c < input_len_2; c++) neu2[c] = 0;
750 for (c = 0; c < input_len_2; c++) neu2e[c] = 0;
751 next_random = next_random * (unsigned long long)25214903917 + 11;
752 b = next_random % window;
753 if (type == 0) { //train the cbow architecture
754 // in -> hidden
755 cw = 0;
756 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
757 c = sentence_position - window + a;
758 if (c < 0) continue;
759 if (c >= sentence_length) continue;
760 last_word = sen[c];
761 if (last_word == -1) continue;
762 ForwardCNgramWordRepresentation(neu1, vocab[last_word].word);
763 cw++;
764 }
765 if (cw) {
766 for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
767 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
768 f = 0;
769 l2 = vocab[word].point[d] * layer1_size;
770 // Propagate hidden -> output
771 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
772 if (f <= -MAX_EXP) continue;
773 else if (f >= MAX_EXP) continue;
774 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
775 // 'g' is the gradient multiplied by the learning rate
776 g = (1 - vocab[word].code[d] - f) * alpha;
777 // Propagate errors output -> hidden
778 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
779 // Learn weights hidden -> output
780 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
781 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
782 }
783 // NEGATIVE SAMPLING
784 if (negative > 0) for (d = 0; d < negative + 1; d++) {
785 if (d == 0) {
786 target = word;
787 label = 1;
788 } else {
789 next_random = next_random * (unsigned long long)25214903917 + 11;
790 if(word_to_group != NULL && word_to_group[word] != -1){
791 target = word;
792 while(target == word) {
793 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
794 next_random = next_random * (unsigned long long)25214903917 + 11;
795 }
796 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
797 }
798 else{
799 target = table[(next_random >> 16) % table_size];
800 }
801 if (target == 0) target = next_random % (vocab_size - 1) + 1;
802 if (target == word) continue;
803 label = 0;
804 }
805 l2 = target * layer1_size;
806 f = 0;
807 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
808 if (f > MAX_EXP) g = (label - 1) * alpha;
809 else if (f < -MAX_EXP) g = (label - 0) * alpha;
810 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
811 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
812 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
813 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
814 }
815 // Noise Contrastive Estimation
816 if (nce > 0) for (d = 0; d < nce + 1; d++) {
817 if (d == 0) {
818 target = word;
819 label = 1;
820 } else {
821 next_random = next_random * (unsigned long long)25214903917 + 11;
822 if(word_to_group != NULL && word_to_group[word] != -1){
823 target = word;
824 while(target == word) {
825 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
826 next_random = next_random * (unsigned long long)25214903917 + 11;
827 }
828 }
829 else{
830 target = table[(next_random >> 16) % table_size];
831 }
832 if (target == 0) target = next_random % (vocab_size - 1) + 1;
833 if (target == word) continue;
834 label = 0;
835 }
836 l2 = target * layer1_size;
837 f = 0;
838
839 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
840 if (f > MAX_EXP) g = (label - 1) * alpha;
841 else if (f < -MAX_EXP) g = (label - 0) * alpha;
842 else {
843 f = exp(f);
844 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
845 }
846 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
847 for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
848 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce,c + l2);
849 }
850 // hidden -> in
851 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
852 c = sentence_position - window + a;
853 if (c < 0) continue;
854 if (c >= sentence_length) continue;
855 last_word = sen[c];
856 if (last_word == -1) continue;
857 BackwardCNgramWordRepresentation(neu1, vocab[last_word].word, neu1e);
858 }
859 }
860 } else if(type==1) { //train skip-gram
861 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
862 c = sentence_position - window + a;
863 if (c < 0) continue;
864 if (c >= sentence_length) continue;
865 last_word = sen[c];
866 if (last_word == -1) continue;
867 l1 = last_word * layer1_size;
868 ForwardCNgramWordRepresentation(neu1, vocab[last_word].word);
869 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
870 // HIERARCHICAL SOFTMAX
871 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
872 f = 0;
873 l2 = vocab[word].point[d] * layer1_size;
874 // Propagate hidden -> output
875 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
876 if (f <= -MAX_EXP) continue;
877 else if (f >= MAX_EXP) continue;
878 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
879 // 'g' is the gradient multiplied by the learning rate
880 g = (1 - vocab[word].code[d] - f) * alpha;
881 // Propagate errors output -> hidden
882 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
883 // Learn weights hidden -> output
884 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
885 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
886 }
887 // NEGATIVE SAMPLING
888 if (negative > 0) for (d = 0; d < negative + 1; d++) {
889 if (d == 0) {
890 target = word;
891 label = 1;
892 } else {
893 next_random = next_random * (unsigned long long)25214903917 + 11;
894 if(word_to_group != NULL && word_to_group[word] != -1){
895 target = word;
896 while(target == word) {
897 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
898 next_random = next_random * (unsigned long long)25214903917 + 11;
899 }
900 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
901 }
902 else{
903 target = table[(next_random >> 16) % table_size];
904 }
905 if (target == 0) target = next_random % (vocab_size - 1) + 1;
906 if (target == word) continue;
907 label = 0;
908 }
909 l2 = target * layer1_size;
910 f = 0;
911 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
912 if (f > MAX_EXP) g = (label - 1) * alpha;
913 else if (f < -MAX_EXP) g = (label - 0) * alpha;
914 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
915 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
916 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
917 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
918 }
        // Noise Contrastive Estimation
920 if (nce > 0) for (d = 0; d < nce + 1; d++) {
921 if (d == 0) {
922 target = word;
923 label = 1;
924 } else {
925 next_random = next_random * (unsigned long long)25214903917 + 11;
926 if(word_to_group != NULL && word_to_group[word] != -1){
927 target = word;
928 while(target == word) {
929 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
930 next_random = next_random * (unsigned long long)25214903917 + 11;
931 }
932 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
933 }
934 else{
935 target = table[(next_random >> 16) % table_size];
936 }
937 if (target == 0) target = next_random % (vocab_size - 1) + 1;
938 if (target == word) continue;
939 label = 0;
940 }
941 l2 = target * layer1_size;
942 f = 0;
943 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
944 if (f > MAX_EXP) g = (label - 1) * alpha;
945 else if (f < -MAX_EXP) g = (label - 0) * alpha;
946 else {
947 f = exp(f);
948 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
949 }
950 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
951 for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
952 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
953 }
954 // Learn weights input -> hidden
955 BackwardCNgramWordRepresentation(neu1, vocab[last_word].word, neu1e);
956 }
957 }
958 else if(type == 2){ //train the cwindow architecture
959 // in -> hidden
960 cw = 0;
961 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
962 c = sentence_position - window + a;
963 if (c < 0) continue;
964 if (c >= sentence_length) continue;
965 last_word = sen[c];
966 if (last_word == -1) continue;
967 window_offset = a*layer1_size;
968 if (a > window) window_offset-=layer1_size;
969 ForwardCNgramWordRepresentation(&neu1[window_offset], vocab[last_word].word);
970 cw++;
971 }
972 if (cw) {
973 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
974 f = 0;
975 l2 = vocab[word].point[d] * window_layer_size;
976 // Propagate hidden -> output
977 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1_window[c + l2];
978 if (f <= -MAX_EXP) continue;
979 else if (f >= MAX_EXP) continue;
980 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
981 // 'g' is the gradient multiplied by the learning rate
982 g = (1 - vocab[word].code[d] - f) * alpha;
983 // Propagate errors output -> hidden
984 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1_window[c + l2];
985 // Learn weights hidden -> output
986 for (c = 0; c < window_layer_size; c++) syn1_window[c + l2] += g * neu1[c];
987 if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1_window, c + l2);
988 }
989 // NEGATIVE SAMPLING
990 if (negative > 0) for (d = 0; d < negative + 1; d++) {
991 if (d == 0) {
992 target = word;
993 label = 1;
994 } else {
995 next_random = next_random * (unsigned long long)25214903917 + 11;
996 if(word_to_group != NULL && word_to_group[word] != -1){
997 target = word;
998 while(target == word) {
999 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1000 next_random = next_random * (unsigned long long)25214903917 + 11;
1001 }
1002 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1003 }
1004 else{
1005 target = table[(next_random >> 16) % table_size];
1006 }
1007 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1008 if (target == word) continue;
1009 label = 0;
1010 }
1011 l2 = target * window_layer_size;
1012 f = 0;
1013 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1neg_window[c + l2];
1014 if (f > MAX_EXP) g = (label - 1) * alpha;
1015 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1016 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
1017 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1neg_window[c + l2];
1018 for (c = 0; c < window_layer_size; c++) syn1neg_window[c + l2] += g * neu1[c];
1019 if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1neg_window, c + l2);
1020 }
1021 // Noise Contrastive Estimation
1022 if (nce > 0) for (d = 0; d < nce + 1; d++) {
1023 if (d == 0) {
1024 target = word;
1025 label = 1;
1026 } else {
1027 next_random = next_random * (unsigned long long)25214903917 + 11;
1028 if(word_to_group != NULL && word_to_group[word] != -1){
1029 target = word;
1030 while(target == word) {
1031 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1032 next_random = next_random * (unsigned long long)25214903917 + 11;
1033 }
1034 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1035 }
1036 else{
1037 target = table[(next_random >> 16) % table_size];
1038 }
1039 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1040 if (target == word) continue;
1041 label = 0;
1042 }
1043 l2 = target * window_layer_size;
1044 f = 0;
1045 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1nce_window[c + l2];
1046 if (f > MAX_EXP) g = (label - 1) * alpha;
1047 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1048 else {
1049 f = exp(f);
1050 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
1051 }
1052 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1nce_window[c + l2];
1053 for (c = 0; c < window_layer_size; c++) syn1nce_window[c + l2] += g * neu1[c];
1054 if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1nce_window, c + l2);
1055 }
1056 // hidden -> in
1057 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1058 c = sentence_position - window + a;
1059 if (c < 0) continue;
1060 if (c >= sentence_length) continue;
1061 last_word = sen[c];
1062 if (last_word == -1) continue;
1063 window_offset = a * layer1_size;
1064 if(a > window) window_offset -= layer1_size;
1065 BackwardCNgramWordRepresentation(&neu1[window_offset], vocab[last_word].word, &neu1e[window_offset]);
1066 }
1067 }
1068 }
1069 else if (type == 3){ //train structured skip-gram
1070 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1071 c = sentence_position - window + a;
1072 if (c < 0) continue;
1073 if (c >= sentence_length) continue;
1074 last_word = sen[c];
1075 if (last_word == -1) continue;
1076 l1 = last_word * layer1_size;
1077 window_offset = a * layer1_size;
1078 if(a > window) window_offset -= layer1_size;
1079 ForwardCNgramWordRepresentation(neu1, vocab[last_word].word);
1080 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
1081 // HIERARCHICAL SOFTMAX
1082 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1083 f = 0;
1084 l2 = vocab[word].point[d] * window_layer_size;
1085 // Propagate hidden -> output
1086 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1_window[c + l2 + window_offset];
1087 if (f <= -MAX_EXP) continue;
1088 else if (f >= MAX_EXP) continue;
1089 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1090 // 'g' is the gradient multiplied by the learning rate
1091 g = (1 - vocab[word].code[d] - f) * alpha;
1092 // Propagate errors output -> hidden
1093 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1_window[c + l2 + window_offset];
1094 // Learn weights hidden -> output
          for (c = 0; c < layer1_size; c++) syn1_window[c + l2 + window_offset] += g * neu1[c]; // update syn1_window (not syn1), matching the forward pass above
          if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1_window, c + l2 + window_offset);
1097 }
1098 // NEGATIVE SAMPLING
1099 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1100 if (d == 0) {
1101 target = word;
1102 label = 1;
1103 } else {
1104 next_random = next_random * (unsigned long long)25214903917 + 11;
1105 if(word_to_group != NULL && word_to_group[word] != -1){
1106 target = word;
1107 while(target == word) {
1108 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1109 next_random = next_random * (unsigned long long)25214903917 + 11;
1110 }
1111 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1112 }
1113 else{
1114 target = table[(next_random >> 16) % table_size];
1115 }
1116 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1117 if (target == word) continue;
1118 label = 0;
1119 }
1120 l2 = target * window_layer_size;
1121 f = 0;
1122 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg_window[c + l2 + window_offset];
1123 if (f > MAX_EXP) g = (label - 1) * alpha;
1124 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1125 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
1126 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg_window[c + l2 + window_offset];
1127 for (c = 0; c < layer1_size; c++) syn1neg_window[c + l2 + window_offset] += g * neu1[c];
1128 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg_window, c + l2 + window_offset);
1129 }
        // Noise Contrastive Estimation
1131 if (nce > 0) for (d = 0; d < nce + 1; d++) {
1132 if (d == 0) {
1133 target = word;
1134 label = 1;
1135 } else {
1136 next_random = next_random * (unsigned long long)25214903917 + 11;
1137 if(word_to_group != NULL && word_to_group[word] != -1){
1138 target = word;
1139 while(target == word) {
1140 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1141 next_random = next_random * (unsigned long long)25214903917 + 11;
1142 }
1143 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1144 }
1145 else{
1146 target = table[(next_random >> 16) % table_size];
1147 }
1148 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1149 if (target == word) continue;
1150 label = 0;
1151 }
1152 l2 = target * window_layer_size;
1153 f = 0;
1154 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce_window[c + l2 + window_offset];
1155 if (f > MAX_EXP) g = (label - 1) * alpha;
1156 else if (f < -MAX_EXP) g = (label - 0) * alpha;
1157 else {
1158 f = exp(f);
1159 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
1160 }
1161 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce_window[c + l2 + window_offset];
1162 for (c = 0; c < layer1_size; c++) syn1nce_window[c + l2 + window_offset] += g * neu1[c];
1163 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce_window, c + l2 + window_offset);
1164 }
1165 // Learn weights input -> hidden
1166 BackwardCNgramWordRepresentation(neu1, vocab[last_word].word, neu1e);
1167 }
1168 }
1169 else if(type == 4){ //training senna
1170 // in -> hidden
1171 cw = 0;
1172 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1173 c = sentence_position - window + a;
1174 if (c < 0) continue;
1175 if (c >= sentence_length) continue;
1176 last_word = sen[c];
1177 if (last_word == -1) continue;
1178 window_offset = a*layer1_size;
1179 if (a > window) window_offset-=layer1_size;
1180 for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
1181 cw++;
1182 }
1183 if (cw) {
1184 for (a = 0; a < window_hidden_size; a++){
1185 c = a*window_layer_size;
1186 for(b = 0; b < window_layer_size; b++){
1187 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1188 }
1189 }
1190 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1191 f = 0;
1192 l2 = vocab[word].point[d] * window_hidden_size;
1193 // Propagate hidden -> output
1194 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1195 if (f <= -MAX_EXP) continue;
1196 else if (f >= MAX_EXP) continue;
1197 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1198 // 'g' is the gradient multiplied by the learning rate
1199 g = (1 - vocab[word].code[d] - f) * alpha;
1200 // Propagate errors output -> hidden
1201 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word[c + l2];
1202 // Learn weights hidden -> output
1203 for (c = 0; c < window_hidden_size; c++) syn_hidden_word[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1204 }
1205 // NEGATIVE SAMPLING
1206 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1207 if (d == 0) {
1208 target = word;
1209 label = 1;
1210 } else {
1211 next_random = next_random * (unsigned long long)25214903917 + 11;
1212 if(word_to_group != NULL && word_to_group[word] != -1){
1213 target = word;
1214 while(target == word) {
1215 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1216 next_random = next_random * (unsigned long long)25214903917 + 11;
1217 }
1218 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1219 }
1220 else{
1221 target = table[(next_random >> 16) % table_size];
1222 }
1223 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1224 if (target == word) continue;
1225 label = 0;
1226 }
1227 l2 = target * window_hidden_size;
1228 f = 0;
1229 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
1230 if (f > MAX_EXP) g = (label - 1) * alpha / negative;
1231 else if (f < -MAX_EXP) g = (label - 0) * alpha / negative;
1232 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha / negative;
1233 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word_neg[c + l2];
1234 for (c = 0; c < window_hidden_size; c++) syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1235 }
1236 for (a = 0; a < window_hidden_size; a++)
1237 for(b = 0; b < window_layer_size; b++)
1238 neu1e[b] += neu2e[a] * syn_window_hidden[a*window_layer_size + b];
1239 for (a = 0; a < window_hidden_size; a++)
1240 for(b = 0; b < window_layer_size; b++)
1241 syn_window_hidden[a*window_layer_size + b] += neu2e[a] * neu1[b];
1242 // hidden -> in
1243 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1244 c = sentence_position - window + a;
1245 if (c < 0) continue;
1246 if (c >= sentence_length) continue;
1247 last_word = sen[c];
1248 if (last_word == -1) continue;
1249 window_offset = a * layer1_size;
1250 if(a > window) window_offset -= layer1_size;
1251 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
1252 }
1253 }
1254 }
1255 else{
      printf("unknown type %i\n", type);
      exit(1);
1258 }
1259 sentence_position++;
1260 if (sentence_position >= sentence_length) {
1261 sentence_length = 0;
1262 continue;
1263 }
1264 }
1265 fclose(fi);
  free(neu1);
  free(neu1e);
  free(neu2);
  free(neu2e);
1268 pthread_exit(NULL);
1269}
1270
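// Builds the vocabularies, runs the training threads, and writes the output vectors; each word
// vector (including the words listed in extra_vocab_file) is composed from its character n-grams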
1271void TrainModel() {
1272 long a, b;
1273 long extra_words;
1274 FILE *fo;
1275 FILE *fi;
1276 pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
1277 printf("Starting training using file %s\n", train_file);
1278 starting_alpha = alpha;
1279 if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
1280 if (save_vocab_file[0] != 0) SaveVocab();
1281 if (output_file[0] == 0) return;
1282 InitNet();
1283 if (negative > 0 || nce > 0) InitUnigramTable();
1284 if (negative_classes_file[0] != 0) InitClassUnigramTable();
1285 start = clock();
1286 for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
1287 for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
1288 fo = fopen(output_file, "wb");
1289 if (classes == 0) {
1290 // Save the word vectors
1291 real neu1[layer1_size];
1292
1293 // count extra words
1294 extra_words = 0;
1295 fi = fopen(extra_vocab_file, "rb");
1296 if (fi != NULL) {
1297 char word[MAX_STRING];
1298 while(1){
1299 ReadWord(word, fi);
1300 if(feof(fi)) break;
1301 extra_words++;
1302 ReadWord(word, fi);
1303 }
      fclose(fi); // only close the file if it was actually opened
    }
1306 fprintf(fo, "%lld %lld\n", vocab_size + extra_words, layer1_size);
1307 for (a = 0; a < vocab_size; a++) {
1308 fprintf(fo, "%s ", vocab[a].word);
1309 for (b = 0; b < layer1_size; b++) neu1[b] = 0;
1310 ForwardCNgramWordRepresentation(neu1, vocab[a].word);
1311
1312 if (binary) for (b = 0; b < layer1_size; b++) fwrite(&neu1[b], sizeof(real), 1, fo);
1313 else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", neu1[b]);
1314 fprintf(fo, "\n");
1315 }
1316 fi = fopen(extra_vocab_file, "rb");
1317 if (fi != NULL) {
1318 char word[MAX_STRING];
1319 while(1){
1320 ReadWord(word, fi);
1321 if(feof(fi)) break;
1322 for (b = 0; b < layer1_size; b++) neu1[b] = 0;
1323 fprintf(fo, "%s ", word);
1324 ForwardCNgramWordRepresentation(neu1, word);
1325 if (binary) for (b = 0; b < layer1_size; b++) fwrite(&neu1[b], sizeof(real), 1, fo);
1326 else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", neu1[b]);
1327 fprintf(fo, "\n");
1328 ReadWord(word, fi);
1329 }
      fclose(fi); // only close the file if it was actually opened
    }
1332 }
1333 fclose(fo);
1334}
1335
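// Returns the position of the command-line flag 'str' in argv, or -1 if it is not present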
1336int ArgPos(char *str, int argc, char **argv) {
1337 int a;
1338 for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
1339 if (a == argc - 1) {
1340 printf("Argument missing for %s\n", str);
1341 exit(1);
1342 }
1343 return a;
1344 }
1345 return -1;
1346}
1347
1348int main(int argc, char **argv) {
1349 int i;
1350 if (argc == 1) {
1351 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1352 printf("Options:\n");
1353 printf("Parameters for training:\n");
1354 printf("\t-train <file>\n");
1355 printf("\t\tUse text data from <file> to train the model\n");
1356 printf("\t-output <file>\n");
1357 printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
1358 printf("\t-size <int>\n");
1359 printf("\t\tSet size of word vectors; default is 100\n");
1360 printf("\t-window <int>\n");
1361 printf("\t\tSet max skip length between words; default is 5\n");
1362 printf("\t-sample <float>\n");
1363 printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1364 printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1365 printf("\t-hs <int>\n");
1366 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1367 printf("\t-negative <int>\n");
1368 printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1369 printf("\t-negative-classes <file>\n");
1370 printf("\t\tNegative classes to sample from\n");
1371 printf("\t-nce <int>\n");
    printf("\t\tNumber of negative examples for NCE; default is 10, common values are 3 - 10 (0 = not used)\n");
1373 printf("\t-threads <int>\n");
1374 printf("\t\tUse <int> threads (default 12)\n");
1375 printf("\t-iter <int>\n");
1376 printf("\t\tRun more training iterations (default 5)\n");
1377 printf("\t-cngram-size <int>\n");
    printf("\t\tSet the size of the character n-grams; default is 6\n");
1379 printf("\t-extra_vocab_file <file>\n");
    printf("\t\tAlso write output vectors for the extra words listed in <file> (one word per line)\n");
1381 printf("\t-min-count <int>\n");
1382 printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
1383 printf("\t-alpha <float>\n");
1384 printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1385 printf("\t-classes <int>\n");
1386 printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1387 printf("\t-debug <int>\n");
1388 printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
1389 printf("\t-binary <int>\n");
    printf("\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1391 printf("\t-save-vocab <file>\n");
1392 printf("\t\tThe vocabulary will be saved to <file>\n");
1393 printf("\t-read-vocab <file>\n");
1394 printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1395 printf("\t-type <int>\n");
1396 printf("\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1397 printf("\t-cap <int>\n");
1398 printf("\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1399 printf("\nExamples:\n");
1400 printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3 -cngram-size 4 -extra_vocab_file extra.txt \n\n");
1401 return 0;
1402 }
1403 output_file[0] = 0;
1404 save_vocab_file[0] = 0;
1405 read_vocab_file[0] = 0;
1406 negative_classes_file[0] = 0;
1407 if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
1408 if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
1409 if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
1410 if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
1411 if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
1412 if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
1413 if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
1414 if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
1415 if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
1416 if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
1417 if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
1418 if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
1419 if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
1420 if ((i = ArgPos((char *)"-nce", argc, argv)) > 0) nce = atoi(argv[i + 1]);
1421 if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
1422 if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
1423 if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
1424 if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
1425 if ((i = ArgPos((char *)"-cap", argc, argv)) > 0) cap = atoi(argv[i + 1]);
1426 if ((i = ArgPos((char *)"-cngram-size", argc, argv)) > 0) cngram_size = atoi(argv[i + 1]);
1427 if ((i = ArgPos((char *)"-extra_vocab_file", argc, argv)) > 0) strcpy(extra_vocab_file, argv[i + 1]);
1428 if (type==0 || type==2 || type==4) alpha = 0.05;
1429 if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
1430 vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
1431 vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
1432 cngram_vocab = (struct vocab_word *)calloc(cngram_vocab_max_size, sizeof(struct vocab_word));
1433 cngram_vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
1434 expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1435 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1436 expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1437 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1438 }
1439 TrainModel();
1440 return 0;
1441}
1442