1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
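/*
 * Build/usage sketch (illustrative, not part of the original file; it assumes this
 * translation unit is saved as word2vec.c and a POSIX toolchain with pthreads):
 *   gcc word2vec.c -o word2vec -lm -pthread -O3
 *   ./word2vec -train data.txt -output vec.txt -size 200 -window 5 -negative 5 -type 3 -threads 12
 * See the usage text printed by main() for the full option list.
 */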
27const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
40struct vocab_word *vocab;
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
42 num_threads = 12, min_reduce = 1;
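// 'type' selects the training architecture: 0 = CBOW, 1 = skip-gram (default),
// 2 = CWINDOW, 3 = structured skip-gram, 4 = SENNA-style (see -type in the usage text).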
43int *vocab_hash;
44long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
45long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
46 classes = 0;
47real alpha = 0.025, starting_alpha, sample = 1e-3;
48real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
49clock_t start;
50
51real *syn1_window, *syn1neg_window, *syn1nce_window;
52int w_offset, window_layer_size;
53
54int window_hidden_size = 500;
55real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
56 *syn_hidden_word_nce;
57
58int hs = 0, negative = 5;
59const int table_size = 1e8;
60int *table;
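// Unigram table for negative sampling: table_size precomputed word indices,
// filled in InitUnigramTable().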
61
62// contrastive negative sampling
63char negative_classes_file[MAX_STRING];
64int *word_to_group;
65int *group_to_table; //group_size*table_size
66int class_number;
67
68//nce
69real* noise_distribution;
70int nce = 0;
71
72//param caps
73real CAP_VALUE = 50;
74int cap = 0;
75
76void capParam(real* array, int index) {
77 if (array[index] > CAP_VALUE)
78 array[index] = CAP_VALUE;
79 else if (array[index] < -CAP_VALUE)
80 array[index] = -CAP_VALUE;
81}
82
83real hardTanh(real x) {
84 if (x >= 1) {
85 return 1;
86 } else if (x <= -1) {
87 return -1;
88 } else {
89 return x;
90 }
91}
92
93real dHardTanh(real x, real g) {
94 if (x > 1 && g > 0) {
95 return 0;
96 }
97 if (x < -1 && g < 0) {
98 return 0;
99 }
100 return 1;
101}
102
103void InitUnigramTable() {
104 int a, i;
105 long long train_words_pow = 0;
106 real d1, power = 0.75;
107 table = (int *) malloc(table_size * sizeof(int));
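 // Fill the table so that word a occupies a share of slots proportional to
 // cn(a)^0.75; sampling a uniform slot then draws negatives from this
 // smoothed unigram distribution.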
108 for (a = 0; a < vocab_size; a++)
109 train_words_pow += pow(vocab[a].cn, power);
110 i = 0;
111 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
112 for (a = 0; a < table_size; a++) {
113 table[a] = i;
114 if (a / (real) table_size > d1) {
115 i++;
116 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
117 }
118 if (i >= vocab_size)
119 i = vocab_size - 1;
120 }
121
122 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
123 for (a = 0; a < vocab_size; a++)
124 noise_distribution[a] = pow(vocab[a].cn, power)
125 / (real) train_words_pow;
126}
127
128// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
129void ReadWord(char *word, FILE *fin) {
130 int a = 0, ch;
131 while ((ch = fgetc(fin)) != EOF) {
133 if (ch == 13)
134 continue;
135 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
136 if (a > 0) {
137 if (ch == '\n')
138 ungetc(ch, fin);
139 break;
140 }
141 if (ch == '\n') {
142 strcpy(word, (char *) "</s>");
143 return;
144 } else
145 continue;
146 }
147 word[a] = ch;
148 a++;
149 if (a >= MAX_STRING - 1)
150 a--; // Truncate too long words
151 }
152 word[a] = 0;
153}
154
155// Returns hash value of a word
156int GetWordHash(char *word) {
157 unsigned long long a, hash = 0;
158 for (a = 0; a < strlen(word); a++)
159 hash = hash * 257 + word[a];
160 hash = hash % vocab_hash_size;
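 // Polynomial rolling hash (base 257) reduced modulo vocab_hash_size; collisions
 // are resolved by linear probing in SearchVocab() and AddWordToVocab().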
161 return hash;
162}
163
164// Returns position of a word in the vocabulary; if the word is not found, returns -1
165int SearchVocab(char *word) {
166 unsigned int hash = GetWordHash(word);
167 while (1) {
168 if (vocab_hash[hash] == -1)
169 return -1;
170 if (!strcmp(word, vocab[vocab_hash[hash]].word))
171 return vocab_hash[hash];
172 hash = (hash + 1) % vocab_hash_size;
173 }
174 return -1;
175}
176
177// Reads a word and returns its index in the vocabulary
178int ReadWordIndex(FILE *fin) {
179 char word[MAX_STRING];
180 ReadWord(word, fin);
181 if (feof(fin))
182 return -1;
183 return SearchVocab(word);
184}
185
186// Adds a word to the vocabulary
187int AddWordToVocab(char *word) {
188 unsigned int hash, length = strlen(word) + 1;
189 if (length > MAX_STRING)
190 length = MAX_STRING;
191 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
192 strcpy(vocab[vocab_size].word, word);
193 vocab[vocab_size].cn = 0;
194 vocab_size++;
195 // Reallocate memory if needed
196 if (vocab_size + 2 >= vocab_max_size) {
197 vocab_max_size += 1000;
198 vocab = (struct vocab_word *) realloc(vocab,
199 vocab_max_size * sizeof(struct vocab_word));
200 }
201 hash = GetWordHash(word);
202 while (vocab_hash[hash] != -1)
203 hash = (hash + 1) % vocab_hash_size;
204 vocab_hash[hash] = vocab_size - 1;
205 return vocab_size - 1;
206}
207
208// Used later for sorting by word counts
209int VocabCompare(const void *a, const void *b) {
210 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
211}
212
213// Sorts the vocabulary by frequency using word counts
214void SortVocab() {
215 int a, size;
216 unsigned int hash;
217 // Sort the vocabulary and keep </s> at the first position
218 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
219 for (a = 0; a < vocab_hash_size; a++)
220 vocab_hash[a] = -1;
221 size = vocab_size;
222 train_words = 0;
223 for (a = 0; a < size; a++) {
224 // Words occurring less than min_count times will be discarded from the vocab
225 if ((vocab[a].cn < min_count) && (a != 0)) {
226 vocab_size--;
227 free(vocab[a].word);
228 } else {
229 // The hash has to be recomputed, since it is no longer valid after sorting
230 hash = GetWordHash(vocab[a].word);
231 while (vocab_hash[hash] != -1)
232 hash = (hash + 1) % vocab_hash_size;
233 vocab_hash[hash] = a;
234 train_words += vocab[a].cn;
235 }
236 }
237 vocab = (struct vocab_word *) realloc(vocab,
238 (vocab_size + 1) * sizeof(struct vocab_word));
239 // Allocate memory for the binary tree construction
240 for (a = 0; a < vocab_size; a++) {
241 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
242 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
243 }
244}
245
246// Reduces the vocabulary by removing infrequent tokens
247void ReduceVocab() {
248 int a, b = 0;
249 unsigned int hash;
250 for (a = 0; a < vocab_size; a++)
251 if (vocab[a].cn > min_reduce) {
252 vocab[b].cn = vocab[a].cn;
253 vocab[b].word = vocab[a].word;
254 b++;
255 } else
256 free(vocab[a].word);
257 vocab_size = b;
258 for (a = 0; a < vocab_hash_size; a++)
259 vocab_hash[a] = -1;
260 for (a = 0; a < vocab_size; a++) {
261 // The hash has to be recomputed, since it is no longer valid
262 hash = GetWordHash(vocab[a].word);
263 while (vocab_hash[hash] != -1)
264 hash = (hash + 1) % vocab_hash_size;
265 vocab_hash[hash] = a;
266 }
267 fflush(stdout);
268 min_reduce++;
269}
270
271// Create binary Huffman tree using the word counts
272// Frequent words will have short unique binary codes
273void CreateBinaryTree() {
274 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
275 char code[MAX_CODE_LENGTH];
276 long long *count = (long long *) calloc(vocab_size * 2 + 1,
277 sizeof(long long));
278 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
279 sizeof(long long));
280 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
281 sizeof(long long));
282 for (a = 0; a < vocab_size; a++)
283 count[a] = vocab[a].cn;
284 for (a = vocab_size; a < vocab_size * 2; a++)
285 count[a] = 1e15;
286 pos1 = vocab_size - 1;
287 pos2 = vocab_size;
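 // count[0..vocab_size-1] holds the leaf (word) counts, sorted in decreasing order;
 // slots from vocab_size on hold internal-node counts, initialised to 1e15 so that
 // unused slots are never picked as minima. pos1 walks the leaves from the least
 // frequent word upward, pos2 walks the internal nodes as they are created.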
288 // The following algorithm constructs the Huffman tree by adding one node at a time
289 for (a = 0; a < vocab_size - 1; a++) {
290 // First, find two smallest nodes 'min1, min2'
291 if (pos1 >= 0) {
292 if (count[pos1] < count[pos2]) {
293 min1i = pos1;
294 pos1--;
295 } else {
296 min1i = pos2;
297 pos2++;
298 }
299 } else {
300 min1i = pos2;
301 pos2++;
302 }
303 if (pos1 >= 0) {
304 if (count[pos1] < count[pos2]) {
305 min2i = pos1;
306 pos1--;
307 } else {
308 min2i = pos2;
309 pos2++;
310 }
311 } else {
312 min2i = pos2;
313 pos2++;
314 }
315 count[vocab_size + a] = count[min1i] + count[min2i];
316 parent_node[min1i] = vocab_size + a;
317 parent_node[min2i] = vocab_size + a;
318 binary[min2i] = 1;
319 }
320 // Now assign binary code to each vocabulary word
321 for (a = 0; a < vocab_size; a++) {
322 b = a;
323 i = 0;
324 while (1) {
325 code[i] = binary[b];
326 point[i] = b;
327 i++;
328 b = parent_node[b];
329 if (b == vocab_size * 2 - 2)
330 break;
331 }
332 vocab[a].codelen = i;
333 vocab[a].point[0] = vocab_size - 2;
334 for (b = 0; b < i; b++) {
335 vocab[a].code[i - b - 1] = code[b];
336 vocab[a].point[i - b] = point[b] - vocab_size;
337 }
338 }
339 free(count);
340 free(binary);
341 free(parent_node);
342}
343
344void LearnVocabFromTrainFile() {
345 char word[MAX_STRING];
346 FILE *fin;
347 long long a, i;
348 for (a = 0; a < vocab_hash_size; a++)
349 vocab_hash[a] = -1;
350 fin = fopen(train_file, "rb");
351 if (fin == NULL) {
352 printf("ERROR: training data file not found!\n");
353 exit(1);
354 }
355 vocab_size = 0;
356 AddWordToVocab((char *) "</s>");
357 while (1) {
358 ReadWord(word, fin);
359 if (feof(fin))
360 break;
361 train_words++;
362 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
363 printf("%lldK%c", train_words / 1000, 13);
364 fflush(stdout);
365 }
366 i = SearchVocab(word);
367 if (i == -1) {
368 a = AddWordToVocab(word);
369 vocab[a].cn = 1;
370 } else
371 vocab[i].cn++;
372 if (vocab_size > vocab_hash_size * 0.7)
373 ReduceVocab();
374 }
375 SortVocab();
376 if (debug_mode > 0) {
377 printf("Vocab size: %lld\n", vocab_size);
378 printf("Words in train file: %lld\n", train_words);
379 }
380 file_size = ftell(fin);
381 fclose(fin);
382}
383
384void SaveVocab() {
385 long long i;
386 FILE *fo = fopen(save_vocab_file, "wb");
387 for (i = 0; i < vocab_size; i++)
388 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
389 fclose(fo);
390}
391
392void ReadVocab() {
393 long long a, i = 0;
394 char c;
395 char word[MAX_STRING];
396 FILE *fin = fopen(read_vocab_file, "rb");
397 if (fin == NULL) {
398 printf("Vocabulary file not found\n");
399 exit(1);
400 }
401 for (a = 0; a < vocab_hash_size; a++)
402 vocab_hash[a] = -1;
403 vocab_size = 0;
404 while (1) {
405 ReadWord(word, fin);
406 if (feof(fin))
407 break;
408 a = AddWordToVocab(word);
409 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
410 i++;
411 }
412 SortVocab();
413 if (debug_mode > 0) {
414 printf("Vocab size: %lld\n", vocab_size);
415 printf("Words in train file: %lld\n", train_words);
416 }
417 fin = fopen(train_file, "rb");
418 if (fin == NULL) {
419 printf("ERROR: training data file not found!\n");
420 exit(1);
421 }
422 fseek(fin, 0, SEEK_END);
423 file_size = ftell(fin);
424 fclose(fin);
425}
426
427void InitClassUnigramTable() {
428 long long a, c;
429 printf("loading class unigrams \n");
430 FILE *fin = fopen(negative_classes_file, "rb");
431 if (fin == NULL) {
432 printf("ERROR: class file not found!\n");
433 exit(1);
434 }
435 word_to_group = (int *) malloc(vocab_size * sizeof(int));
436 for (a = 0; a < vocab_size; a++)
437 word_to_group[a] = -1;
438 char class[MAX_STRING];
439 char prev_class[MAX_STRING];
440 prev_class[0] = 0;
441 char word[MAX_STRING];
442 class_number = -1;
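 // The class file is consumed three whitespace-separated tokens at a time:
 // a class label, a word, and a third token that is read but ignored.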
443 while (1) {
444 if (feof(fin))
445 break;
446 ReadWord(class, fin);
447 ReadWord(word, fin);
448 int word_index = SearchVocab(word);
449 if (word_index != -1) {
450 if (strcmp(class, prev_class) != 0) {
451 class_number++;
452 strcpy(prev_class, class);
453 }
454 word_to_group[word_index] = class_number;
455 }
456 ReadWord(word, fin);
457 }
458 class_number++;
459 fclose(fin);
460
461 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
462 long long train_words_pow = 0;
463 real d1, power = 0.75;
464
465 for (c = 0; c < class_number; c++) {
466 long long offset = c * table_size;
467 train_words_pow = 0;
468 for (a = 0; a < vocab_size; a++)
469 if (word_to_group[a] == c)
470 train_words_pow += pow(vocab[a].cn, power);
471 int i = 0;
472 while (i < vocab_size && word_to_group[i] != c)
473 i++;
474 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
475 for (a = 0; a < table_size; a++) {
476 //printf("index %lld , word %d\n", a, i);
477 group_to_table[offset + a] = i;
478 if (a / (real) table_size > d1) {
479 i++;
480 while (i < vocab_size && word_to_group[i] != c)
481 i++;
482 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
483 }
484 if (i >= vocab_size) {
485 i = vocab_size - 1;
486 while (i >= 0 && word_to_group[i] != c)
 i--;
 }
487 }
488 }
489}
490
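// Writes the input embeddings (syn0) and the window->hidden weights to save_net_file.
// Note that syn_window_hidden is only allocated for -type 4 (or when a net was read),
// so -save-net assumes that configuration.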
491void SaveNet() {
492 FILE *fnet = fopen(save_net_file, "wb");
493 if (fnet == NULL) {
494 printf("ERROR: net parameter file could not be opened for writing\n");
495 exit(1);
496 }
497 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
498 fwrite(syn_window_hidden, sizeof(real), window_hidden_size * window_layer_size, fnet);
499 fclose(fnet);
500}
501
502void InitNet() {
503 long long a, b;
504 unsigned long long next_random = 1;
505 window_layer_size = layer1_size * window * 2;
506 a = posix_memalign((void **) &syn0, 128,
507 (long long) vocab_size * layer1_size * sizeof(real));
508 if (syn0 == NULL) {
509 printf("Memory allocation failed\n");
510 exit(1);
511 }
512
513 if (hs) {
514 a = posix_memalign((void **) &syn1, 128,
515 (long long) vocab_size * layer1_size * sizeof(real));
516 if (syn1 == NULL) {
517 printf("Memory allocation failed\n");
518 exit(1);
519 }
520 a = posix_memalign((void **) &syn1_window, 128,
521 (long long) vocab_size * window_layer_size * sizeof(real));
522 if (syn1_window == NULL) {
523 printf("Memory allocation failed\n");
524 exit(1);
525 }
526 a = posix_memalign((void **) &syn_hidden_word, 128,
527 (long long) vocab_size * window_hidden_size * sizeof(real));
528 if (syn_hidden_word == NULL) {
529 printf("Memory allocation failed\n");
530 exit(1);
531 }
532
533 for (a = 0; a < vocab_size; a++)
534 for (b = 0; b < layer1_size; b++)
535 syn1[a * layer1_size + b] = 0;
536 for (a = 0; a < vocab_size; a++)
537 for (b = 0; b < window_layer_size; b++)
538 syn1_window[a * window_layer_size + b] = 0;
539 for (a = 0; a < vocab_size; a++)
540 for (b = 0; b < window_hidden_size; b++)
541 syn_hidden_word[a * window_hidden_size + b] = 0;
542 }
543 if (negative > 0) {
544 if (type == 0) {
545 a = posix_memalign((void **) &syn1neg, 128,
546 (long long) vocab_size * layer1_size * sizeof(real));
547 if (syn1neg == NULL) {
548 printf("Memory allocation failed\n");
549 exit(1);
550 }
551 for (a = 0; a < vocab_size; a++)
552 for (b = 0; b < layer1_size; b++)
553 syn1neg[a * layer1_size + b] = 0;
554 } else if (type == 3) {
555 a = posix_memalign((void **) &syn1neg_window, 128,
556 (long long) vocab_size * window_layer_size * sizeof(real));
557 if (syn1neg_window == NULL) {
558 printf("Memory allocation failed\n");
559 exit(1);
560 }
561 for (a = 0; a < vocab_size; a++)
562 for (b = 0; b < window_layer_size; b++)
563 syn1neg_window[a * window_layer_size + b] = 0;
564 } else if (type == 4) {
565 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
566 (long long) vocab_size * window_hidden_size * sizeof(real));
567 if (syn_hidden_word_neg == NULL) {
568 printf("Memory allocation failed\n");
569 exit(1);
570 }
571 for (a = 0; a < vocab_size; a++)
572 for (b = 0; b < window_hidden_size; b++)
573 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
574 }
575 }
576 if (nce > 0) {
577 a = posix_memalign((void **) &syn1nce, 128,
578 (long long) vocab_size * layer1_size * sizeof(real));
579 if (syn1nce == NULL) {
580 printf("Memory allocation failed\n");
581 exit(1);
582 }
583 a = posix_memalign((void **) &syn1nce_window, 128,
584 (long long) vocab_size * window_layer_size * sizeof(real));
585 if (syn1nce_window == NULL) {
586 printf("Memory allocation failed\n");
587 exit(1);
588 }
589 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
590 (long long) vocab_size * window_hidden_size * sizeof(real));
591 if (syn_hidden_word_nce == NULL) {
592 printf("Memory allocation failed\n");
593 exit(1);
594 }
595
596 for (a = 0; a < vocab_size; a++)
597 for (b = 0; b < layer1_size; b++)
598 syn1nce[a * layer1_size + b] = 0;
599 for (a = 0; a < vocab_size; a++)
600 for (b = 0; b < window_layer_size; b++)
601 syn1nce_window[a * window_layer_size + b] = 0;
602 for (a = 0; a < vocab_size; a++)
603 for (b = 0; b < window_hidden_size; b++)
604 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
605 }
606
607 if (type == 4) {
608 a = posix_memalign((void **) &syn_window_hidden, 128,
609 window_hidden_size * window_layer_size * sizeof(real));
610 if (syn_window_hidden == NULL) {
611 printf("Memory allocation failed\n");
612 exit(1);
613 }
614 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
615 next_random = next_random * (unsigned long long) 25214903917 + 11;
616 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
617 - 0.5) / (window_hidden_size * window_layer_size);
618 }
619 }
620
621 if (read_net_file[0] == 0) {
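 // No net file to read: initialise the input embeddings with small random values
 // in (-0.5, 0.5) / layer1_size; the output-side weight matrices allocated above
 // were zeroed explicitly.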
622 for (a = 0; a < vocab_size; a++)
623 for (b = 0; b < layer1_size; b++) {
624 next_random = next_random * (unsigned long long) 25214903917
625 + 11;
626 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
627 / (real) 65536) - 0.5) / layer1_size;
628 }
629 } else {
630 FILE *fnet = fopen(read_net_file, "rb");
631 if (fnet == NULL) {
632 printf("Net parameter file not found\n");
633 exit(1);
634 }
635 fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
636 a = posix_memalign((void **) &syn_window_hidden, 128,
637 window_hidden_size * window_layer_size * sizeof(real));
638 if (syn_window_hidden == NULL) {
639 printf("Memory allocation failed\n");
640 exit(1);
641 }
642 fread(syn_window_hidden, sizeof(real), window_hidden_size * window_layer_size, fnet);
643 fclose(fnet);
644 }
645
646 CreateBinaryTree();
647}
648
649void *TrainModelThread(void *id) {
650 long long a, b, d, cw, word, last_word, sentence_length = 0,
651 sentence_position = 0;
652 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
653 long long l1, l2, c, target, label, local_iter = iter;
654 unsigned long long next_random = (long long) id;
655 real f, g;
656 clock_t now;
657 int input_len_1 = layer1_size;
658 int window_offset = -1;
659 if (type == 2 || type == 4) {
660 input_len_1 = window_layer_size;
661 }
662 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
663 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
664
665 int input_len_2 = 0;
666 if (type == 4) {
667 input_len_2 = window_hidden_size;
668 }
669 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
670 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
671
672 FILE *fi = fopen(train_file, "rb");
673 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
674 while (1) {
675 if (word_count - last_word_count > 10000) {
676 word_count_actual += word_count - last_word_count;
677 last_word_count = word_count;
678 if ((debug_mode > 1)) {
679 now = clock();
680 printf(
681 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
682 13, alpha,
683 word_count_actual / (real) (iter * train_words + 1)
684 * 100,
685 word_count_actual
686 / ((real) (now - start + 1)
687 / (real) CLOCKS_PER_SEC * 1000));
688 fflush(stdout);
689 }
690 alpha = starting_alpha
691 * (1 - word_count_actual / (real) (iter * train_words + 1));
692 if (alpha < starting_alpha * 0.0001)
693 alpha = starting_alpha * 0.0001;
694 }
695 if (sentence_length == 0) {
696 while (1) {
697 word = ReadWordIndex(fi);
698 if (feof(fi))
699 break;
700 if (word == -1)
701 continue;
702 word_count++;
703 if (word == 0)
704 break;
705 // The subsampling randomly discards frequent words while keeping the ranking the same
706 if (sample > 0) {
707 real ran = (sqrt(vocab[word].cn / (sample * train_words))
708 + 1) * (sample * train_words) / vocab[word].cn;
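 // ran = sqrt(t/f) + t/f, with f = cn/train_words (word frequency) and t = sample;
 // the word is kept with probability min(1, ran), so very frequent words are
 // aggressively down-sampled.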
709 next_random = next_random * (unsigned long long) 25214903917
710 + 11;
711 if (ran < (next_random & 0xFFFF) / (real) 65536)
712 continue;
713 }
714 sen[sentence_length] = word;
715 sentence_length++;
716 if (sentence_length >= MAX_SENTENCE_LENGTH)
717 break;
718 }
719 sentence_position = 0;
720 }
721 if (feof(fi) || (word_count > train_words / num_threads)) {
722 word_count_actual += word_count - last_word_count;
723 local_iter--;
724 if (local_iter == 0)
725 break;
726 word_count = 0;
727 last_word_count = 0;
728 sentence_length = 0;
729 fseek(fi, file_size / (long long) num_threads * (long long) id,
730 SEEK_SET);
731 continue;
732 }
733 word = sen[sentence_position];
734 if (word == -1)
735 continue;
736 for (c = 0; c < input_len_1; c++)
737 neu1[c] = 0;
738 for (c = 0; c < input_len_1; c++)
739 neu1e[c] = 0;
740 for (c = 0; c < input_len_2; c++)
741 neu2[c] = 0;
742 for (c = 0; c < input_len_2; c++)
743 neu2e[c] = 0;
744 next_random = next_random * (unsigned long long) 25214903917 + 11;
745 b = next_random % window;
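 // next_random is a simple linear congruential generator; b randomly narrows the
 // context used by the CBOW and skip-gram loops below, so the effective window
 // varies between 1 and 'window' per target word (types 2-4 always use the full window).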
746 if (type == 0) { //train the cbow architecture
747 // in -> hidden
748 cw = 0;
749 for (a = b; a < window * 2 + 1 - b; a++)
750 if (a != window) {
751 c = sentence_position - window + a;
752 if (c < 0)
753 continue;
754 if (c >= sentence_length)
755 continue;
756 last_word = sen[c];
757 if (last_word == -1)
758 continue;
759 for (c = 0; c < layer1_size; c++)
760 neu1[c] += syn0[c + last_word * layer1_size];
761 cw++;
762 }
763 if (cw) {
764 for (c = 0; c < layer1_size; c++)
765 neu1[c] /= cw;
766 if (hs)
767 for (d = 0; d < vocab[word].codelen; d++) {
768 f = 0;
769 l2 = vocab[word].point[d] * layer1_size;
770 // Propagate hidden -> output
771 for (c = 0; c < layer1_size; c++)
772 f += neu1[c] * syn1[c + l2];
773 if (f <= -MAX_EXP)
774 continue;
775 else if (f >= MAX_EXP)
776 continue;
777 else
778 f = expTable[(int) ((f + MAX_EXP)
779 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
780 // 'g' is the gradient multiplied by the learning rate
781 g = (1 - vocab[word].code[d] - f) * alpha;
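 // For hierarchical softmax the target at this tree node is (1 - code[d]),
 // so g = (target - sigmoid(score)) * alpha.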
782 // Propagate errors output -> hidden
783 for (c = 0; c < layer1_size; c++)
784 neu1e[c] += g * syn1[c + l2];
785 // Learn weights hidden -> output
786 for (c = 0; c < layer1_size; c++)
787 syn1[c + l2] += g * neu1[c];
788 if (cap == 1)
789 for (c = 0; c < layer1_size; c++)
790 capParam(syn1, c + l2);
791 }
792 // NEGATIVE SAMPLING
793 if (negative > 0)
794 for (d = 0; d < negative + 1; d++) {
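 // d == 0 is the positive (observed) word; the remaining draws are negatives taken
 // from the unigram table, or from the word's class-specific table when
 // -negative-classes is given.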
795 if (d == 0) {
796 target = word;
797 label = 1;
798 } else {
799 next_random = next_random
800 * (unsigned long long) 25214903917 + 11;
801 if (word_to_group != NULL
802 && word_to_group[word] != -1) {
803 target = word;
804 while (target == word) {
805 target = group_to_table[word_to_group[word]
806 * table_size
807 + (next_random >> 16) % table_size];
808 next_random = next_random
809 * (unsigned long long) 25214903917
810 + 11;
811 }
812 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
813 } else {
814 target =
815 table[(next_random >> 16) % table_size];
816 }
817 if (target == 0)
818 target = next_random % (vocab_size - 1) + 1;
819 if (target == word)
820 continue;
821 label = 0;
822 }
823 l2 = target * layer1_size;
824 f = 0;
825 for (c = 0; c < layer1_size; c++)
826 f += neu1[c] * syn1neg[c + l2];
827 if (f > MAX_EXP)
828 g = (label - 1) * alpha;
829 else if (f < -MAX_EXP)
830 g = (label - 0) * alpha;
831 else
832 g = (label
833 - expTable[(int) ((f + MAX_EXP)
834 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
835 * alpha;
836 for (c = 0; c < layer1_size; c++)
837 neu1e[c] += g * syn1neg[c + l2];
838 for (c = 0; c < layer1_size; c++)
839 syn1neg[c + l2] += g * neu1[c];
840 if (cap == 1)
841 for (c = 0; c < layer1_size; c++)
842 capParam(syn1neg, c + l2);
843 }
844 // Noise Contrastive Estimation
845 if (nce > 0)
846 for (d = 0; d < nce + 1; d++) {
847 if (d == 0) {
848 target = word;
849 label = 1;
850 } else {
851 next_random = next_random
852 * (unsigned long long) 25214903917 + 11;
853 if (word_to_group != NULL
854 && word_to_group[word] != -1) {
855 target = word;
856 while (target == word) {
857 target = group_to_table[word_to_group[word]
858 * table_size
859 + (next_random >> 16) % table_size];
860 next_random = next_random
861 * (unsigned long long) 25214903917
862 + 11;
863 }
864 } else {
865 target =
866 table[(next_random >> 16) % table_size];
867 }
868 if (target == 0)
869 target = next_random % (vocab_size - 1) + 1;
870 if (target == word)
871 continue;
872 label = 0;
873 }
874 l2 = target * layer1_size;
875 f = 0;
876
877 for (c = 0; c < layer1_size; c++)
878 f += neu1[c] * syn1nce[c + l2];
879 if (f > MAX_EXP)
880 g = (label - 1) * alpha;
881 else if (f < -MAX_EXP)
882 g = (label - 0) * alpha;
883 else {
884 f = exp(f);
885 g =
886 (label
887 - f
888 / (noise_distribution[target]
889 * nce + f)) * alpha;
890 }
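 // NCE gradient: with f = exp(score), the model posterior is
 // f / (f + nce * Pn(target)), where Pn is the smoothed unigram noise
 // distribution, and g = (label - posterior) * alpha.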
891 for (c = 0; c < layer1_size; c++)
892 neu1e[c] += g * syn1nce[c + l2];
893 for (c = 0; c < layer1_size; c++)
894 syn1nce[c + l2] += g * neu1[c];
895 if (cap == 1)
896 for (c = 0; c < layer1_size; c++)
897 capParam(syn1nce, c + l2);
898 }
899 // hidden -> in
900 for (a = b; a < window * 2 + 1 - b; a++)
901 if (a != window) {
902 c = sentence_position - window + a;
903 if (c < 0)
904 continue;
905 if (c >= sentence_length)
906 continue;
907 last_word = sen[c];
908 if (last_word == -1)
909 continue;
910 for (c = 0; c < layer1_size; c++)
911 syn0[c + last_word * layer1_size] += neu1e[c];
912 }
913 }
914 } else if (type == 1) { //train skip-gram
915 for (a = b; a < window * 2 + 1 - b; a++)
916 if (a != window) {
917 c = sentence_position - window + a;
918 if (c < 0)
919 continue;
920 if (c >= sentence_length)
921 continue;
922 last_word = sen[c];
923 if (last_word == -1)
924 continue;
925 l1 = last_word * layer1_size;
926 for (c = 0; c < layer1_size; c++)
927 neu1e[c] = 0;
928 // HIERARCHICAL SOFTMAX
929 if (hs)
930 for (d = 0; d < vocab[word].codelen; d++) {
931 f = 0;
932 l2 = vocab[word].point[d] * layer1_size;
933 // Propagate hidden -> output
934 for (c = 0; c < layer1_size; c++)
935 f += syn0[c + l1] * syn1[c + l2];
936 if (f <= -MAX_EXP)
937 continue;
938 else if (f >= MAX_EXP)
939 continue;
940 else
941 f = expTable[(int) ((f + MAX_EXP)
942 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
943 // 'g' is the gradient multiplied by the learning rate
944 g = (1 - vocab[word].code[d] - f) * alpha;
945 // Propagate errors output -> hidden
946 for (c = 0; c < layer1_size; c++)
947 neu1e[c] += g * syn1[c + l2];
948 // Learn weights hidden -> output
949 for (c = 0; c < layer1_size; c++)
950 syn1[c + l2] += g * syn0[c + l1];
951 if (cap == 1)
952 for (c = 0; c < layer1_size; c++)
953 capParam(syn1, c + l2);
954 }
955 // NEGATIVE SAMPLING
956 if (negative > 0)
957 for (d = 0; d < negative + 1; d++) {
958 if (d == 0) {
959 target = word;
960 label = 1;
961 } else {
962 next_random = next_random
963 * (unsigned long long) 25214903917 + 11;
964 if (word_to_group != NULL
965 && word_to_group[word] != -1) {
966 target = word;
967 while (target == word) {
968 target =
969 group_to_table[word_to_group[word]
970 * table_size
971 + (next_random >> 16)
972 % table_size];
973 next_random =
974 next_random
975 * (unsigned long long) 25214903917
976 + 11;
977 }
978 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
979 } else {
980 target = table[(next_random >> 16)
981 % table_size];
982 }
983 if (target == 0)
984 target = next_random % (vocab_size - 1) + 1;
985 if (target == word)
986 continue;
987 label = 0;
988 }
989 l2 = target * layer1_size;
990 f = 0;
991 for (c = 0; c < layer1_size; c++)
992 f += syn0[c + l1] * syn1neg[c + l2];
993 if (f > MAX_EXP)
994 g = (label - 1) * alpha;
995 else if (f < -MAX_EXP)
996 g = (label - 0) * alpha;
997 else
998 g =
999 (label
1000 - expTable[(int) ((f + MAX_EXP)
1001 * (EXP_TABLE_SIZE
1002 / MAX_EXP / 2))])
1003 * alpha;
1004 for (c = 0; c < layer1_size; c++)
1005 neu1e[c] += g * syn1neg[c + l2];
1006 for (c = 0; c < layer1_size; c++)
1007 syn1neg[c + l2] += g * syn0[c + l1];
1008 if (cap == 1)
1009 for (c = 0; c < layer1_size; c++)
1010 capParam(syn1neg, c + l2);
1011 }
1012 //Noise Contrastive Estimation
1013 if (nce > 0)
1014 for (d = 0; d < nce + 1; d++) {
1015 if (d == 0) {
1016 target = word;
1017 label = 1;
1018 } else {
1019 next_random = next_random
1020 * (unsigned long long) 25214903917 + 11;
1021 if (word_to_group != NULL
1022 && word_to_group[word] != -1) {
1023 target = word;
1024 while (target == word) {
1025 target =
1026 group_to_table[word_to_group[word]
1027 * table_size
1028 + (next_random >> 16)
1029 % table_size];
1030 next_random =
1031 next_random
1032 * (unsigned long long) 25214903917
1033 + 11;
1034 }
1035 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1036 } else {
1037 target = table[(next_random >> 16)
1038 % table_size];
1039 }
1040 if (target == 0)
1041 target = next_random % (vocab_size - 1) + 1;
1042 if (target == word)
1043 continue;
1044 label = 0;
1045 }
1046 l2 = target * layer1_size;
1047 f = 0;
1048 for (c = 0; c < layer1_size; c++)
1049 f += syn0[c + l1] * syn1nce[c + l2];
1050 if (f > MAX_EXP)
1051 g = (label - 1) * alpha;
1052 else if (f < -MAX_EXP)
1053 g = (label - 0) * alpha;
1054 else {
1055 f = exp(f);
1056 g = (label
1057 - f
1058 / (noise_distribution[target]
1059 * nce + f)) * alpha;
1060 }
1061 for (c = 0; c < layer1_size; c++)
1062 neu1e[c] += g * syn1nce[c + l2];
1063 for (c = 0; c < layer1_size; c++)
1064 syn1nce[c + l2] += g * syn0[c + l1];
1065 if (cap == 1)
1066 for (c = 0; c < layer1_size; c++)
1067 capParam(syn1nce, c + l2);
1068 }
1069 // Learn weights input -> hidden
1070 for (c = 0; c < layer1_size; c++)
1071 syn0[c + l1] += neu1e[c];
1072 }
1073 } else if (type == 2) { //train the cwindow architecture
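 // CWINDOW: the context embeddings are concatenated by relative position into neu1
 // (window_layer_size = 2 * window * layer1_size) instead of being averaged, and are
 // scored against per-word output vectors of that same size.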
1074 // in -> hidden
1075 cw = 0;
1076 for (a = 0; a < window * 2 + 1; a++)
1077 if (a != window) {
1078 c = sentence_position - window + a;
1079 if (c < 0)
1080 continue;
1081 if (c >= sentence_length)
1082 continue;
1083 last_word = sen[c];
1084 if (last_word == -1)
1085 continue;
1086 window_offset = a * layer1_size;
1087 if (a > window)
1088 window_offset -= layer1_size;
1089 for (c = 0; c < layer1_size; c++)
1090 neu1[c + window_offset] += syn0[c
1091 + last_word * layer1_size];
1092 cw++;
1093 }
1094 if (cw) {
1095 if (hs)
1096 for (d = 0; d < vocab[word].codelen; d++) {
1097 f = 0;
1098 l2 = vocab[word].point[d] * window_layer_size;
1099 // Propagate hidden -> output
1100 for (c = 0; c < window_layer_size; c++)
1101 f += neu1[c] * syn1_window[c + l2];
1102 if (f <= -MAX_EXP)
1103 continue;
1104 else if (f >= MAX_EXP)
1105 continue;
1106 else
1107 f = expTable[(int) ((f + MAX_EXP)
1108 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1109 // 'g' is the gradient multiplied by the learning rate
1110 g = (1 - vocab[word].code[d] - f) * alpha;
1111 // Propagate errors output -> hidden
1112 for (c = 0; c < window_layer_size; c++)
1113 neu1e[c] += g * syn1_window[c + l2];
1114 // Learn weights hidden -> output
1115 for (c = 0; c < window_layer_size; c++)
1116 syn1_window[c + l2] += g * neu1[c];
1117 if (cap == 1)
1118 for (c = 0; c < window_layer_size; c++)
1119 capParam(syn1_window, c + l2);
1120 }
1121 // NEGATIVE SAMPLING
1122 if (negative > 0)
1123 for (d = 0; d < negative + 1; d++) {
1124 if (d == 0) {
1125 target = word;
1126 label = 1;
1127 } else {
1128 next_random = next_random
1129 * (unsigned long long) 25214903917 + 11;
1130 if (word_to_group != NULL
1131 && word_to_group[word] != -1) {
1132 target = word;
1133 while (target == word) {
1134 target = group_to_table[word_to_group[word]
1135 * table_size
1136 + (next_random >> 16) % table_size];
1137 next_random = next_random
1138 * (unsigned long long) 25214903917
1139 + 11;
1140 }
1141 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1142 } else {
1143 target =
1144 table[(next_random >> 16) % table_size];
1145 }
1146 if (target == 0)
1147 target = next_random % (vocab_size - 1) + 1;
1148 if (target == word)
1149 continue;
1150 label = 0;
1151 }
1152 l2 = target * window_layer_size;
1153 f = 0;
1154 for (c = 0; c < window_layer_size; c++)
1155 f += neu1[c] * syn1neg_window[c + l2];
1156 if (f > MAX_EXP)
1157 g = (label - 1) * alpha;
1158 else if (f < -MAX_EXP)
1159 g = (label - 0) * alpha;
1160 else
1161 g = (label
1162 - expTable[(int) ((f + MAX_EXP)
1163 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1164 * alpha;
1165 for (c = 0; c < window_layer_size; c++)
1166 neu1e[c] += g * syn1neg_window[c + l2];
1167 for (c = 0; c < window_layer_size; c++)
1168 syn1neg_window[c + l2] += g * neu1[c];
1169 if (cap == 1)
1170 for (c = 0; c < window_layer_size; c++)
1171 capParam(syn1neg_window, c + l2);
1172 }
1173 // Noise Contrastive Estimation
1174 if (nce > 0)
1175 for (d = 0; d < nce + 1; d++) {
1176 if (d == 0) {
1177 target = word;
1178 label = 1;
1179 } else {
1180 next_random = next_random
1181 * (unsigned long long) 25214903917 + 11;
1182 if (word_to_group != NULL
1183 && word_to_group[word] != -1) {
1184 target = word;
1185 while (target == word) {
1186 target = group_to_table[word_to_group[word]
1187 * table_size
1188 + (next_random >> 16) % table_size];
1189 next_random = next_random
1190 * (unsigned long long) 25214903917
1191 + 11;
1192 }
1193 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1194 } else {
1195 target =
1196 table[(next_random >> 16) % table_size];
1197 }
1198 if (target == 0)
1199 target = next_random % (vocab_size - 1) + 1;
1200 if (target == word)
1201 continue;
1202 label = 0;
1203 }
1204 l2 = target * window_layer_size;
1205 f = 0;
1206 for (c = 0; c < window_layer_size; c++)
1207 f += neu1[c] * syn1nce_window[c + l2];
1208 if (f > MAX_EXP)
1209 g = (label - 1) * alpha;
1210 else if (f < -MAX_EXP)
1211 g = (label - 0) * alpha;
1212 else {
1213 f = exp(f);
1214 g =
1215 (label
1216 - f
1217 / (noise_distribution[target]
1218 * nce + f)) * alpha;
1219 }
1220 for (c = 0; c < window_layer_size; c++)
1221 neu1e[c] += g * syn1nce_window[c + l2];
1222 for (c = 0; c < window_layer_size; c++)
1223 syn1nce_window[c + l2] += g * neu1[c];
1224 if (cap == 1)
1225 for (c = 0; c < window_layer_size; c++)
1226 capParam(syn1nce_window, c + l2);
1227 }
1228 // hidden -> in
1229 for (a = 0; a < window * 2 + 1; a++)
1230 if (a != window) {
1231 c = sentence_position - window + a;
1232 if (c < 0)
1233 continue;
1234 if (c >= sentence_length)
1235 continue;
1236 last_word = sen[c];
1237 if (last_word == -1)
1238 continue;
1239 window_offset = a * layer1_size;
1240 if (a > window)
1241 window_offset -= layer1_size;
1242 for (c = 0; c < layer1_size; c++)
1243 syn0[c + last_word * layer1_size] += neu1e[c
1244 + window_offset];
1245 }
1246 }
1247 } else if (type == 3) { //train structured skip-gram
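 // Structured skip-gram: like skip-gram, but each relative context position owns its
 // own block of output weights; window_offset selects the block for the current position.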
1248 for (a = 0; a < window * 2 + 1; a++)
1249 if (a != window) {
1250 c = sentence_position - window + a;
1251 if (c < 0)
1252 continue;
1253 if (c >= sentence_length)
1254 continue;
1255 last_word = sen[c];
1256 if (last_word == -1)
1257 continue;
1258 l1 = last_word * layer1_size;
1259 window_offset = a * layer1_size;
1260 if (a > window)
1261 window_offset -= layer1_size;
1262 for (c = 0; c < layer1_size; c++)
1263 neu1e[c] = 0;
1264 // HIERARCHICAL SOFTMAX
1265 if (hs)
1266 for (d = 0; d < vocab[word].codelen; d++) {
1267 f = 0;
1268 l2 = vocab[word].point[d] * window_layer_size;
1269 // Propagate hidden -> output
1270 for (c = 0; c < layer1_size; c++)
1271 f += syn0[c + l1]
1272 * syn1_window[c + l2 + window_offset];
1273 if (f <= -MAX_EXP)
1274 continue;
1275 else if (f >= MAX_EXP)
1276 continue;
1277 else
1278 f = expTable[(int) ((f + MAX_EXP)
1279 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1280 // 'g' is the gradient multiplied by the learning rate
1281 g = (1 - vocab[word].code[d] - f) * alpha;
1282 // Propagate errors output -> hidden
1283 for (c = 0; c < layer1_size; c++)
1284 neu1e[c] += g
1285 * syn1_window[c + l2 + window_offset];
1286 // Learn weights hidden -> output
1287 for (c = 0; c < layer1_size; c++)
1288 syn1_window[c + l2 + window_offset] += g
1289 * syn0[c + l1];
1290 if (cap == 1)
1291 for (c = 0; c < layer1_size; c++)
1292 capParam(syn1_window, c + l2 + window_offset);
1293 }
1294 // NEGATIVE SAMPLING
1295 if (negative > 0)
1296 for (d = 0; d < negative + 1; d++) {
1297 if (d == 0) {
1298 target = word;
1299 label = 1;
1300 } else {
1301 next_random = next_random
1302 * (unsigned long long) 25214903917 + 11;
1303 if (word_to_group != NULL
1304 && word_to_group[word] != -1) {
1305 target = word;
1306 while (target == word) {
1307 target =
1308 group_to_table[word_to_group[word]
1309 * table_size
1310 + (next_random >> 16)
1311 % table_size];
1312 next_random =
1313 next_random
1314 * (unsigned long long) 25214903917
1315 + 11;
1316 }
1317 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1318 } else {
1319 target = table[(next_random >> 16)
1320 % table_size];
1321 }
1322 if (target == 0)
1323 target = next_random % (vocab_size - 1) + 1;
1324 if (target == word)
1325 continue;
1326 label = 0;
1327 }
1328 l2 = target * window_layer_size;
1329 f = 0;
1330 for (c = 0; c < layer1_size; c++)
1331 f +=
1332 syn0[c + l1]
1333 * syn1neg_window[c + l2
1334 + window_offset];
1335 if (f > MAX_EXP)
1336 g = (label - 1) * alpha;
1337 else if (f < -MAX_EXP)
1338 g = (label - 0) * alpha;
1339 else
1340 g =
1341 (label
1342 - expTable[(int) ((f + MAX_EXP)
1343 * (EXP_TABLE_SIZE
1344 / MAX_EXP / 2))])
1345 * alpha;
1346 for (c = 0; c < layer1_size; c++)
1347 neu1e[c] +=
1348 g
1349 * syn1neg_window[c + l2
1350 + window_offset];
1351 for (c = 0; c < layer1_size; c++)
1352 syn1neg_window[c + l2 + window_offset] += g
1353 * syn0[c + l1];
1354 if (cap == 1)
1355 for (c = 0; c < layer1_size; c++)
1356 capParam(syn1neg_window,
1357 c + l2 + window_offset);
1358 }
1359 // Noise Contrastive Estimation
1360 if (nce > 0)
1361 for (d = 0; d < nce + 1; d++) {
1362 if (d == 0) {
1363 target = word;
1364 label = 1;
1365 } else {
1366 next_random = next_random
1367 * (unsigned long long) 25214903917 + 11;
1368 if (word_to_group != NULL
1369 && word_to_group[word] != -1) {
1370 target = word;
1371 while (target == word) {
1372 target =
1373 group_to_table[word_to_group[word]
1374 * table_size
1375 + (next_random >> 16)
1376 % table_size];
1377 next_random =
1378 next_random
1379 * (unsigned long long) 25214903917
1380 + 11;
1381 }
1382 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1383 } else {
1384 target = table[(next_random >> 16)
1385 % table_size];
1386 }
1387 if (target == 0)
1388 target = next_random % (vocab_size - 1) + 1;
1389 if (target == word)
1390 continue;
1391 label = 0;
1392 }
1393 l2 = target * window_layer_size;
1394 f = 0;
1395 for (c = 0; c < layer1_size; c++)
1396 f +=
1397 syn0[c + l1]
1398 * syn1nce_window[c + l2
1399 + window_offset];
1400 if (f > MAX_EXP)
1401 g = (label - 1) * alpha;
1402 else if (f < -MAX_EXP)
1403 g = (label - 0) * alpha;
1404 else {
1405 f = exp(f);
1406 g = (label
1407 - f
1408 / (noise_distribution[target]
1409 * nce + f)) * alpha;
1410 }
1411 for (c = 0; c < layer1_size; c++)
1412 neu1e[c] +=
1413 g
1414 * syn1nce_window[c + l2
1415 + window_offset];
1416 for (c = 0; c < layer1_size; c++)
1417 syn1nce_window[c + l2 + window_offset] += g
1418 * syn0[c + l1];
1419 if (cap == 1)
1420 for (c = 0; c < layer1_size; c++)
1421 capParam(syn1nce_window,
1422 c + l2 + window_offset);
1423 }
1424 // Learn weights input -> hidden
1425 for (c = 0; c < layer1_size; c++) {
1426 syn0[c + l1] += neu1e[c];
1427 if (syn0[c + l1] > 50)
1428 syn0[c + l1] = 50;
1429 if (syn0[c + l1] < -50)
1430 syn0[c + l1] = -50;
1431 }
1432 }
1433 } else if (type == 4) { // train the senna-style architecture
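 // SENNA-style: the concatenated context (neu1) is projected through a hidden layer
 // (syn_window_hidden) with a hard-tanh nonlinearity into neu2 before being scored
 // against per-word output vectors.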
1434 // in -> hidden
1435 cw = 0;
1436 for (a = 0; a < window * 2 + 1; a++)
1437 if (a != window) {
1438 c = sentence_position - window + a;
1439 if (c < 0)
1440 continue;
1441 if (c >= sentence_length)
1442 continue;
1443 last_word = sen[c];
1444 if (last_word == -1)
1445 continue;
1446 window_offset = a * layer1_size;
1447 if (a > window)
1448 window_offset -= layer1_size;
1449 for (c = 0; c < layer1_size; c++)
1450 neu1[c + window_offset] += syn0[c
1451 + last_word * layer1_size];
1452 cw++;
1453 }
1454 if (cw) {
1455 for (a = 0; a < window_hidden_size; a++) {
1456 c = a * window_layer_size;
1457 for (b = 0; b < window_layer_size; b++) {
1458 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1459 }
1460 }
1461 if (hs)
1462 for (d = 0; d < vocab[word].codelen; d++) {
1463 f = 0;
1464 l2 = vocab[word].point[d] * window_hidden_size;
1465 // Propagate hidden -> output
1466 for (c = 0; c < window_hidden_size; c++)
1467 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1468 if (f <= -MAX_EXP)
1469 continue;
1470 else if (f >= MAX_EXP)
1471 continue;
1472 else
1473 f = expTable[(int) ((f + MAX_EXP)
1474 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1475 // 'g' is the gradient multiplied by the learning rate
1476 g = (1 - vocab[word].code[d] - f) * alpha;
1477 // Propagate errors output -> hidden
1478 for (c = 0; c < window_hidden_size; c++)
1479 neu2e[c] += dHardTanh(neu2[c], g) * g
1480 * syn_hidden_word[c + l2];
1481 // Learn weights hidden -> output
1482 for (c = 0; c < window_hidden_size; c++)
1483 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1484 * neu2[c];
1485 }
1486 // NEGATIVE SAMPLING
1487 if (negative > 0)
1488 for (d = 0; d < negative + 1; d++) {
1489 if (d == 0) {
1490 target = word;
1491 label = 1;
1492 } else {
1493 next_random = next_random
1494 * (unsigned long long) 25214903917 + 11;
1495 if (word_to_group != NULL
1496 && word_to_group[word] != -1) {
1497 target = word;
1498 while (target == word) {
1499 target = group_to_table[word_to_group[word]
1500 * table_size
1501 + (next_random >> 16) % table_size];
1502 next_random = next_random
1503 * (unsigned long long) 25214903917
1504 + 11;
1505 }
1506 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1507 } else {
1508 target =
1509 table[(next_random >> 16) % table_size];
1510 }
1511 if (target == 0)
1512 target = next_random % (vocab_size - 1) + 1;
1513 if (target == word)
1514 continue;
1515 label = 0;
1516 }
1517 l2 = target * window_hidden_size;
1518 f = 0;
1519 for (c = 0; c < window_hidden_size; c++)
1520 f += hardTanh(neu2[c])
1521 * syn_hidden_word_neg[c + l2];
1522 if (f > MAX_EXP)
1523 g = (label - 1) * alpha / negative;
1524 else if (f < -MAX_EXP)
1525 g = (label - 0) * alpha / negative;
1526 else
1527 g = (label
1528 - expTable[(int) ((f + MAX_EXP)
1529 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1530 * alpha / negative;
1531 for (c = 0; c < window_hidden_size; c++)
1532 neu2e[c] += dHardTanh(neu2[c], g) * g
1533 * syn_hidden_word_neg[c + l2];
1534 for (c = 0; c < window_hidden_size; c++)
1535 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1536 * g * neu2[c];
1537 }
1538 for (a = 0; a < window_hidden_size; a++)
1539 for (b = 0; b < window_layer_size; b++)
1540 neu1e[b] += neu2e[a]
1541 * syn_window_hidden[a * window_layer_size + b];
1542 for (a = 0; a < window_hidden_size; a++)
1543 for (b = 0; b < window_layer_size; b++)
1544 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1545 * neu1[b];
1546 // hidden -> in
1547 for (a = 0; a < window * 2 + 1; a++)
1548 if (a != window) {
1549 c = sentence_position - window + a;
1550 if (c < 0)
1551 continue;
1552 if (c >= sentence_length)
1553 continue;
1554 last_word = sen[c];
1555 if (last_word == -1)
1556 continue;
1557 window_offset = a * layer1_size;
1558 if (a > window)
1559 window_offset -= layer1_size;
1560 for (c = 0; c < layer1_size; c++)
1561 syn0[c + last_word * layer1_size] += neu1e[c
1562 + window_offset];
1563 }
1564 }
1565 } else {
1566 printf("unknown type %i", type);
1567 exit(0);
1568 }
1569 sentence_position++;
1570 if (sentence_position >= sentence_length) {
1571 sentence_length = 0;
1572 continue;
1573 }
1574 }
1575 fclose(fi);
1576 free(neu1);
1577 free(neu1e);
 free(neu2);
 free(neu2e);
1578 pthread_exit(NULL);
1579}
1580
1581void TrainModel() {
1582 long a, b, c, d;
1583 FILE *fo;
1584 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1585 printf("Starting training using file %s\n", train_file);
1586 starting_alpha = alpha;
1587 if (read_vocab_file[0] != 0)
1588 ReadVocab();
1589 else
1590 LearnVocabFromTrainFile();
1591 if (save_vocab_file[0] != 0)
1592 SaveVocab();
1593 if (output_file[0] == 0)
1594 return;
1595 InitNet();
1596 if (negative > 0 || nce > 0)
1597 InitUnigramTable();
1598 if (negative_classes_file[0] != 0)
1599 InitClassUnigramTable();
1600 start = clock();
1601 for (a = 0; a < num_threads; a++)
1602 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1603 for (a = 0; a < num_threads; a++)
1604 pthread_join(pt[a], NULL);
1605 fo = fopen(output_file, "wb");
1606 if (classes == 0) {
1607 // Save the word vectors
1608 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1609 for (a = 0; a < vocab_size; a++) {
1610 fprintf(fo, "%s ", vocab[a].word);
1611 if (binary)
1612 for (b = 0; b < layer1_size; b++)
1613 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1614 else
1615 for (b = 0; b < layer1_size; b++)
1616 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1617 fprintf(fo, "\n");
1618 }
1619 } else {
1620 // Run K-means on the word vectors
1621 int clcn = classes, iter = 10, closeid;
1622 int *centcn = (int *) malloc(classes * sizeof(int));
1623 int *cl = (int *) calloc(vocab_size, sizeof(int));
1624 real closev, x;
1625 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1626 for (a = 0; a < vocab_size; a++)
1627 cl[a] = a % clcn;
1628 for (a = 0; a < iter; a++) {
1629 for (b = 0; b < clcn * layer1_size; b++)
1630 cent[b] = 0;
1631 for (b = 0; b < clcn; b++)
1632 centcn[b] = 1;
1633 for (c = 0; c < vocab_size; c++) {
1634 for (d = 0; d < layer1_size; d++)
1635 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1636 centcn[cl[c]]++;
1637 }
1638 for (b = 0; b < clcn; b++) {
1639 closev = 0;
1640 for (c = 0; c < layer1_size; c++) {
1641 cent[layer1_size * b + c] /= centcn[b];
1642 closev += cent[layer1_size * b + c]
1643 * cent[layer1_size * b + c];
1644 }
1645 closev = sqrt(closev);
1646 for (c = 0; c < layer1_size; c++)
1647 cent[layer1_size * b + c] /= closev;
1648 }
1649 for (c = 0; c < vocab_size; c++) {
1650 closev = -10;
1651 closeid = 0;
1652 for (d = 0; d < clcn; d++) {
1653 x = 0;
1654 for (b = 0; b < layer1_size; b++)
1655 x += cent[layer1_size * d + b]
1656 * syn0[c * layer1_size + b];
1657 if (x > closev) {
1658 closev = x;
1659 closeid = d;
1660 }
1661 }
1662 cl[c] = closeid;
1663 }
1664 }
1665 // Save the K-means classes
1666 for (a = 0; a < vocab_size; a++)
1667 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1668 free(centcn);
1669 free(cent);
1670 free(cl);
1671 }
1672 fclose(fo);
1673 if (save_net_file[0] != 0)
1674 SaveNet();
1675}
1676
1677int ArgPos(char *str, int argc, char **argv) {
1678 int a;
1679 for (a = 1; a < argc; a++)
1680 if (!strcmp(str, argv[a])) {
1681 if (a == argc - 1) {
1682 printf("Argument missing for %s\n", str);
1683 exit(1);
1684 }
1685 return a;
1686 }
1687 return -1;
1688}
1689
1690int main(int argc, char **argv) {
1691 int i;
1692 if (argc == 1) {
1693 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1694 printf("Options:\n");
1695 printf("Parameters for training:\n");
1696 printf("\t-train <file>\n");
1697 printf("\t\tUse text data from <file> to train the model\n");
1698 printf("\t-output <file>\n");
1699 printf(
1700 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1701 printf("\t-size <int>\n");
1702 printf("\t\tSet size of word vectors; default is 100\n");
1703 printf("\t-window <int>\n");
1704 printf("\t\tSet max skip length between words; default is 5\n");
1705 printf("\t-sample <float>\n");
1706 printf(
1707 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1708 printf(
1709 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1710 printf("\t-hs <int>\n");
1711 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1712 printf("\t-negative <int>\n");
1713 printf(
1714 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1715 printf("\t-negative-classes <file>\n");
1716 printf("\t\tNegative classes to sample from\n");
1717 printf("\t-nce <int>\n");
1718 printf(
1719 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1720 printf("\t-threads <int>\n");
1721 printf("\t\tUse <int> threads (default 12)\n");
1722 printf("\t-iter <int>\n");
1723 printf("\t\tRun more training iterations (default 5)\n");
1724 printf("\t-min-count <int>\n");
1725 printf(
1726 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1727 printf("\t-alpha <float>\n");
1728 printf(
1729 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1730 printf("\t-classes <int>\n");
1731 printf(
1732 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1733 printf("\t-debug <int>\n");
1734 printf(
1735 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1736 printf("\t-binary <int>\n");
1737 printf(
1738 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1739 printf("\t-save-vocab <file>\n");
1740 printf("\t\tThe vocabulary will be saved to <file>\n");
1741 printf("\t-read-vocab <file>\n");
1742 printf(
1743 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1744 printf("\t-read-net <file>\n");
1745 printf(
1746 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1747 printf("\t-save-net <file>\n");
1748 printf("\t\tThe net parameters will be saved to <file>\n");
1749 printf("\t-type <int>\n");
1750 printf(
1751 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1752 printf("\t-cap <int>\n");
1753 printf(
1754 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1755 printf("\nExamples:\n");
1756 printf(
1757 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1758 return 0;
1759 }
1760 output_file[0] = 0;
1761 save_vocab_file[0] = 0;
1762 read_vocab_file[0] = 0;
1763 save_net_file[0] = 0;
1764 read_net_file[0] = 0;
1765 negative_classes_file[0] = 0;
1766 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1767 layer1_size = atoi(argv[i + 1]);
1768 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1769 strcpy(train_file, argv[i + 1]);
1770 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1771 strcpy(save_vocab_file, argv[i + 1]);
1772 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1773 strcpy(read_vocab_file, argv[i + 1]);
1774 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1775 strcpy(save_net_file, argv[i + 1]);
1776 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1777 strcpy(read_net_file, argv[i + 1]);
1778 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1779 debug_mode = atoi(argv[i + 1]);
1780 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1781 binary = atoi(argv[i + 1]);
1782 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1783 type = atoi(argv[i + 1]);
1784 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1785 strcpy(output_file, argv[i + 1]);
1786 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1787 window = atoi(argv[i + 1]);
1788 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1789 sample = atof(argv[i + 1]);
1790 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1791 hs = atoi(argv[i + 1]);
1792 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1793 negative = atoi(argv[i + 1]);
1794 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1795 strcpy(negative_classes_file, argv[i + 1]);
1796 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1797 nce = atoi(argv[i + 1]);
1798 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1799 num_threads = atoi(argv[i + 1]);
1800 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1801 iter = atoi(argv[i + 1]);
1802 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1803 min_count = atoi(argv[i + 1]);
1804 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1805 classes = atoi(argv[i + 1]);
1806 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1807 cap = atoi(argv[i + 1]);
1808 if (type == 0 || type == 2 || type == 4)
1809 alpha = 0.05;
1810 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1811 alpha = atof(argv[i + 1]);
1812 vocab = (struct vocab_word *) calloc(vocab_max_size,
1813 sizeof(struct vocab_word));
1814 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1815 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1816 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1817 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1818 expTable[i] = expTable[i] / (expTable[i] + 1); // i.e. the sigmoid exp(x) / (exp(x) + 1)
1819 }
1820 TrainModel();
1821 return 0;
1822}
1823