1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CC 100
26#define MAX_CODE_LENGTH 40
27
28const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
29
30typedef float real; // Precision of float numbers
31
32struct vocab_word {
33 long long cn;
34 int *point;
35 char *word, *code, codelen;
36};
37
38char train_file[MAX_STRING], output_file[MAX_STRING];
39char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
40char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
41struct vocab_word *vocab;
42int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
43 num_threads = 12, min_reduce = 1;
44int *vocab_hash;
45long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
46long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
47 classes = 0;
48real alpha = 0.025, starting_alpha, sample = 1e-3;
49real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
50clock_t start;
51
52real *syn1_window, *syn1neg_window, *syn1nce_window;
53int w_offset, window_layer_size;
54
55int window_hidden_size = 500;
56real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
57 *syn_hidden_word_nce;
58
59int hs = 0, negative = 5;
60const int table_size = 1e8;
61int *table;
62
63long cc = 0;
64
65// contrastive negative sampling
66char negative_classes_file[MAX_STRING];
67int *word_to_group;
68int *group_to_table; //group_size*table_size
69int class_number;
70
71//nce
72real* noise_distribution;
73int nce = 0;
74
75//param caps
76real CAP_VALUE = 50;
77int cap = 0;
78
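// Clamps a single parameter to the range [-CAP_VALUE, CAP_VALUE]; used when -cap 1 is set.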
79void capParam(real* array, int index) {
80 if (array[index] > CAP_VALUE)
81 array[index] = CAP_VALUE;
82 else if (array[index] < -CAP_VALUE)
83 array[index] = -CAP_VALUE;
84}
85
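// Hard tanh activation: clamps x to the range [-1, 1].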
86real hardTanh(real x) {
87 if (x >= 1) {
88 return 1;
89 } else if (x <= -1) {
90 return -1;
91 } else {
92 return x;
93 }
94}
95
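// Gradient gate for the hard tanh: returns 0 when the unit is saturated and the
// gradient g would push it further outside [-1, 1], otherwise 1.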
96real dHardTanh(real x, real g) {
97 if (x > 1 && g > 0) {
98 return 0;
99 }
100 if (x < -1 && g < 0) {
101 return 0;
102 }
103 return 1;
104}
105
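// Builds the unigram table used for negative sampling: each word occupies a share of the
// table proportional to its count raised to the power 0.75. Also fills noise_distribution for NCE.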
106void InitUnigramTable() {
107 int a, i;
108 long long train_words_pow = 0;
109 real d1, power = 0.75;
110 table = (int *) malloc(table_size * sizeof(int));
111 for (a = 0; a < vocab_size; a++)
112 train_words_pow += pow(vocab[a].cn, power);
113 i = 0;
114 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
115 for (a = 0; a < table_size; a++) {
116 table[a] = i;
117 if (a / (real) table_size > d1) {
118 i++;
119 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
120 }
121 if (i >= vocab_size)
122 i = vocab_size - 1;
123 }
124
125 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
126 for (a = 0; a < vocab_size; a++)
127 noise_distribution[a] = pow(vocab[a].cn, power)
128 / (real) train_words_pow;
129}
130
131// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
132void ReadWord(char *word, FILE *fin) {
133 int a = 0, ch;
134 while (!feof(fin)) {
135 ch = fgetc(fin);
136 if (ch == 13)
137 continue;
138 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
139 if (a > 0) {
140 if (ch == '\n')
141 ungetc(ch, fin);
142 break;
143 }
144 if (ch == '\n') {
145 strcpy(word, (char *) "</s>");
146 return;
147 } else
148 continue;
149 }
150 word[a] = ch;
151 a++;
152 if (a >= MAX_STRING - 1)
153 a--; // Truncate words that are too long
154 }
155 word[a] = 0;
156}
157
158// Returns hash value of a word
159int GetWordHash(char *word) {
160 unsigned long long a, hash = 0;
161 for (a = 0; a < strlen(word); a++)
162 hash = hash * 257 + word[a];
163 hash = hash % vocab_hash_size;
164 return hash;
165}
166
167// Returns position of a word in the vocabulary; if the word is not found, returns -1
168int SearchVocab(char *word) {
169 unsigned int hash = GetWordHash(word);
170 while (1) {
171 if (vocab_hash[hash] == -1)
172 return -1;
173 if (!strcmp(word, vocab[vocab_hash[hash]].word))
174 return vocab_hash[hash];
175 hash = (hash + 1) % vocab_hash_size;
176 }
177 return -1;
178}
179
180// Reads a word and returns its index in the vocabulary
181int ReadWordIndex(FILE *fin) {
182 char word[MAX_STRING];
183 ReadWord(word, fin);
184 if (feof(fin))
185 return -1;
186 return SearchVocab(word);
187}
188
189// Adds a word to the vocabulary
190int AddWordToVocab(char *word) {
191 unsigned int hash, length = strlen(word) + 1;
192 if (length > MAX_STRING)
193 length = MAX_STRING;
194 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
195 strcpy(vocab[vocab_size].word, word);
196 vocab[vocab_size].cn = 0;
197 vocab_size++;
198 // Reallocate memory if needed
199 if (vocab_size + 2 >= vocab_max_size) {
200 vocab_max_size += 1000;
201 vocab = (struct vocab_word *) realloc(vocab,
202 vocab_max_size * sizeof(struct vocab_word));
203 }
204 hash = GetWordHash(word);
205 while (vocab_hash[hash] != -1)
206 hash = (hash + 1) % vocab_hash_size;
207 vocab_hash[hash] = vocab_size - 1;
208 return vocab_size - 1;
209}
210
211// Used later for sorting by word counts
212int VocabCompare(const void *a, const void *b) {
213 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
214}
215
216// Sorts the vocabulary by frequency using word counts
217void SortVocab() {
218 int a, size;
219 unsigned int hash;
220 // Sort the vocabulary and keep </s> at the first position
221 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
222 for (a = 0; a < vocab_hash_size; a++)
223 vocab_hash[a] = -1;
224 size = vocab_size;
225 train_words = 0;
226 for (a = 0; a < size; a++) {
227 // Words occurring less than min_count times will be discarded from the vocab
228 if ((vocab[a].cn < min_count) && (a != 0)) {
229 vocab_size--;
230 free(vocab[a].word);
231 } else {
232 // Hash will be re-computed, as it is no longer valid after sorting
233 hash = GetWordHash(vocab[a].word);
234 while (vocab_hash[hash] != -1)
235 hash = (hash + 1) % vocab_hash_size;
236 vocab_hash[hash] = a;
237 train_words += vocab[a].cn;
238 }
239 }
240 vocab = (struct vocab_word *) realloc(vocab,
241 (vocab_size + 1) * sizeof(struct vocab_word));
242 // Allocate memory for the binary tree construction
243 for (a = 0; a < vocab_size; a++) {
244 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
245 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
246 }
247}
248
249// Reduces the vocabulary by removing infrequent tokens
250void ReduceVocab() {
251 int a, b = 0;
252 unsigned int hash;
253 for (a = 0; a < vocab_size; a++)
254 if (vocab[a].cn > min_reduce) {
255 vocab[b].cn = vocab[a].cn;
256 vocab[b].word = vocab[a].word;
257 b++;
258 } else
259 free(vocab[a].word);
260 vocab_size = b;
261 for (a = 0; a < vocab_hash_size; a++)
262 vocab_hash[a] = -1;
263 for (a = 0; a < vocab_size; a++) {
264 // Hash will be re-computed, as it is no longer valid
265 hash = GetWordHash(vocab[a].word);
266 while (vocab_hash[hash] != -1)
267 hash = (hash + 1) % vocab_hash_size;
268 vocab_hash[hash] = a;
269 }
270 fflush(stdout);
271 min_reduce++;
272}
273
274// Create binary Huffman tree using the word counts
275// Frequent words will have short unique binary codes
276void CreateBinaryTree() {
277 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
278 char code[MAX_CODE_LENGTH];
279 long long *count = (long long *) calloc(vocab_size * 2 + 1,
280 sizeof(long long));
281 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
282 sizeof(long long));
283 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
284 sizeof(long long));
285 for (a = 0; a < vocab_size; a++)
286 count[a] = vocab[a].cn;
287 for (a = vocab_size; a < vocab_size * 2; a++)
288 count[a] = 1e15;
289 pos1 = vocab_size - 1;
290 pos2 = vocab_size;
291 // The following algorithm constructs the Huffman tree by adding one node at a time
292 for (a = 0; a < vocab_size - 1; a++) {
293 // First, find two smallest nodes 'min1, min2'
294 if (pos1 >= 0) {
295 if (count[pos1] < count[pos2]) {
296 min1i = pos1;
297 pos1--;
298 } else {
299 min1i = pos2;
300 pos2++;
301 }
302 } else {
303 min1i = pos2;
304 pos2++;
305 }
306 if (pos1 >= 0) {
307 if (count[pos1] < count[pos2]) {
308 min2i = pos1;
309 pos1--;
310 } else {
311 min2i = pos2;
312 pos2++;
313 }
314 } else {
315 min2i = pos2;
316 pos2++;
317 }
318 count[vocab_size + a] = count[min1i] + count[min2i];
319 parent_node[min1i] = vocab_size + a;
320 parent_node[min2i] = vocab_size + a;
321 binary[min2i] = 1;
322 }
323 // Now assign binary code to each vocabulary word
324 for (a = 0; a < vocab_size; a++) {
325 b = a;
326 i = 0;
327 while (1) {
328 code[i] = binary[b];
329 point[i] = b;
330 i++;
331 b = parent_node[b];
332 if (b == vocab_size * 2 - 2)
333 break;
334 }
335 vocab[a].codelen = i;
336 vocab[a].point[0] = vocab_size - 2;
337 for (b = 0; b < i; b++) {
338 vocab[a].code[i - b - 1] = code[b];
339 vocab[a].point[i - b] = point[b] - vocab_size;
340 }
341 }
342 free(count);
343 free(binary);
344 free(parent_node);
345}
346
347void LearnVocabFromTrainFile() {
348 char word[MAX_STRING];
349 FILE *fin;
350 long long a, i;
351 for (a = 0; a < vocab_hash_size; a++)
352 vocab_hash[a] = -1;
353 fin = fopen(train_file, "rb");
354 if (fin == NULL) {
355 printf("ERROR: training data file not found!\n");
356 exit(1);
357 }
358 vocab_size = 0;
359 AddWordToVocab((char *) "</s>");
360 while (1) {
361 ReadWord(word, fin);
362 if (feof(fin))
363 break;
364 train_words++;
365 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
366 printf("%lldK%c", train_words / 1000, 13);
367 fflush(stdout);
368 }
369 i = SearchVocab(word);
370 if (i == -1) {
371 a = AddWordToVocab(word);
372 vocab[a].cn = 1;
373 } else
374 vocab[i].cn++;
375 if (vocab_size > vocab_hash_size * 0.7)
376 ReduceVocab();
377 }
378 SortVocab();
379 if (debug_mode > 0) {
380 printf("Vocab size: %lld\n", vocab_size);
381 printf("Words in train file: %lld\n", train_words);
382 }
383 file_size = ftell(fin);
384 fclose(fin);
385}
386
387void SaveVocab() {
388 long long i;
389 FILE *fo = fopen(save_vocab_file, "wb");
390 for (i = 0; i < vocab_size; i++)
391 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
392 fclose(fo);
393}
394
395void ReadVocab() {
396 long long a, i = 0;
397 char c;
398 char word[MAX_STRING];
399 FILE *fin = fopen(read_vocab_file, "rb");
400 if (fin == NULL) {
401 printf("Vocabulary file not found\n");
402 exit(1);
403 }
404 for (a = 0; a < vocab_hash_size; a++)
405 vocab_hash[a] = -1;
406 vocab_size = 0;
407 while (1) {
408 ReadWord(word, fin);
409 if (feof(fin))
410 break;
411 a = AddWordToVocab(word);
412 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
413 i++;
414 }
415 SortVocab();
416 if (debug_mode > 0) {
417 printf("Vocab size: %lld\n", vocab_size);
418 printf("Words in train file: %lld\n", train_words);
419 }
420 fin = fopen(train_file, "rb");
421 if (fin == NULL) {
422 printf("ERROR: training data file not found!\n");
423 exit(1);
424 }
425 fseek(fin, 0, SEEK_END);
426 file_size = ftell(fin);
427 fclose(fin);
428}
429
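// Reads the -negative-classes file, maps vocabulary words to their classes, and builds one
// count^0.75 sampling table per class (group_to_table) so that negative samples can be drawn
// from the same class as the current word.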
430void InitClassUnigramTable() {
431 long long a, c;
432 printf("loading class unigrams \n");
433 FILE *fin = fopen(negative_classes_file, "rb");
434 if (fin == NULL) {
435 printf("ERROR: class file not found!\n");
436 exit(1);
437 }
438 word_to_group = (int *) malloc(vocab_size * sizeof(int));
439 for (a = 0; a < vocab_size; a++)
440 word_to_group[a] = -1;
441 char class[MAX_STRING];
442 char prev_class[MAX_STRING];
443 prev_class[0] = 0;
444 char word[MAX_STRING];
445 class_number = -1;
446 while (1) {
447 if (feof(fin))
448 break;
449 ReadWord(class, fin);
450 ReadWord(word, fin);
451 int word_index = SearchVocab(word);
452 if (word_index != -1) {
453 if (strcmp(class, prev_class) != 0) {
454 class_number++;
455 strcpy(prev_class, class);
456 }
457 word_to_group[word_index] = class_number;
458 }
459 ReadWord(word, fin);
460 }
461 class_number++;
462 fclose(fin);
463
464 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
465 long long train_words_pow = 0;
466 real d1, power = 0.75;
467
468 for (c = 0; c < class_number; c++) {
469 long long offset = c * table_size;
470 train_words_pow = 0;
471 for (a = 0; a < vocab_size; a++)
472 if (word_to_group[a] == c)
473 train_words_pow += pow(vocab[a].cn, power);
474 int i = 0;
475 while (i < vocab_size && word_to_group[i] != c)
476 i++;
477 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
478 for (a = 0; a < table_size; a++) {
479 //printf("index %lld , word %d\n", a, i);
480 group_to_table[offset + a] = i;
481 if (a / (real) table_size > d1) {
482 i++;
483 while (i < vocab_size && word_to_group[i] != c)
484 i++;
485 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
486 }
487 if (i >= vocab_size)
488 while (i >= vocab_size || (i >= 0 && word_to_group[i] != c))
489 i--;
490 }
491 }
492}
493
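// Writes the command-line arguments of this run to <output_file>.args.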
494void SaveArgs(int argc, char **argv) {
495 unsigned int i;
496 size_t len = 0;
497 char *_all_args, *all_args;
498 char *args_file = (char *) malloc(strlen(output_file) + strlen(".args") + 1);
499 strcpy(args_file, output_file); strcat(args_file, ".args");
500 FILE *fargs = fopen(args_file, "w");
501 if (fargs == NULL) {
502 printf("Cannot save args to %s.\n", args_file);
503 return;
504 }
505
506 for(i=1; i<argc; i++) {
507 len += strlen(argv[i]);
508 }
509
510 _all_args = all_args = (char *)malloc(len+argc-1);
511
512 for(i=1; i<argc; i++) {
513 memcpy(_all_args, argv[i], strlen(argv[i]));
514 _all_args += strlen(argv[i])+1;
515 *(_all_args-1) = ' ';
516 }
517 *(_all_args-1) = 0;
518
519 fprintf(fargs, "%s\n", all_args);
520 fclose(fargs);
521
522 free(all_args);
523
524 return;
525}
526
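// Writes the input embeddings (syn0) and the per-position output weights (syn1neg_window)
// to -save-net; only supported for type 3 with negative sampling.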
527void SaveNet() {
528 if(type != 3 || negative <= 0) {
529 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
530 return;
531 }
532
533 FILE *fnet = fopen(save_net_file, "wb");
534 if (fnet == NULL) {
535 printf("Net parameter file could not be opened for writing\n");
536 exit(1);
537 }
538 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
539 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
540 fclose(fnet);
541}
542
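// Allocates and initializes the weight matrices for the selected architecture. If -read-net is
// given (type 3 with negative sampling only), syn0 and syn1neg_window are read from that file
// instead of being initialized randomly.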
543void InitNet() {
544 long long a, b;
545 unsigned long long next_random = 1;
546 long long read;
547
548 window_layer_size = layer1_size * window * 2;
549 a = posix_memalign((void **) &syn0, 128,
550 (long long) vocab_size * layer1_size * sizeof(real));
551 if (syn0 == NULL) {
552 printf("Memory allocation failed\n");
553 exit(1);
554 }
555
556 if (hs) {
557 a = posix_memalign((void **) &syn1, 128,
558 (long long) vocab_size * layer1_size * sizeof(real));
559 if (syn1 == NULL) {
560 printf("Memory allocation failed\n");
561 exit(1);
562 }
563 a = posix_memalign((void **) &syn1_window, 128,
564 (long long) vocab_size * window_layer_size * sizeof(real));
565 if (syn1_window == NULL) {
566 printf("Memory allocation failed\n");
567 exit(1);
568 }
569 a = posix_memalign((void **) &syn_hidden_word, 128,
570 (long long) vocab_size * window_hidden_size * sizeof(real));
571 if (syn_hidden_word == NULL) {
572 printf("Memory allocation failed\n");
573 exit(1);
574 }
575
576 for (a = 0; a < vocab_size; a++)
577 for (b = 0; b < layer1_size; b++)
578 syn1[a * layer1_size + b] = 0;
579 for (a = 0; a < vocab_size; a++)
580 for (b = 0; b < window_layer_size; b++)
581 syn1_window[a * window_layer_size + b] = 0;
582 for (a = 0; a < vocab_size; a++)
583 for (b = 0; b < window_hidden_size; b++)
584 syn_hidden_word[a * window_hidden_size + b] = 0;
585 }
586 if (negative > 0) {
587 if(type == 0) {
588 a = posix_memalign((void **) &syn1neg, 128,
589 (long long) vocab_size * layer1_size * sizeof(real));
590 if (syn1neg == NULL) {
591 printf("Memory allocation failed\n");
592 exit(1);
593 }
594 for (a = 0; a < vocab_size; a++)
595 for (b = 0; b < layer1_size; b++)
596 syn1neg[a * layer1_size + b] = 0;
597 } else if (type == 3) {
598 a = posix_memalign((void **) &syn1neg_window, 128,
599 (long long) vocab_size * window_layer_size * sizeof(real));
600 if (syn1neg_window == NULL) {
601 printf("Memory allocation failed\n");
602 exit(1);
603 }
604 for (a = 0; a < vocab_size; a++)
605 for (b = 0; b < window_layer_size; b++)
606 syn1neg_window[a * window_layer_size + b] = 0;
607 } else if (type == 4) {
608 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
609 (long long) vocab_size * window_hidden_size * sizeof(real));
610 if (syn_hidden_word_neg == NULL) {
611 printf("Memory allocation failed\n");
612 exit(1);
613 }
614 for (a = 0; a < vocab_size; a++)
615 for (b = 0; b < window_hidden_size; b++)
616 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
617 }
618 }
619 if (nce > 0) {
620 a = posix_memalign((void **) &syn1nce, 128,
621 (long long) vocab_size * layer1_size * sizeof(real));
622 if (syn1nce == NULL) {
623 printf("Memory allocation failed\n");
624 exit(1);
625 }
626 a = posix_memalign((void **) &syn1nce_window, 128,
627 (long long) vocab_size * window_layer_size * sizeof(real));
628 if (syn1nce_window == NULL) {
629 printf("Memory allocation failed\n");
630 exit(1);
631 }
632 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
633 (long long) vocab_size * window_hidden_size * sizeof(real));
634 if (syn_hidden_word_nce == NULL) {
635 printf("Memory allocation failed\n");
636 exit(1);
637 }
638
639 for (a = 0; a < vocab_size; a++)
640 for (b = 0; b < layer1_size; b++)
641 syn1nce[a * layer1_size + b] = 0;
642 for (a = 0; a < vocab_size; a++)
643 for (b = 0; b < window_layer_size; b++)
644 syn1nce_window[a * window_layer_size + b] = 0;
645 for (a = 0; a < vocab_size; a++)
646 for (b = 0; b < window_hidden_size; b++)
647 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
648 }
649
650 if(type == 4) {
651 a = posix_memalign((void **) &syn_window_hidden, 128,
652 window_hidden_size * window_layer_size * sizeof(real));
653 if (syn_window_hidden == NULL) {
654 printf("Memory allocation failed\n");
655 exit(1);
656 }
657 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
658 next_random = next_random * (unsigned long long) 25214903917 + 11;
659 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
660 - 0.5) / (window_hidden_size * window_layer_size);
661 }
662 }
663
664 if (read_net_file[0] == 0) {
665 for (a = 0; a < vocab_size; a++)
666 for (b = 0; b < layer1_size; b++) {
667 next_random = next_random * (unsigned long long) 25214903917
668 + 11;
669 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
670 / (real) 65536) - 0.5) / layer1_size;
671 }
672 } else if(type == 3 && negative > 0) {
673 FILE *fnet = fopen(read_net_file, "rb");
674 if (fnet == NULL) {
675 printf("Net parameter file not found\n");
676 exit(1);
677 }
678 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
679 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
680 if(read != vocab_size * layer1_size) {
681 fprintf(stderr, "read-net failed %lld\n", read);
682 exit(-1);
683 }
684 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
685 if(read != (long long) vocab_size * window_layer_size) {
686 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read,
687 (long long) vocab_size * window_layer_size);
688 exit(-1);
689 }
690 fgetc(fnet);
691 if(!feof(fnet)) {
692 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
693 exit(-1);
694 }
695 fclose(fnet);
696 } else {
697 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
698 exit(-1);
699 }
700
701 CreateBinaryTree();
702}
703
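// Training worker: each thread processes its own slice of the training file and updates the
// shared weight matrices without locking, using the architecture selected by -type
// (cbow, skip-gram, cwindow, structured skip-gram, senna).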
704void *TrainModelThread(void *id) {
705 long long a, b, d, cw, word, last_word, sentence_length = 0,
706 sentence_position = 0;
707 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
708 long long l1, l2, c, target, label, local_iter = iter;
709 unsigned long long next_random = (long long) id;
710 real f, g;
711 clock_t now;
712 int input_len_1 = layer1_size;
713 int window_offset = -1;
714 if (type == 2 || type == 4) {
715 input_len_1 = window_layer_size;
716 }
717 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
718 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
719
720 int input_len_2 = 0;
721 if (type == 4) {
722 input_len_2 = window_hidden_size;
723 }
724 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
725 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
726
727 FILE *fi = fopen(train_file, "rb");
728 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
729 while (1) {
730 if (word_count - last_word_count > 10000) {
731 word_count_actual += word_count - last_word_count;
732 last_word_count = word_count;
733 if ((debug_mode > 1)) {
734 now = clock();
735 printf(
736 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
737 13, alpha,
738 word_count_actual / (real) (iter * train_words + 1)
739 * 100,
740 word_count_actual
741 / ((real) (now - start + 1)
742 / (real) CLOCKS_PER_SEC * 1000));
743 fflush(stdout);
744 }
745 alpha = starting_alpha
746 * (1 - word_count_actual / (real) (iter * train_words + 1));
747 if (alpha < starting_alpha * 0.0001)
748 alpha = starting_alpha * 0.0001;
749 }
750 if (sentence_length == 0) {
751 while (1) {
752 word = ReadWordIndex(fi);
753 if (feof(fi))
754 break;
755 if (word == -1)
756 continue;
757 word_count++;
758 if (word == 0)
759 break;
760 // The subsampling randomly discards frequent words while keeping the ranking the same
761 if (sample > 0) {
762 real ran = (sqrt(vocab[word].cn / (sample * train_words))
763 + 1) * (sample * train_words) / vocab[word].cn;
764 next_random = next_random * (unsigned long long) 25214903917
765 + 11;
766 if (ran < (next_random & 0xFFFF) / (real) 65536) {
767 if(type == 3) // in structured skipgrams
768 word = -2; // keep the window position correct
769 else
770 continue;
771 }
772 }
773 sen[sentence_length] = word;
774 sentence_length++;
775 if (sentence_length >= MAX_SENTENCE_LENGTH)
776 break;
777 }
778 sentence_position = 0;
779 }
780 if (feof(fi) || (word_count > train_words / num_threads)) {
781 word_count_actual += word_count - last_word_count;
782 local_iter--;
783 if (local_iter == 0)
784 break;
785 word_count = 0;
786 last_word_count = 0;
787 sentence_length = 0;
788 fseek(fi, file_size / (long long) num_threads * (long long) id,
789 SEEK_SET);
790 continue;
791 }
792 word = sen[sentence_position];
793 while (word == -2)
794 word = sen[++sentence_position];
795 if (word == -1)
796 continue;
797 for (c = 0; c < input_len_1; c++)
798 neu1[c] = 0;
799 for (c = 0; c < input_len_1; c++)
800 neu1e[c] = 0;
801 for (c = 0; c < input_len_2; c++)
802 neu2[c] = 0;
803 for (c = 0; c < input_len_2; c++)
804 neu2e[c] = 0;
805 next_random = next_random * (unsigned long long) 25214903917 + 11;
806 b = next_random % window;
807 if (type == 0) { //train the cbow architecture
808 // in -> hidden
809 cw = 0;
810 for (a = b; a < window * 2 + 1 - b; a++)
811 if (a != window) {
812 c = sentence_position - window + a;
813 if (c < 0)
814 continue;
815 if (c >= sentence_length)
816 continue;
817 last_word = sen[c];
818 if (last_word == -1)
819 continue;
820 for (c = 0; c < layer1_size; c++)
821 neu1[c] += syn0[c + last_word * layer1_size];
822 cw++;
823 }
824 if (cw) {
825 for (c = 0; c < layer1_size; c++)
826 neu1[c] /= cw;
827 if (hs)
828 for (d = 0; d < vocab[word].codelen; d++) {
829 f = 0;
830 l2 = vocab[word].point[d] * layer1_size;
831 // Propagate hidden -> output
832 for (c = 0; c < layer1_size; c++)
833 f += neu1[c] * syn1[c + l2];
834 if (f <= -MAX_EXP)
835 continue;
836 else if (f >= MAX_EXP)
837 continue;
838 else
839 f = expTable[(int) ((f + MAX_EXP)
840 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
841 // 'g' is the gradient multiplied by the learning rate
842 g = (1 - vocab[word].code[d] - f) * alpha;
843 // Propagate errors output -> hidden
844 for (c = 0; c < layer1_size; c++)
845 neu1e[c] += g * syn1[c + l2];
846 // Learn weights hidden -> output
847 for (c = 0; c < layer1_size; c++)
848 syn1[c + l2] += g * neu1[c];
849 if (cap == 1)
850 for (c = 0; c < layer1_size; c++)
851 capParam(syn1, c + l2);
852 }
853 // NEGATIVE SAMPLING
854 if (negative > 0)
855 for (d = 0; d < negative + 1; d++) {
856 if (d == 0) {
857 target = word;
858 label = 1;
859 } else {
860 next_random = next_random
861 * (unsigned long long) 25214903917 + 11;
862 if (word_to_group != NULL
863 && word_to_group[word] != -1) {
864 target = word;
865 while (target == word) {
866 target = group_to_table[word_to_group[word]
867 * table_size
868 + (next_random >> 16) % table_size];
869 next_random = next_random
870 * (unsigned long long) 25214903917
871 + 11;
872 }
873 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
874 } else {
875 target =
876 table[(next_random >> 16) % table_size];
877 }
878 if (target == 0)
879 target = next_random % (vocab_size - 1) + 1;
880 if (target == word)
881 continue;
882 label = 0;
883 }
884 l2 = target * layer1_size;
885 f = 0;
886 for (c = 0; c < layer1_size; c++)
887 f += neu1[c] * syn1neg[c + l2];
888 if (f > MAX_EXP)
889 g = (label - 1) * alpha;
890 else if (f < -MAX_EXP)
891 g = (label - 0) * alpha;
892 else
893 g = (label
894 - expTable[(int) ((f + MAX_EXP)
895 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
896 * alpha;
897 for (c = 0; c < layer1_size; c++)
898 neu1e[c] += g * syn1neg[c + l2];
899 for (c = 0; c < layer1_size; c++)
900 syn1neg[c + l2] += g * neu1[c];
901 if (cap == 1)
902 for (c = 0; c < layer1_size; c++)
903 capParam(syn1neg, c + l2);
904 }
905 // Noise Contrastive Estimation
906 if (nce > 0)
907 for (d = 0; d < nce + 1; d++) {
908 if (d == 0) {
909 target = word;
910 label = 1;
911 } else {
912 next_random = next_random
913 * (unsigned long long) 25214903917 + 11;
914 if (word_to_group != NULL
915 && word_to_group[word] != -1) {
916 target = word;
917 while (target == word) {
918 target = group_to_table[word_to_group[word]
919 * table_size
920 + (next_random >> 16) % table_size];
921 next_random = next_random
922 * (unsigned long long) 25214903917
923 + 11;
924 }
925 } else {
926 target =
927 table[(next_random >> 16) % table_size];
928 }
929 if (target == 0)
930 target = next_random % (vocab_size - 1) + 1;
931 if (target == word)
932 continue;
933 label = 0;
934 }
935 l2 = target * layer1_size;
936 f = 0;
937
938 for (c = 0; c < layer1_size; c++)
939 f += neu1[c] * syn1nce[c + l2];
940 if (f > MAX_EXP)
941 g = (label - 1) * alpha;
942 else if (f < -MAX_EXP)
943 g = (label - 0) * alpha;
944 else {
945 f = exp(f);
946 g =
947 (label
948 - f
949 / (noise_distribution[target]
950 * nce + f)) * alpha;
951 }
952 for (c = 0; c < layer1_size; c++)
953 neu1e[c] += g * syn1nce[c + l2];
954 for (c = 0; c < layer1_size; c++)
955 syn1nce[c + l2] += g * neu1[c];
956 if (cap == 1)
957 for (c = 0; c < layer1_size; c++)
958 capParam(syn1nce, c + l2);
959 }
960 // hidden -> in
961 for (a = b; a < window * 2 + 1 - b; a++)
962 if (a != window) {
963 c = sentence_position - window + a;
964 if (c < 0)
965 continue;
966 if (c >= sentence_length)
967 continue;
968 last_word = sen[c];
969 if (last_word == -1)
970 continue;
971 for (c = 0; c < layer1_size; c++)
972 syn0[c + last_word * layer1_size] += neu1e[c];
973 }
974 }
975 } else if (type == 1) { //train skip-gram
976 for (a = b; a < window * 2 + 1 - b; a++)
977 if (a != window) {
978 c = sentence_position - window + a;
979 if (c < 0)
980 continue;
981 if (c >= sentence_length)
982 continue;
983 last_word = sen[c];
984 if (last_word == -1)
985 continue;
986 l1 = last_word * layer1_size;
987 for (c = 0; c < layer1_size; c++)
988 neu1e[c] = 0;
989 // HIERARCHICAL SOFTMAX
990 if (hs)
991 for (d = 0; d < vocab[word].codelen; d++) {
992 f = 0;
993 l2 = vocab[word].point[d] * layer1_size;
994 // Propagate hidden -> output
995 for (c = 0; c < layer1_size; c++)
996 f += syn0[c + l1] * syn1[c + l2];
997 if (f <= -MAX_EXP)
998 continue;
999 else if (f >= MAX_EXP)
1000 continue;
1001 else
1002 f = expTable[(int) ((f + MAX_EXP)
1003 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1004 // 'g' is the gradient multiplied by the learning rate
1005 g = (1 - vocab[word].code[d] - f) * alpha;
1006 // Propagate errors output -> hidden
1007 for (c = 0; c < layer1_size; c++)
1008 neu1e[c] += g * syn1[c + l2];
1009 // Learn weights hidden -> output
1010 for (c = 0; c < layer1_size; c++)
1011 syn1[c + l2] += g * syn0[c + l1];
1012 if (cap == 1)
1013 for (c = 0; c < layer1_size; c++)
1014 capParam(syn1, c + l2);
1015 }
1016 // NEGATIVE SAMPLING
1017 if (negative > 0)
1018 for (d = 0; d < negative + 1; d++) {
1019 if (d == 0) {
1020 target = word;
1021 label = 1;
1022 } else {
1023 next_random = next_random
1024 * (unsigned long long) 25214903917 + 11;
1025 if (word_to_group != NULL
1026 && word_to_group[word] != -1) {
1027 target = word;
1028 while (target == word) {
1029 target =
1030 group_to_table[word_to_group[word]
1031 * table_size
1032 + (next_random >> 16)
1033 % table_size];
1034 next_random =
1035 next_random
1036 * (unsigned long long) 25214903917
1037 + 11;
1038 }
1039 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1040 } else {
1041 target = table[(next_random >> 16)
1042 % table_size];
1043 }
1044 if (target == 0)
1045 target = next_random % (vocab_size - 1) + 1;
1046 if (target == word)
1047 continue;
1048 label = 0;
1049 }
1050 l2 = target * layer1_size;
1051 f = 0;
1052 for (c = 0; c < layer1_size; c++)
1053 f += syn0[c + l1] * syn1neg[c + l2];
1054 if (f > MAX_EXP)
1055 g = (label - 1) * alpha;
1056 else if (f < -MAX_EXP)
1057 g = (label - 0) * alpha;
1058 else
1059 g =
1060 (label
1061 - expTable[(int) ((f + MAX_EXP)
1062 * (EXP_TABLE_SIZE
1063 / MAX_EXP / 2))])
1064 * alpha;
1065 for (c = 0; c < layer1_size; c++)
1066 neu1e[c] += g * syn1neg[c + l2];
1067 for (c = 0; c < layer1_size; c++)
1068 syn1neg[c + l2] += g * syn0[c + l1];
1069 if (cap == 1)
1070 for (c = 0; c < layer1_size; c++)
1071 capParam(syn1neg, c + l2);
1072 }
1073 //Noise Contrastive Estimation
1074 if (nce > 0)
1075 for (d = 0; d < nce + 1; d++) {
1076 if (d == 0) {
1077 target = word;
1078 label = 1;
1079 } else {
1080 next_random = next_random
1081 * (unsigned long long) 25214903917 + 11;
1082 if (word_to_group != NULL
1083 && word_to_group[word] != -1) {
1084 target = word;
1085 while (target == word) {
1086 target =
1087 group_to_table[word_to_group[word]
1088 * table_size
1089 + (next_random >> 16)
1090 % table_size];
1091 next_random =
1092 next_random
1093 * (unsigned long long) 25214903917
1094 + 11;
1095 }
1096 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1097 } else {
1098 target = table[(next_random >> 16)
1099 % table_size];
1100 }
1101 if (target == 0)
1102 target = next_random % (vocab_size - 1) + 1;
1103 if (target == word)
1104 continue;
1105 label = 0;
1106 }
1107 l2 = target * layer1_size;
1108 f = 0;
1109 for (c = 0; c < layer1_size; c++)
1110 f += syn0[c + l1] * syn1nce[c + l2];
1111 if (f > MAX_EXP)
1112 g = (label - 1) * alpha;
1113 else if (f < -MAX_EXP)
1114 g = (label - 0) * alpha;
1115 else {
1116 f = exp(f);
1117 g = (label
1118 - f
1119 / (noise_distribution[target]
1120 * nce + f)) * alpha;
1121 }
1122 for (c = 0; c < layer1_size; c++)
1123 neu1e[c] += g * syn1nce[c + l2];
1124 for (c = 0; c < layer1_size; c++)
1125 syn1nce[c + l2] += g * syn0[c + l1];
1126 if (cap == 1)
1127 for (c = 0; c < layer1_size; c++)
1128 capParam(syn1nce, c + l2);
1129 }
1130 // Learn weights input -> hidden
1131 for (c = 0; c < layer1_size; c++)
1132 syn0[c + l1] += neu1e[c];
1133 }
1134 } else if (type == 2) { //train the cwindow architecture
1135 // in -> hidden
1136 cw = 0;
1137 for (a = 0; a < window * 2 + 1; a++)
1138 if (a != window) {
1139 c = sentence_position - window + a;
1140 if (c < 0)
1141 continue;
1142 if (c >= sentence_length)
1143 continue;
1144 last_word = sen[c];
1145 if (last_word == -1)
1146 continue;
1147 window_offset = a * layer1_size;
1148 if (a > window)
1149 window_offset -= layer1_size;
1150 for (c = 0; c < layer1_size; c++)
1151 neu1[c + window_offset] += syn0[c
1152 + last_word * layer1_size];
1153 cw++;
1154 }
1155 if (cw) {
1156 if (hs)
1157 for (d = 0; d < vocab[word].codelen; d++) {
1158 f = 0;
1159 l2 = vocab[word].point[d] * window_layer_size;
1160 // Propagate hidden -> output
1161 for (c = 0; c < window_layer_size; c++)
1162 f += neu1[c] * syn1_window[c + l2];
1163 if (f <= -MAX_EXP)
1164 continue;
1165 else if (f >= MAX_EXP)
1166 continue;
1167 else
1168 f = expTable[(int) ((f + MAX_EXP)
1169 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1170 // 'g' is the gradient multiplied by the learning rate
1171 g = (1 - vocab[word].code[d] - f) * alpha;
1172 // Propagate errors output -> hidden
1173 for (c = 0; c < window_layer_size; c++)
1174 neu1e[c] += g * syn1_window[c + l2];
1175 // Learn weights hidden -> output
1176 for (c = 0; c < window_layer_size; c++)
1177 syn1_window[c + l2] += g * neu1[c];
1178 if (cap == 1)
1179 for (c = 0; c < window_layer_size; c++)
1180 capParam(syn1_window, c + l2);
1181 }
1182 // NEGATIVE SAMPLING
1183 if (negative > 0)
1184 for (d = 0; d < negative + 1; d++) {
1185 if (d == 0) {
1186 target = word;
1187 label = 1;
1188 } else {
1189 next_random = next_random
1190 * (unsigned long long) 25214903917 + 11;
1191 if (word_to_group != NULL
1192 && word_to_group[word] != -1) {
1193 target = word;
1194 while (target == word) {
1195 target = group_to_table[word_to_group[word]
1196 * table_size
1197 + (next_random >> 16) % table_size];
1198 next_random = next_random
1199 * (unsigned long long) 25214903917
1200 + 11;
1201 }
1202 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1203 } else {
1204 target =
1205 table[(next_random >> 16) % table_size];
1206 }
1207 if (target == 0)
1208 target = next_random % (vocab_size - 1) + 1;
1209 if (target == word)
1210 continue;
1211 label = 0;
1212 }
1213 l2 = target * window_layer_size;
1214 f = 0;
1215 for (c = 0; c < window_layer_size; c++)
1216 f += neu1[c] * syn1neg_window[c + l2];
1217 if (f > MAX_EXP)
1218 g = (label - 1) * alpha;
1219 else if (f < -MAX_EXP)
1220 g = (label - 0) * alpha;
1221 else
1222 g = (label
1223 - expTable[(int) ((f + MAX_EXP)
1224 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1225 * alpha;
1226 for (c = 0; c < window_layer_size; c++)
1227 neu1e[c] += g * syn1neg_window[c + l2];
1228 for (c = 0; c < window_layer_size; c++)
1229 syn1neg_window[c + l2] += g * neu1[c];
1230 if (cap == 1)
1231 for (c = 0; c < window_layer_size; c++)
1232 capParam(syn1neg_window, c + l2);
1233 }
1234 // Noise Contrastive Estimation
1235 if (nce > 0)
1236 for (d = 0; d < nce + 1; d++) {
1237 if (d == 0) {
1238 target = word;
1239 label = 1;
1240 } else {
1241 next_random = next_random
1242 * (unsigned long long) 25214903917 + 11;
1243 if (word_to_group != NULL
1244 && word_to_group[word] != -1) {
1245 target = word;
1246 while (target == word) {
1247 target = group_to_table[word_to_group[word]
1248 * table_size
1249 + (next_random >> 16) % table_size];
1250 next_random = next_random
1251 * (unsigned long long) 25214903917
1252 + 11;
1253 }
1254 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1255 } else {
1256 target =
1257 table[(next_random >> 16) % table_size];
1258 }
1259 if (target == 0)
1260 target = next_random % (vocab_size - 1) + 1;
1261 if (target == word)
1262 continue;
1263 label = 0;
1264 }
1265 l2 = target * window_layer_size;
1266 f = 0;
1267 for (c = 0; c < window_layer_size; c++)
1268 f += neu1[c] * syn1nce_window[c + l2];
1269 if (f > MAX_EXP)
1270 g = (label - 1) * alpha;
1271 else if (f < -MAX_EXP)
1272 g = (label - 0) * alpha;
1273 else {
1274 f = exp(f);
1275 g =
1276 (label
1277 - f
1278 / (noise_distribution[target]
1279 * nce + f)) * alpha;
1280 }
1281 for (c = 0; c < window_layer_size; c++)
1282 neu1e[c] += g * syn1nce_window[c + l2];
1283 for (c = 0; c < window_layer_size; c++)
1284 syn1nce_window[c + l2] += g * neu1[c];
1285 if (cap == 1)
1286 for (c = 0; c < window_layer_size; c++)
1287 capParam(syn1nce_window, c + l2);
1288 }
1289 // hidden -> in
1290 for (a = 0; a < window * 2 + 1; a++)
1291 if (a != window) {
1292 c = sentence_position - window + a;
1293 if (c < 0)
1294 continue;
1295 if (c >= sentence_length)
1296 continue;
1297 last_word = sen[c];
1298 if (last_word == -1)
1299 continue;
1300 window_offset = a * layer1_size;
1301 if (a > window)
1302 window_offset -= layer1_size;
1303 for (c = 0; c < layer1_size; c++)
1304 syn0[c + last_word * layer1_size] += neu1e[c
1305 + window_offset];
1306 }
1307 }
1308 } else if (type == 3) { //train structured skip-gram
1309 for (a = 0; a < window * 2 + 1; a++)
1310 if (a != window) {
1311 c = sentence_position - window + a;
1312 if (c < 0)
1313 continue;
1314 if(sen[c] == -2)
1315 continue;
1316 if (c >= sentence_length)
1317 continue;
1318 last_word = sen[c];
1319 if (last_word == -1)
1320 continue;
1321 l1 = last_word * layer1_size;
1322 window_offset = a * layer1_size;
1323 if (a > window)
1324 window_offset -= layer1_size;
1325 for (c = 0; c < layer1_size; c++)
1326 neu1e[c] = 0;
1327 // HIERARCHICAL SOFTMAX
1328 if (hs)
1329 for (d = 0; d < vocab[word].codelen; d++) {
1330 f = 0;
1331 l2 = vocab[word].point[d] * window_layer_size;
1332 // Propagate hidden -> output
1333 for (c = 0; c < layer1_size; c++)
1334 f += syn0[c + l1]
1335 * syn1_window[c + l2 + window_offset];
1336 if (f <= -MAX_EXP)
1337 continue;
1338 else if (f >= MAX_EXP)
1339 continue;
1340 else
1341 f = expTable[(int) ((f + MAX_EXP)
1342 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1343 // 'g' is the gradient multiplied by the learning rate
1344 g = (1 - vocab[word].code[d] - f) * alpha;
1345 // Propagate errors output -> hidden
1346 for (c = 0; c < layer1_size; c++)
1347 neu1e[c] += g
1348 * syn1_window[c + l2 + window_offset];
1349 // Learn weights hidden -> output
1350 for (c = 0; c < layer1_size; c++)
1351 syn1_window[c + l2 + window_offset] += g
1352 * syn0[c + l1];
1353 if (cap == 1)
1354 for (c = 0; c < layer1_size; c++)
1355 capParam(syn1_window, c + l2 + window_offset);
1356 }
1357 // NEGATIVE SAMPLING
1358 if (negative > 0)
1359 for (d = 0; d < negative + 1; d++) {
1360 if (d == 0) {
1361 target = word;
1362 label = 1;
1363 } else {
1364 next_random = next_random
1365 * (unsigned long long) 25214903917 + 11;
1366 if (word_to_group != NULL
1367 && word_to_group[word] != -1) {
1368 target = word;
1369 while (target == word) {
1370 target =
1371 group_to_table[word_to_group[word]
1372 * table_size
1373 + (next_random >> 16)
1374 % table_size];
1375 next_random =
1376 next_random
1377 * (unsigned long long) 25214903917
1378 + 11;
1379 }
1380 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1381 } else {
1382 target = table[(next_random >> 16)
1383 % table_size];
1384 }
1385 if (target == 0)
1386 target = next_random % (vocab_size - 1) + 1;
1387 if (target == word)
1388 continue;
1389 label = 0;
1390 }
1391 l2 = target * window_layer_size;
1392 f = 0;
1393 for (c = 0; c < layer1_size; c++)
1394 f +=
1395 syn0[c + l1]
1396 * syn1neg_window[c + l2
1397 + window_offset];
1398 if (f > MAX_EXP)
1399 g = (label - 1) * alpha;
1400 else if (f < -MAX_EXP)
1401 g = (label - 0) * alpha;
1402 else
1403 g =
1404 (label
1405 - expTable[(int) ((f + MAX_EXP)
1406 * (EXP_TABLE_SIZE
1407 / MAX_EXP / 2))])
1408 * alpha;
1409 if(debug_mode > 2 && ((long long) id) == 0) {
1410 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1411 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1412 }
1413 for (c = 0; c < layer1_size; c++)
1414 neu1e[c] +=
1415 g
1416 * syn1neg_window[c + l2
1417 + window_offset];
1418 for (c = 0; c < layer1_size; c++)
1419 syn1neg_window[c + l2 + window_offset] += g
1420 * syn0[c + l1];
1421 if (cap == 1)
1422 for (c = 0; c < layer1_size; c++)
1423 capParam(syn1neg_window,
1424 c + l2 + window_offset);
1425 }
1426 // Noise Contrastive Estimation
1427 if (nce > 0)
1428 for (d = 0; d < nce + 1; d++) {
1429 if (d == 0) {
1430 target = word;
1431 label = 1;
1432 } else {
1433 next_random = next_random
1434 * (unsigned long long) 25214903917 + 11;
1435 if (word_to_group != NULL
1436 && word_to_group[word] != -1) {
1437 target = word;
1438 while (target == word) {
1439 target =
1440 group_to_table[word_to_group[word]
1441 * table_size
1442 + (next_random >> 16)
1443 % table_size];
1444 next_random =
1445 next_random
1446 * (unsigned long long) 25214903917
1447 + 11;
1448 }
1449 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1450 } else {
1451 target = table[(next_random >> 16)
1452 % table_size];
1453 }
1454 if (target == 0)
1455 target = next_random % (vocab_size - 1) + 1;
1456 if (target == word)
1457 continue;
1458 label = 0;
1459 }
1460 l2 = target * window_layer_size;
1461 f = 0;
1462 for (c = 0; c < layer1_size; c++)
1463 f +=
1464 syn0[c + l1]
1465 * syn1nce_window[c + l2
1466 + window_offset];
1467 if (f > MAX_EXP)
1468 g = (label - 1) * alpha;
1469 else if (f < -MAX_EXP)
1470 g = (label - 0) * alpha;
1471 else {
1472 f = exp(f);
1473 g = (label
1474 - f
1475 / (noise_distribution[target]
1476 * nce + f)) * alpha;
1477 }
1478 for (c = 0; c < layer1_size; c++)
1479 neu1e[c] +=
1480 g
1481 * syn1nce_window[c + l2
1482 + window_offset];
1483 for (c = 0; c < layer1_size; c++)
1484 syn1nce_window[c + l2 + window_offset] += g
1485 * syn0[c + l1];
1486 if (cap == 1)
1487 for (c = 0; c < layer1_size; c++)
1488 capParam(syn1nce_window,
1489 c + l2 + window_offset);
1490 }
1491 // Learn weights input -> hidden
1492 for (c = 0; c < layer1_size; c++) {
1493 syn0[c + l1] += neu1e[c];
1494 if (syn0[c + l1] > 50)
1495 syn0[c + l1] = 50;
1496 if (syn0[c + l1] < -50)
1497 syn0[c + l1] = -50;
1498 }
1499 }
1500 } else if (type == 4) { // train the senna-type architecture
1501 // in -> hidden
1502 cw = 0;
1503 for (a = 0; a < window * 2 + 1; a++)
1504 if (a != window) {
1505 c = sentence_position - window + a;
1506 if (c < 0)
1507 continue;
1508 if (c >= sentence_length)
1509 continue;
1510 last_word = sen[c];
1511 if (last_word == -1)
1512 continue;
1513 window_offset = a * layer1_size;
1514 if (a > window)
1515 window_offset -= layer1_size;
1516 for (c = 0; c < layer1_size; c++)
1517 neu1[c + window_offset] += syn0[c
1518 + last_word * layer1_size];
1519 cw++;
1520 }
1521 if (cw) {
1522 for (a = 0; a < window_hidden_size; a++) {
1523 c = a * window_layer_size;
1524 for (b = 0; b < window_layer_size; b++) {
1525 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1526 }
1527 }
1528 if (hs)
1529 for (d = 0; d < vocab[word].codelen; d++) {
1530 f = 0;
1531 l2 = vocab[word].point[d] * window_hidden_size;
1532 // Propagate hidden -> output
1533 for (c = 0; c < window_hidden_size; c++)
1534 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1535 if (f <= -MAX_EXP)
1536 continue;
1537 else if (f >= MAX_EXP)
1538 continue;
1539 else
1540 f = expTable[(int) ((f + MAX_EXP)
1541 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1542 // 'g' is the gradient multiplied by the learning rate
1543 g = (1 - vocab[word].code[d] - f) * alpha;
1544 // Propagate errors output -> hidden
1545 for (c = 0; c < window_hidden_size; c++)
1546 neu2e[c] += dHardTanh(neu2[c], g) * g
1547 * syn_hidden_word[c + l2];
1548 // Learn weights hidden -> output
1549 for (c = 0; c < window_hidden_size; c++)
1550 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1551 * neu2[c];
1552 }
1553 // NEGATIVE SAMPLING
1554 if (negative > 0)
1555 for (d = 0; d < negative + 1; d++) {
1556 if (d == 0) {
1557 target = word;
1558 label = 1;
1559 } else {
1560 next_random = next_random
1561 * (unsigned long long) 25214903917 + 11;
1562 if (word_to_group != NULL
1563 && word_to_group[word] != -1) {
1564 target = word;
1565 while (target == word) {
1566 target = group_to_table[word_to_group[word]
1567 * table_size
1568 + (next_random >> 16) % table_size];
1569 next_random = next_random
1570 * (unsigned long long) 25214903917
1571 + 11;
1572 }
1573 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1574 } else {
1575 target =
1576 table[(next_random >> 16) % table_size];
1577 }
1578 if (target == 0)
1579 target = next_random % (vocab_size - 1) + 1;
1580 if (target == word)
1581 continue;
1582 label = 0;
1583 }
1584 l2 = target * window_hidden_size;
1585 f = 0;
1586 for (c = 0; c < window_hidden_size; c++)
1587 f += hardTanh(neu2[c])
1588 * syn_hidden_word_neg[c + l2];
1589 if (f > MAX_EXP)
1590 g = (label - 1) * alpha / negative;
1591 else if (f < -MAX_EXP)
1592 g = (label - 0) * alpha / negative;
1593 else
1594 g = (label
1595 - expTable[(int) ((f + MAX_EXP)
1596 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1597 * alpha / negative;
1598 for (c = 0; c < window_hidden_size; c++)
1599 neu2e[c] += dHardTanh(neu2[c], g) * g
1600 * syn_hidden_word_neg[c + l2];
1601 for (c = 0; c < window_hidden_size; c++)
1602 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1603 * g * neu2[c];
1604 }
1605 for (a = 0; a < window_hidden_size; a++)
1606 for (b = 0; b < window_layer_size; b++)
1607 neu1e[b] += neu2e[a]
1608 * syn_window_hidden[a * window_layer_size + b];
1609 for (a = 0; a < window_hidden_size; a++)
1610 for (b = 0; b < window_layer_size; b++)
1611 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1612 * neu1[b];
1613 // hidden -> in
1614 for (a = 0; a < window * 2 + 1; a++)
1615 if (a != window) {
1616 c = sentence_position - window + a;
1617 if (c < 0)
1618 continue;
1619 if (c >= sentence_length)
1620 continue;
1621 last_word = sen[c];
1622 if (last_word == -1)
1623 continue;
1624 window_offset = a * layer1_size;
1625 if (a > window)
1626 window_offset -= layer1_size;
1627 for (c = 0; c < layer1_size; c++)
1628 syn0[c + last_word * layer1_size] += neu1e[c
1629 + window_offset];
1630 }
1631 }
1632 } else {
1633 printf("unknown type %i", type);
1634 exit(0);
1635 }
1636 sentence_position++;
1637 if (sentence_position >= sentence_length) {
1638 sentence_length = 0;
1639 continue;
1640 }
1641 }
1642 fclose(fi);
1643 free(neu1);
1644 free(neu1e);
1645 pthread_exit(NULL);
1646}
1647
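// For every word starting at rank cc (-show-cc), scores all vocabulary words at each window
// position from syn0 and syn1neg_window, prints the strongest collocate per position, the best
// summed collocate, and the top-N (word, score, position) triples.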
1648void ShowCollocations() {
1649 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
1650 real f, max_f, maxmax_f;
1651 real *target_sums, bestf[MAX_CC], worstbest;
1652 long besti[MAX_CC];
1653 int N = 10, bestp[MAX_CC];
1654 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1655
1656 for (d = cc; d < vocab_size; d++) {
1657 for (b = 0; b < vocab_size; b++)
1658 target_sums[b]=0;
1659 for (b = 0; b < N; b++)
1660 bestf[b]=-1;
1661 worstbest = -1;
1662
1663 maxmax_f = -1;
1664 maxmax_target = 0;
1665 for (a = window * 2 + 1; a >= 0; a--) {
1666 if (a != window) {
1667 max_f = -1;
1668 window_offset = a * layer1_size;
1669 if (a > window)
1670 window_offset -= layer1_size;
1671 for(target = 0; target < vocab_size; target ++) {
1672 if(target == d)
1673 continue;
1674 f = 0;
1675 for (c = 0; c < layer1_size; c++)
1676 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1677 if (f < -MAX_EXP)
1678 continue;
1679 else if (f > MAX_EXP)
1680 continue;
1681 else
1682 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1683 if(f > max_f) {
1684 max_f = f;
1685 max_target = target;
1686 }
1687 target_sums[target] += (1-target_sums[target]) * f;
1688 if(f > worstbest) {
1689 for (b = 0; b < N; b++) {
1690 if (f > bestf[b]) {
1691 for (e = N - 1; e > b; e--) {
1692 bestf[e] = bestf[e - 1];
1693 besti[e] = besti[e - 1];
1694 bestp[e] = bestp[e - 1];
1695 }
1696 bestf[b] = f;
1697 besti[b] = target;
1698 bestp[b] = window-a;
1699 break;
1700 }
1701 }
1702 worstbest = bestf[N-1];
1703 }
1704 }
1705 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1706 if(max_f > maxmax_f) {
1707 maxmax_f = max_f;
1708 maxmax_target = max_target;
1709 }
1710 } else {
1711 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1712 }
1713 }
1714 max_f = -1;
1715 for (b = 0; b < vocab_size; b++) {
1716 if(target_sums[b] > max_f) {
1717 max_f = target_sums[b];
1718 max_target = b;
1719 }
1720 }
1721 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1722 vocab[max_target].word, max_f,
1723 vocab[maxmax_target].word, maxmax_f);
1724 for(b=0; b<N && bestf[b]>-1; b++)
1725 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1726 printf("\n");
1727 }
1728}
1729
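// Training driver: builds or loads the vocabulary, initializes the net, optionally prints
// collocations, runs the training threads, and writes either word vectors or K-means word
// classes to the output file.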
1730void TrainModel() {
1731 long a, b, c, d;
1732 FILE *fo;
1733 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1734 printf("Starting training using file %s\n", train_file);
1735 starting_alpha = alpha;
1736 if (read_vocab_file[0] != 0)
1737 ReadVocab();
1738 else
1739 LearnVocabFromTrainFile();
1740 if (save_vocab_file[0] != 0)
1741 SaveVocab();
1742 if (output_file[0] == 0)
1743 return;
1744 InitNet();
1745 if(cc > 0)
1746 ShowCollocations();
1747 if (negative > 0 || nce > 0)
1748 InitUnigramTable();
1749 if (negative_classes_file[0] != 0)
1750 InitClassUnigramTable();
1751 start = clock();
1752 for (a = 0; a < num_threads; a++)
1753 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1754 for (a = 0; a < num_threads; a++)
1755 pthread_join(pt[a], NULL);
1756 fo = fopen(output_file, "wb");
1757 if (classes == 0) {
1758 // Save the word vectors
1759 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1760 for (a = 0; a < vocab_size; a++) {
1761 fprintf(fo, "%s ", vocab[a].word);
1762 if (binary)
1763 for (b = 0; b < layer1_size; b++)
1764 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1765 else
1766 for (b = 0; b < layer1_size; b++)
1767 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1768 fprintf(fo, "\n");
1769 }
1770 } else {
1771 // Run K-means on the word vectors
1772 int clcn = classes, iter = 10, closeid;
1773 int *centcn = (int *) malloc(classes * sizeof(int));
1774 int *cl = (int *) calloc(vocab_size, sizeof(int));
1775 real closev, x;
1776 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1777 for (a = 0; a < vocab_size; a++)
1778 cl[a] = a % clcn;
1779 for (a = 0; a < iter; a++) {
1780 for (b = 0; b < clcn * layer1_size; b++)
1781 cent[b] = 0;
1782 for (b = 0; b < clcn; b++)
1783 centcn[b] = 1;
1784 for (c = 0; c < vocab_size; c++) {
1785 for (d = 0; d < layer1_size; d++)
1786 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1787 centcn[cl[c]]++;
1788 }
1789 for (b = 0; b < clcn; b++) {
1790 closev = 0;
1791 for (c = 0; c < layer1_size; c++) {
1792 cent[layer1_size * b + c] /= centcn[b];
1793 closev += cent[layer1_size * b + c]
1794 * cent[layer1_size * b + c];
1795 }
1796 closev = sqrt(closev);
1797 for (c = 0; c < layer1_size; c++)
1798 cent[layer1_size * b + c] /= closev;
1799 }
1800 for (c = 0; c < vocab_size; c++) {
1801 closev = -10;
1802 closeid = 0;
1803 for (d = 0; d < clcn; d++) {
1804 x = 0;
1805 for (b = 0; b < layer1_size; b++)
1806 x += cent[layer1_size * d + b]
1807 * syn0[c * layer1_size + b];
1808 if (x > closev) {
1809 closev = x;
1810 closeid = d;
1811 }
1812 }
1813 cl[c] = closeid;
1814 }
1815 }
1816 // Save the K-means classes
1817 for (a = 0; a < vocab_size; a++)
1818 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1819 free(centcn);
1820 free(cent);
1821 free(cl);
1822 }
1823 fclose(fo);
1824 if (save_net_file[0] != 0)
1825 SaveNet();
1826}
1827
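// Returns the position of command-line option str in argv, or -1 if it is absent.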
1828int ArgPos(char *str, int argc, char **argv) {
1829 int a;
1830 for (a = 1; a < argc; a++)
1831 if (!strcmp(str, argv[a])) {
1832 if (a == argc - 1) {
1833 printf("Argument missing for %s\n", str);
1834 exit(1);
1835 }
1836 return a;
1837 }
1838 return -1;
1839}
1840
1841int main(int argc, char **argv) {
1842 int i;
1843 if (argc == 1) {
1844 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1845 printf("Options:\n");
1846 printf("Parameters for training:\n");
1847 printf("\t-train <file>\n");
1848 printf("\t\tUse text data from <file> to train the model\n");
1849 printf("\t-output <file>\n");
1850 printf(
1851 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1852 printf("\t-size <int>\n");
1853 printf("\t\tSet size of word vectors; default is 100\n");
1854 printf("\t-window <int>\n");
1855 printf("\t\tSet max skip length between words; default is 5\n");
1856 printf("\t-sample <float>\n");
1857 printf(
1858 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1859 printf(
1860 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1861 printf("\t-hs <int>\n");
1862 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1863 printf("\t-negative <int>\n");
1864 printf(
1865 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1866 printf("\t-negative-classes <file>\n");
1867 printf("\t\tNegative classes to sample from\n");
1868 printf("\t-nce <int>\n");
1869 printf(
1870 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1871 printf("\t-threads <int>\n");
1872 printf("\t\tUse <int> threads (default 12)\n");
1873 printf("\t-iter <int>\n");
1874 printf("\t\tRun more training iterations (default 5)\n");
1875 printf("\t-min-count <int>\n");
1876 printf(
1877 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1878 printf("\t-alpha <float>\n");
1879 printf(
1880 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1881 printf("\t-classes <int>\n");
1882 printf(
1883 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1884 printf("\t-debug <int>\n");
1885 printf(
1886 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1887 printf("\t-binary <int>\n");
1888 printf(
1889 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1890 printf("\t-save-vocab <file>\n");
1891 printf("\t\tThe vocabulary will be saved to <file>\n");
1892 printf("\t-read-vocab <file>\n");
1893 printf(
1894 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1895 printf("\t-read-net <file>\n");
1896 printf(
1897 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1898 printf("\t-save-net <file>\n");
1899 printf("\t\tThe net parameters will be saved to <file>\n");
1900 printf("\t-show-cc <int>\n");
1901 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
1902 printf("\t-type <int>\n");
1903 printf(
1904 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1905 printf("\t-cap <int>\n");
1906 printf(
1907 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1908 printf("\nExamples:\n");
1909 printf(
1910 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1911 return 0;
1912 }
1913 output_file[0] = 0;
1914 save_vocab_file[0] = 0;
1915 read_vocab_file[0] = 0;
1916 save_net_file[0] = 0;
1917 read_net_file[0] = 0;
1918 negative_classes_file[0] = 0;
1919 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1920 layer1_size = atoi(argv[i + 1]);
1921 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1922 strcpy(train_file, argv[i + 1]);
1923 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1924 strcpy(save_vocab_file, argv[i + 1]);
1925 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1926 strcpy(read_vocab_file, argv[i + 1]);
1927 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1928 strcpy(save_net_file, argv[i + 1]);
1929 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1930 strcpy(read_net_file, argv[i + 1]);
1931 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1932 debug_mode = atoi(argv[i + 1]);
1933 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1934 binary = atoi(argv[i + 1]);
1935 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
1936 cc = atoi(argv[i + 1]);
1937 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1938 type = atoi(argv[i + 1]);
1939 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1940 strcpy(output_file, argv[i + 1]);
1941 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1942 window = atoi(argv[i + 1]);
1943 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1944 sample = atof(argv[i + 1]);
1945 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1946 hs = atoi(argv[i + 1]);
1947 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1948 negative = atoi(argv[i + 1]);
1949 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1950 strcpy(negative_classes_file, argv[i + 1]);
1951 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1952 nce = atoi(argv[i + 1]);
1953 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1954 num_threads = atoi(argv[i + 1]);
1955 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1956 iter = atoi(argv[i + 1]);
1957 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1958 min_count = atoi(argv[i + 1]);
1959 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1960 classes = atoi(argv[i + 1]);
1961 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1962 cap = atoi(argv[i + 1]);
1963 if (type == 0 || type == 2 || type == 4)
1964 alpha = 0.05;
1965 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1966 alpha = atof(argv[i + 1]);
1967 vocab = (struct vocab_word *) calloc(vocab_max_size,
1968 sizeof(struct vocab_word));
1969 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1970 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1971 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1972 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1973 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1974 }
1975 SaveArgs(argc, argv);
1976 TrainModel();
1977 return 0;
1978}
1979