Blame - word2vecExt.c - ids-kl/dereko2vec

blob: 77a2dda6f45bb9c78090bd5edab0d923ae5bc6f8 [file] [log] [blame]

Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1	// Copyright 2013 Google Inc. All Rights Reserved.
				2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
				15	#include <stdio.h>
				16	#include <stdlib.h>
				17	#include <string.h>
				18	#include <math.h>
				19	#include <pthread.h>
				20
				21	#define MAX_STRING 100
				22	#define EXP_TABLE_SIZE 1000
				23	#define MAX_EXP 6
				24	#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	25	#define MAX_CC 100
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	26	#define MAX_CODE_LENGTH 40
				27
				28	const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
				29
				30	typedef float real; // Precision of float numbers
				31
				32	struct vocab_word {
				33	long long cn;
				34	int *point;
				35	char word, code, codelen;
				36	};
				37
				38	char train_file[MAX_STRING], output_file[MAX_STRING];
				39	char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
				40	char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
				41	struct vocab_word *vocab;
				42	int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
				43	num_threads = 12, min_reduce = 1;
				44	int *vocab_hash;
				45	long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
				46	long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
				47	classes = 0;
				48	real alpha = 0.025, starting_alpha, sample = 1e-3;
				49	real syn0, syn1, syn1neg, syn1nce, *expTable;
				50	clock_t start;
				51
				52	real syn1_window, syn1neg_window, *syn1nce_window;
				53	int w_offset, window_layer_size;
				54
				55	int window_hidden_size = 500;
				56	real syn_window_hidden, syn_hidden_word, *syn_hidden_word_neg,
				57	*syn_hidden_word_nce;
				58
				59	int hs = 0, negative = 5;
				60	const int table_size = 1e8;
				61	int *table;
				62
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	63	long cc = 0;
				64
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	65	//constrastive negative sampling
				66	char negative_classes_file[MAX_STRING];
				67	int *word_to_group;
				68	int group_to_table; //group_sizetable_size
				69	int class_number;
				70
				71	//nce
				72	real* noise_distribution;
				73	int nce = 0;
				74
				75	//param caps
				76	real CAP_VALUE = 50;
				77	int cap = 0;
				78
				79	void capParam(real* array, int index) {
				80	if (array[index] > CAP_VALUE)
				81	array[index] = CAP_VALUE;
				82	else if (array[index] < -CAP_VALUE)
				83	array[index] = -CAP_VALUE;
				84	}
				85
				86	real hardTanh(real x) {
				87	if (x >= 1) {
				88	return 1;
				89	} else if (x <= -1) {
				90	return -1;
				91	} else {
				92	return x;
				93	}
				94	}
				95
				96	real dHardTanh(real x, real g) {
				97	if (x > 1 && g > 0) {
				98	return 0;
				99	}
				100	if (x < -1 && g < 0) {
				101	return 0;
				102	}
				103	return 1;
				104	}
				105
				106	void InitUnigramTable() {
				107	int a, i;
				108	long long train_words_pow = 0;
				109	real d1, power = 0.75;
				110	table = (int ) malloc(table_size sizeof(int));
				111	for (a = 0; a < vocab_size; a++)
				112	train_words_pow += pow(vocab[a].cn, power);
				113	i = 0;
				114	d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
				115	for (a = 0; a < table_size; a++) {
				116	table[a] = i;
				117	if (a / (real) table_size > d1) {
				118	i++;
				119	d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
				120	}
				121	if (i >= vocab_size)
				122	i = vocab_size - 1;
				123	}
				124
				125	noise_distribution = (real *) calloc(vocab_size, sizeof(real));
				126	for (a = 0; a < vocab_size; a++)
				127	noise_distribution[a] = pow(vocab[a].cn, power)
				128	/ (real) train_words_pow;
				129	}
				130
				131	// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
				132	void ReadWord(char word, FILE fin) {
				133	int a = 0, ch;
				134	while (!feof(fin)) {
				135	ch = fgetc(fin);
				136	if (ch == 13)
				137	continue;
				138	if ((ch == ' ') \|\| (ch == '\t') \|\| (ch == '\n')) {
				139	if (a > 0) {
				140	if (ch == '\n')
				141	ungetc(ch, fin);
				142	break;
				143	}
				144	if (ch == '\n') {
				145	strcpy(word, (char *) "</s>");
				146	return;
				147	} else
				148	continue;
				149	}
				150	word[a] = ch;
				151	a++;
				152	if (a >= MAX_STRING - 1)
				153	a--; // Truncate too long words
				154	}
				155	word[a] = 0;
				156	}
				157
				158	// Returns hash value of a word
				159	int GetWordHash(char *word) {
				160	unsigned long long a, hash = 0;
				161	for (a = 0; a < strlen(word); a++)
				162	hash = hash * 257 + word[a];
				163	hash = hash % vocab_hash_size;
				164	return hash;
				165	}
				166
				167	// Returns position of a word in the vocabulary; if the word is not found, returns -1
				168	int SearchVocab(char *word) {
				169	unsigned int hash = GetWordHash(word);
				170	while (1) {
				171	if (vocab_hash[hash] == -1)
				172	return -1;
				173	if (!strcmp(word, vocab[vocab_hash[hash]].word))
				174	return vocab_hash[hash];
				175	hash = (hash + 1) % vocab_hash_size;
				176	}
				177	return -1;
				178	}
				179
				180	// Reads a word and returns its index in the vocabulary
				181	int ReadWordIndex(FILE *fin) {
				182	char word[MAX_STRING];
				183	ReadWord(word, fin);
				184	if (feof(fin))
				185	return -1;
				186	return SearchVocab(word);
				187	}
				188
				189	// Adds a word to the vocabulary
				190	int AddWordToVocab(char *word) {
				191	unsigned int hash, length = strlen(word) + 1;
				192	if (length > MAX_STRING)
				193	length = MAX_STRING;
				194	vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
				195	strcpy(vocab[vocab_size].word, word);
				196	vocab[vocab_size].cn = 0;
				197	vocab_size++;
				198	// Reallocate memory if needed
				199	if (vocab_size + 2 >= vocab_max_size) {
				200	vocab_max_size += 1000;
				201	vocab = (struct vocab_word *) realloc(vocab,
				202	vocab_max_size * sizeof(struct vocab_word));
				203	}
				204	hash = GetWordHash(word);
				205	while (vocab_hash[hash] != -1)
				206	hash = (hash + 1) % vocab_hash_size;
				207	vocab_hash[hash] = vocab_size - 1;
				208	return vocab_size - 1;
				209	}
				210
				211	// Used later for sorting by word counts
				212	int VocabCompare(const void a, const void b) {
				213	return ((struct vocab_word ) b)->cn - ((struct vocab_word ) a)->cn;
				214	}
				215
				216	// Sorts the vocabulary by frequency using word counts
				217	void SortVocab() {
				218	int a, size;
				219	unsigned int hash;
				220	// Sort the vocabulary and keep </s> at the first position
				221	qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
				222	for (a = 0; a < vocab_hash_size; a++)
				223	vocab_hash[a] = -1;
				224	size = vocab_size;
				225	train_words = 0;
				226	for (a = 0; a < size; a++) {
				227	// Words occuring less than min_count times will be discarded from the vocab
				228	if ((vocab[a].cn < min_count) && (a != 0)) {
				229	vocab_size--;
				230	free(vocab[a].word);
				231	} else {
				232	// Hash will be re-computed, as after the sorting it is not actual
				233	hash = GetWordHash(vocab[a].word);
				234	while (vocab_hash[hash] != -1)
				235	hash = (hash + 1) % vocab_hash_size;
				236	vocab_hash[hash] = a;
				237	train_words += vocab[a].cn;
				238	}
				239	}
				240	vocab = (struct vocab_word *) realloc(vocab,
				241	(vocab_size + 1) * sizeof(struct vocab_word));
				242	// Allocate memory for the binary tree construction
				243	for (a = 0; a < vocab_size; a++) {
				244	vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
				245	vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
				246	}
				247	}
				248
				249	// Reduces the vocabulary by removing infrequent tokens
				250	void ReduceVocab() {
				251	int a, b = 0;
				252	unsigned int hash;
				253	for (a = 0; a < vocab_size; a++)
				254	if (vocab[a].cn > min_reduce) {
				255	vocab[b].cn = vocab[a].cn;
				256	vocab[b].word = vocab[a].word;
				257	b++;
				258	} else
				259	free(vocab[a].word);
				260	vocab_size = b;
				261	for (a = 0; a < vocab_hash_size; a++)
				262	vocab_hash[a] = -1;
				263	for (a = 0; a < vocab_size; a++) {
				264	// Hash will be re-computed, as it is not actual
				265	hash = GetWordHash(vocab[a].word);
				266	while (vocab_hash[hash] != -1)
				267	hash = (hash + 1) % vocab_hash_size;
				268	vocab_hash[hash] = a;
				269	}
				270	fflush(stdout);
				271	min_reduce++;
				272	}
				273
				274	// Create binary Huffman tree using the word counts
				275	// Frequent words will have short uniqe binary codes
				276	void CreateBinaryTree() {
				277	long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
				278	char code[MAX_CODE_LENGTH];
				279	long long count = (long long ) calloc(vocab_size * 2 + 1,
				280	sizeof(long long));
				281	long long binary = (long long ) calloc(vocab_size * 2 + 1,
				282	sizeof(long long));
				283	long long parent_node = (long long ) calloc(vocab_size * 2 + 1,
				284	sizeof(long long));
				285	for (a = 0; a < vocab_size; a++)
				286	count[a] = vocab[a].cn;
				287	for (a = vocab_size; a < vocab_size * 2; a++)
				288	count[a] = 1e15;
				289	pos1 = vocab_size - 1;
				290	pos2 = vocab_size;
				291	// Following algorithm constructs the Huffman tree by adding one node at a time
				292	for (a = 0; a < vocab_size - 1; a++) {
				293	// First, find two smallest nodes 'min1, min2'
				294	if (pos1 >= 0) {
				295	if (count[pos1] < count[pos2]) {
				296	min1i = pos1;
				297	pos1--;
				298	} else {
				299	min1i = pos2;
				300	pos2++;
				301	}
				302	} else {
				303	min1i = pos2;
				304	pos2++;
				305	}
				306	if (pos1 >= 0) {
				307	if (count[pos1] < count[pos2]) {
				308	min2i = pos1;
				309	pos1--;
				310	} else {
				311	min2i = pos2;
				312	pos2++;
				313	}
				314	} else {
				315	min2i = pos2;
				316	pos2++;
				317	}
				318	count[vocab_size + a] = count[min1i] + count[min2i];
				319	parent_node[min1i] = vocab_size + a;
				320	parent_node[min2i] = vocab_size + a;
				321	binary[min2i] = 1;
				322	}
				323	// Now assign binary code to each vocabulary word
				324	for (a = 0; a < vocab_size; a++) {
				325	b = a;
				326	i = 0;
				327	while (1) {
				328	code[i] = binary[b];
				329	point[i] = b;
				330	i++;
				331	b = parent_node[b];
				332	if (b == vocab_size * 2 - 2)
				333	break;
				334	}
				335	vocab[a].codelen = i;
				336	vocab[a].point[0] = vocab_size - 2;
				337	for (b = 0; b < i; b++) {
				338	vocab[a].code[i - b - 1] = code[b];
				339	vocab[a].point[i - b] = point[b] - vocab_size;
				340	}
				341	}
				342	free(count);
				343	free(binary);
				344	free(parent_node);
				345	}
				346
				347	void LearnVocabFromTrainFile() {
				348	char word[MAX_STRING];
				349	FILE *fin;
				350	long long a, i;
				351	for (a = 0; a < vocab_hash_size; a++)
				352	vocab_hash[a] = -1;
				353	fin = fopen(train_file, "rb");
				354	if (fin == NULL) {
				355	printf("ERROR: training data file not found!\n");
				356	exit(1);
				357	}
				358	vocab_size = 0;
				359	AddWordToVocab((char *) "</s>");
				360	while (1) {
				361	ReadWord(word, fin);
				362	if (feof(fin))
				363	break;
				364	train_words++;
				365	if ((debug_mode > 1) && (train_words % 100000 == 0)) {
				366	printf("%lldK%c", train_words / 1000, 13);
				367	fflush(stdout);
				368	}
				369	i = SearchVocab(word);
				370	if (i == -1) {
				371	a = AddWordToVocab(word);
				372	vocab[a].cn = 1;
				373	} else
				374	vocab[i].cn++;
				375	if (vocab_size > vocab_hash_size * 0.7)
				376	ReduceVocab();
				377	}
				378	SortVocab();
				379	if (debug_mode > 0) {
				380	printf("Vocab size: %lld\n", vocab_size);
				381	printf("Words in train file: %lld\n", train_words);
				382	}
				383	file_size = ftell(fin);
				384	fclose(fin);
				385	}
				386
				387	void SaveVocab() {
				388	long long i;
				389	FILE *fo = fopen(save_vocab_file, "wb");
				390	for (i = 0; i < vocab_size; i++)
				391	fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
				392	fclose(fo);
				393	}
				394
				395	void ReadVocab() {
				396	long long a, i = 0;
				397	char c;
				398	char word[MAX_STRING];
				399	FILE *fin = fopen(read_vocab_file, "rb");
				400	if (fin == NULL) {
				401	printf("Vocabulary file not found\n");
				402	exit(1);
				403	}
				404	for (a = 0; a < vocab_hash_size; a++)
				405	vocab_hash[a] = -1;
				406	vocab_size = 0;
				407	while (1) {
				408	ReadWord(word, fin);
				409	if (feof(fin))
				410	break;
				411	a = AddWordToVocab(word);
				412	fscanf(fin, "%lld%c", &vocab[a].cn, &c);
				413	i++;
				414	}
				415	SortVocab();
				416	if (debug_mode > 0) {
				417	printf("Vocab size: %lld\n", vocab_size);
				418	printf("Words in train file: %lld\n", train_words);
				419	}
				420	fin = fopen(train_file, "rb");
				421	if (fin == NULL) {
				422	printf("ERROR: training data file not found!\n");
				423	exit(1);
				424	}
				425	fseek(fin, 0, SEEK_END);
				426	file_size = ftell(fin);
				427	fclose(fin);
				428	}
				429
				430	void InitClassUnigramTable() {
				431	long long a, c;
				432	printf("loading class unigrams \n");
				433	FILE *fin = fopen(negative_classes_file, "rb");
				434	if (fin == NULL) {
				435	printf("ERROR: class file not found!\n");
				436	exit(1);
				437	}
				438	word_to_group = (int ) malloc(vocab_size sizeof(int));
				439	for (a = 0; a < vocab_size; a++)
				440	word_to_group[a] = -1;
				441	char class[MAX_STRING];
				442	char prev_class[MAX_STRING];
				443	prev_class[0] = 0;
				444	char word[MAX_STRING];
				445	class_number = -1;
				446	while (1) {
				447	if (feof(fin))
				448	break;
				449	ReadWord(class, fin);
				450	ReadWord(word, fin);
				451	int word_index = SearchVocab(word);
				452	if (word_index != -1) {
				453	if (strcmp(class, prev_class) != 0) {
				454	class_number++;
				455	strcpy(prev_class, class);
				456	}
				457	word_to_group[word_index] = class_number;
				458	}
				459	ReadWord(word, fin);
				460	}
				461	class_number++;
				462	fclose(fin);
				463
				464	group_to_table = (int ) malloc(table_size class_number * sizeof(int));
				465	long long train_words_pow = 0;
				466	real d1, power = 0.75;
				467
				468	for (c = 0; c < class_number; c++) {
				469	long long offset = c * table_size;
				470	train_words_pow = 0;
				471	for (a = 0; a < vocab_size; a++)
				472	if (word_to_group[a] == c)
				473	train_words_pow += pow(vocab[a].cn, power);
				474	int i = 0;
				475	while (word_to_group[i] != c && i < vocab_size)
				476	i++;
				477	d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
				478	for (a = 0; a < table_size; a++) {
				479	//printf("index %lld , word %d\n", a, i);
				480	group_to_table[offset + a] = i;
				481	if (a / (real) table_size > d1) {
				482	i++;
				483	while (word_to_group[i] != c && i < vocab_size)
				484	i++;
				485	d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
				486	}
				487	if (i >= vocab_size)
				488	while (word_to_group[i] != c && i >= 0)
				489	i--;
				490	}
				491	}
				492	}
				493
				494	void SaveNet() {
Marc Kupietz	c14eff0	2016-03-19 15:23:14 +0100	[diff] [blame^]	495	long long a, b;
				496	float len;
				497
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	498	if(type != 3 \|\| negative <= 0) {
				499	fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
				500	return;
				501	}
				502
Marc Kupietz	c14eff0	2016-03-19 15:23:14 +0100	[diff] [blame^]	503	for (b = 0; b < vocab_size; b++) {
				504	len = 0;
				505	for (a = 0; a < layer1_size; a++) len += syn0[a + b * layer1_size] * syn0[a + b * layer1_size];
				506	len = sqrt(len);
				507	for (a = 0; a < layer1_size; a++) syn0[a + b * layer1_size] /= len;
				508	}
				509
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	510	FILE *fnet = fopen(save_net_file, "wb");
				511	if (fnet == NULL) {
				512	printf("Net parameter file not found\n");
				513	exit(1);
				514	}
Marc Kupietz	c697933	2016-03-16 15:29:07 +0100	[diff] [blame]	515	fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	516	fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	517	fclose(fnet);
				518	}
				519
				520	void InitNet() {
				521	long long a, b;
				522	unsigned long long next_random = 1;
Marc Kupietz	57c0df1	2016-03-18 12:48:00 +0100	[diff] [blame]	523	long long read;
				524
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	525	window_layer_size = layer1_size * window * 2;
				526	a = posix_memalign((void **) &syn0, 128,
				527	(long long) vocab_size * layer1_size * sizeof(real));
				528	if (syn0 == NULL) {
				529	printf("Memory allocation failed\n");
				530	exit(1);
				531	}
				532
				533	if (hs) {
				534	a = posix_memalign((void **) &syn1, 128,
				535	(long long) vocab_size * layer1_size * sizeof(real));
				536	if (syn1 == NULL) {
				537	printf("Memory allocation failed\n");
				538	exit(1);
				539	}
				540	a = posix_memalign((void **) &syn1_window, 128,
				541	(long long) vocab_size * window_layer_size * sizeof(real));
				542	if (syn1_window == NULL) {
				543	printf("Memory allocation failed\n");
				544	exit(1);
				545	}
				546	a = posix_memalign((void **) &syn_hidden_word, 128,
				547	(long long) vocab_size * window_hidden_size * sizeof(real));
				548	if (syn_hidden_word == NULL) {
				549	printf("Memory allocation failed\n");
				550	exit(1);
				551	}
				552
				553	for (a = 0; a < vocab_size; a++)
				554	for (b = 0; b < layer1_size; b++)
				555	syn1[a * layer1_size + b] = 0;
				556	for (a = 0; a < vocab_size; a++)
				557	for (b = 0; b < window_layer_size; b++)
				558	syn1_window[a * window_layer_size + b] = 0;
				559	for (a = 0; a < vocab_size; a++)
				560	for (b = 0; b < window_hidden_size; b++)
				561	syn_hidden_word[a * window_hidden_size + b] = 0;
				562	}
				563	if (negative > 0) {
Marc Kupietz	1006a27	2016-03-16 15:50:20 +0100	[diff] [blame]	564	if(type == 0) {
				565	a = posix_memalign((void **) &syn1neg, 128,
				566	(long long) vocab_size * layer1_size * sizeof(real));
				567	if (syn1neg == NULL) {
				568	printf("Memory allocation failed\n");
				569	exit(1);
				570	}
				571	for (a = 0; a < vocab_size; a++)
				572	for (b = 0; b < layer1_size; b++)
				573	syn1neg[a * layer1_size + b] = 0;
				574	} else if (type == 3) {
				575	a = posix_memalign((void **) &syn1neg_window, 128,
				576	(long long) vocab_size * window_layer_size * sizeof(real));
				577	if (syn1neg_window == NULL) {
				578	printf("Memory allocation failed\n");
				579	exit(1);
				580	}
				581	for (a = 0; a < vocab_size; a++)
				582	for (b = 0; b < window_layer_size; b++)
				583	syn1neg_window[a * window_layer_size + b] = 0;
				584	} else if (type == 4) {
				585	a = posix_memalign((void **) &syn_hidden_word_neg, 128,
				586	(long long) vocab_size * window_hidden_size * sizeof(real));
				587	if (syn_hidden_word_neg == NULL) {
				588	printf("Memory allocation failed\n");
				589	exit(1);
				590	}
				591	for (a = 0; a < vocab_size; a++)
				592	for (b = 0; b < window_hidden_size; b++)
				593	syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	594	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	595	}
				596	if (nce > 0) {
				597	a = posix_memalign((void **) &syn1nce, 128,
				598	(long long) vocab_size * layer1_size * sizeof(real));
				599	if (syn1nce == NULL) {
				600	printf("Memory allocation failed\n");
				601	exit(1);
				602	}
				603	a = posix_memalign((void **) &syn1nce_window, 128,
				604	(long long) vocab_size * window_layer_size * sizeof(real));
				605	if (syn1nce_window == NULL) {
				606	printf("Memory allocation failed\n");
				607	exit(1);
				608	}
				609	a = posix_memalign((void **) &syn_hidden_word_nce, 128,
				610	(long long) vocab_size * window_hidden_size * sizeof(real));
				611	if (syn_hidden_word_nce == NULL) {
				612	printf("Memory allocation failed\n");
				613	exit(1);
				614	}
				615
				616	for (a = 0; a < vocab_size; a++)
				617	for (b = 0; b < layer1_size; b++)
				618	syn1nce[a * layer1_size + b] = 0;
				619	for (a = 0; a < vocab_size; a++)
				620	for (b = 0; b < window_layer_size; b++)
				621	syn1nce_window[a * window_layer_size + b] = 0;
				622	for (a = 0; a < vocab_size; a++)
				623	for (b = 0; b < window_hidden_size; b++)
				624	syn_hidden_word_nce[a * window_hidden_size + b] = 0;
				625	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	626
Marc Kupietz	1006a27	2016-03-16 15:50:20 +0100	[diff] [blame]	627	if(type == 4) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	628	a = posix_memalign((void **) &syn_window_hidden, 128,
				629	window_hidden_size * window_layer_size * sizeof(real));
				630	if (syn_window_hidden == NULL) {
				631	printf("Memory allocation failed\n");
				632	exit(1);
				633	}
				634	for (a = 0; a < window_hidden_size * window_layer_size; a++) {
				635	next_random = next_random * (unsigned long long) 25214903917 + 11;
				636	syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
				637	- 0.5) / (window_hidden_size * window_layer_size);
				638	}
				639	}
Marc Kupietz	1006a27	2016-03-16 15:50:20 +0100	[diff] [blame]	640
				641	if (read_net_file[0] == 0) {
				642	for (a = 0; a < vocab_size; a++)
				643	for (b = 0; b < layer1_size; b++) {
				644	next_random = next_random * (unsigned long long) 25214903917
				645	+ 11;
				646	syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
				647	/ (real) 65536) - 0.5) / layer1_size;
				648	}
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	649	} else if(type == 3 && negative > 0) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	650	FILE *fnet = fopen(read_net_file, "rb");
				651	if (fnet == NULL) {
				652	printf("Net parameter file not found\n");
				653	exit(1);
				654	}
Marc Kupietz	57c0df1	2016-03-18 12:48:00 +0100	[diff] [blame]	655	printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
				656	read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
				657	if(read != vocab_size * layer1_size) {
				658	fprintf(stderr, "read-net failed %lld\n", read);
				659	exit(-1);
				660	}
				661	read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
				662	if(read != (long long) vocab_size * window_layer_size) {
				663	fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
				664	(long long) sizeof(real) * vocab_size * window_layer_size);
				665	exit(-1);
				666	}
				667	fgetc(fnet);
				668	if(!feof(fnet)) {
				669	fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
				670	exit(-1);
				671	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	672	fclose(fnet);
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	673	} else {
				674	fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
				675	exit(-1);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	676	}
				677
				678	CreateBinaryTree();
				679	}
				680
				681	void TrainModelThread(void id) {
				682	long long a, b, d, cw, word, last_word, sentence_length = 0,
				683	sentence_position = 0;
				684	long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
				685	long long l1, l2, c, target, label, local_iter = iter;
				686	unsigned long long next_random = (long long) id;
				687	real f, g;
				688	clock_t now;
				689	int input_len_1 = layer1_size;
				690	int window_offset = -1;
				691	if (type == 2 \|\| type == 4) {
				692	input_len_1 = window_layer_size;
				693	}
				694	real neu1 = (real ) calloc(input_len_1, sizeof(real));
				695	real neu1e = (real ) calloc(input_len_1, sizeof(real));
				696
				697	int input_len_2 = 0;
				698	if (type == 4) {
				699	input_len_2 = window_hidden_size;
				700	}
				701	real neu2 = (real ) calloc(input_len_2, sizeof(real));
				702	real neu2e = (real ) calloc(input_len_2, sizeof(real));
				703
				704	FILE *fi = fopen(train_file, "rb");
				705	fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
				706	while (1) {
				707	if (word_count - last_word_count > 10000) {
				708	word_count_actual += word_count - last_word_count;
				709	last_word_count = word_count;
				710	if ((debug_mode > 1)) {
				711	now = clock();
				712	printf(
				713	"%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
				714	13, alpha,
				715	word_count_actual / (real) (iter * train_words + 1)
				716	* 100,
				717	word_count_actual
				718	/ ((real) (now - start + 1)
				719	/ (real) CLOCKS_PER_SEC * 1000));
				720	fflush(stdout);
				721	}
				722	alpha = starting_alpha
				723	* (1 - word_count_actual / (real) (iter * train_words + 1));
				724	if (alpha < starting_alpha * 0.0001)
				725	alpha = starting_alpha * 0.0001;
				726	}
				727	if (sentence_length == 0) {
				728	while (1) {
				729	word = ReadWordIndex(fi);
				730	if (feof(fi))
				731	break;
				732	if (word == -1)
				733	continue;
				734	word_count++;
				735	if (word == 0)
				736	break;
				737	// The subsampling randomly discards frequent words while keeping the ranking same
				738	if (sample > 0) {
				739	real ran = (sqrt(vocab[word].cn / (sample * train_words))
				740	+ 1) * (sample * train_words) / vocab[word].cn;
				741	next_random = next_random * (unsigned long long) 25214903917
				742	+ 11;
				743	if (ran < (next_random & 0xFFFF) / (real) 65536)
				744	continue;
				745	}
				746	sen[sentence_length] = word;
				747	sentence_length++;
				748	if (sentence_length >= MAX_SENTENCE_LENGTH)
				749	break;
				750	}
				751	sentence_position = 0;
				752	}
				753	if (feof(fi) \|\| (word_count > train_words / num_threads)) {
				754	word_count_actual += word_count - last_word_count;
				755	local_iter--;
				756	if (local_iter == 0)
				757	break;
				758	word_count = 0;
				759	last_word_count = 0;
				760	sentence_length = 0;
				761	fseek(fi, file_size / (long long) num_threads * (long long) id,
				762	SEEK_SET);
				763	continue;
				764	}
				765	word = sen[sentence_position];
				766	if (word == -1)
				767	continue;
				768	for (c = 0; c < input_len_1; c++)
				769	neu1[c] = 0;
				770	for (c = 0; c < input_len_1; c++)
				771	neu1e[c] = 0;
				772	for (c = 0; c < input_len_2; c++)
				773	neu2[c] = 0;
				774	for (c = 0; c < input_len_2; c++)
				775	neu2e[c] = 0;
				776	next_random = next_random * (unsigned long long) 25214903917 + 11;
				777	b = next_random % window;
				778	if (type == 0) { //train the cbow architecture
				779	// in -> hidden
				780	cw = 0;
				781	for (a = b; a < window * 2 + 1 - b; a++)
				782	if (a != window) {
				783	c = sentence_position - window + a;
				784	if (c < 0)
				785	continue;
				786	if (c >= sentence_length)
				787	continue;
				788	last_word = sen[c];
				789	if (last_word == -1)
				790	continue;
				791	for (c = 0; c < layer1_size; c++)
				792	neu1[c] += syn0[c + last_word * layer1_size];
				793	cw++;
				794	}
				795	if (cw) {
				796	for (c = 0; c < layer1_size; c++)
				797	neu1[c] /= cw;
				798	if (hs)
				799	for (d = 0; d < vocab[word].codelen; d++) {
				800	f = 0;
				801	l2 = vocab[word].point[d] * layer1_size;
				802	// Propagate hidden -> output
				803	for (c = 0; c < layer1_size; c++)
				804	f += neu1[c] * syn1[c + l2];
				805	if (f <= -MAX_EXP)
				806	continue;
				807	else if (f >= MAX_EXP)
				808	continue;
				809	else
				810	f = expTable[(int) ((f + MAX_EXP)
				811	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				812	// 'g' is the gradient multiplied by the learning rate
				813	g = (1 - vocab[word].code[d] - f) * alpha;
				814	// Propagate errors output -> hidden
				815	for (c = 0; c < layer1_size; c++)
				816	neu1e[c] += g * syn1[c + l2];
				817	// Learn weights hidden -> output
				818	for (c = 0; c < layer1_size; c++)
				819	syn1[c + l2] += g * neu1[c];
				820	if (cap == 1)
				821	for (c = 0; c < layer1_size; c++)
				822	capParam(syn1, c + l2);
				823	}
				824	// NEGATIVE SAMPLING
				825	if (negative > 0)
				826	for (d = 0; d < negative + 1; d++) {
				827	if (d == 0) {
				828	target = word;
				829	label = 1;
				830	} else {
				831	next_random = next_random
				832	* (unsigned long long) 25214903917 + 11;
				833	if (word_to_group != NULL
				834	&& word_to_group[word] != -1) {
				835	target = word;
				836	while (target == word) {
				837	target = group_to_table[word_to_group[word]
				838	* table_size
				839	+ (next_random >> 16) % table_size];
				840	next_random = next_random
				841	* (unsigned long long) 25214903917
				842	+ 11;
				843	}
				844	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				845	} else {
				846	target =
				847	table[(next_random >> 16) % table_size];
				848	}
				849	if (target == 0)
				850	target = next_random % (vocab_size - 1) + 1;
				851	if (target == word)
				852	continue;
				853	label = 0;
				854	}
				855	l2 = target * layer1_size;
				856	f = 0;
				857	for (c = 0; c < layer1_size; c++)
				858	f += neu1[c] * syn1neg[c + l2];
				859	if (f > MAX_EXP)
				860	g = (label - 1) * alpha;
				861	else if (f < -MAX_EXP)
				862	g = (label - 0) * alpha;
				863	else
				864	g = (label
				865	- expTable[(int) ((f + MAX_EXP)
				866	* (EXP_TABLE_SIZE / MAX_EXP / 2))])
				867	* alpha;
				868	for (c = 0; c < layer1_size; c++)
				869	neu1e[c] += g * syn1neg[c + l2];
				870	for (c = 0; c < layer1_size; c++)
				871	syn1neg[c + l2] += g * neu1[c];
				872	if (cap == 1)
				873	for (c = 0; c < layer1_size; c++)
				874	capParam(syn1neg, c + l2);
				875	}
				876	// Noise Contrastive Estimation
				877	if (nce > 0)
				878	for (d = 0; d < nce + 1; d++) {
				879	if (d == 0) {
				880	target = word;
				881	label = 1;
				882	} else {
				883	next_random = next_random
				884	* (unsigned long long) 25214903917 + 11;
				885	if (word_to_group != NULL
				886	&& word_to_group[word] != -1) {
				887	target = word;
				888	while (target == word) {
				889	target = group_to_table[word_to_group[word]
				890	* table_size
				891	+ (next_random >> 16) % table_size];
				892	next_random = next_random
				893	* (unsigned long long) 25214903917
				894	+ 11;
				895	}
				896	} else {
				897	target =
				898	table[(next_random >> 16) % table_size];
				899	}
				900	if (target == 0)
				901	target = next_random % (vocab_size - 1) + 1;
				902	if (target == word)
				903	continue;
				904	label = 0;
				905	}
				906	l2 = target * layer1_size;
				907	f = 0;
				908
				909	for (c = 0; c < layer1_size; c++)
				910	f += neu1[c] * syn1nce[c + l2];
				911	if (f > MAX_EXP)
				912	g = (label - 1) * alpha;
				913	else if (f < -MAX_EXP)
				914	g = (label - 0) * alpha;
				915	else {
				916	f = exp(f);
				917	g =
				918	(label
				919	- f
				920	/ (noise_distribution[target]
				921	* nce + f)) * alpha;
				922	}
				923	for (c = 0; c < layer1_size; c++)
				924	neu1e[c] += g * syn1nce[c + l2];
				925	for (c = 0; c < layer1_size; c++)
				926	syn1nce[c + l2] += g * neu1[c];
				927	if (cap == 1)
				928	for (c = 0; c < layer1_size; c++)
				929	capParam(syn1nce, c + l2);
				930	}
				931	// hidden -> in
				932	for (a = b; a < window * 2 + 1 - b; a++)
				933	if (a != window) {
				934	c = sentence_position - window + a;
				935	if (c < 0)
				936	continue;
				937	if (c >= sentence_length)
				938	continue;
				939	last_word = sen[c];
				940	if (last_word == -1)
				941	continue;
				942	for (c = 0; c < layer1_size; c++)
				943	syn0[c + last_word * layer1_size] += neu1e[c];
				944	}
				945	}
				946	} else if (type == 1) { //train skip-gram
				947	for (a = b; a < window * 2 + 1 - b; a++)
				948	if (a != window) {
				949	c = sentence_position - window + a;
				950	if (c < 0)
				951	continue;
				952	if (c >= sentence_length)
				953	continue;
				954	last_word = sen[c];
				955	if (last_word == -1)
				956	continue;
				957	l1 = last_word * layer1_size;
				958	for (c = 0; c < layer1_size; c++)
				959	neu1e[c] = 0;
				960	// HIERARCHICAL SOFTMAX
				961	if (hs)
				962	for (d = 0; d < vocab[word].codelen; d++) {
				963	f = 0;
				964	l2 = vocab[word].point[d] * layer1_size;
				965	// Propagate hidden -> output
				966	for (c = 0; c < layer1_size; c++)
				967	f += syn0[c + l1] * syn1[c + l2];
				968	if (f <= -MAX_EXP)
				969	continue;
				970	else if (f >= MAX_EXP)
				971	continue;
				972	else
				973	f = expTable[(int) ((f + MAX_EXP)
				974	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				975	// 'g' is the gradient multiplied by the learning rate
				976	g = (1 - vocab[word].code[d] - f) * alpha;
				977	// Propagate errors output -> hidden
				978	for (c = 0; c < layer1_size; c++)
				979	neu1e[c] += g * syn1[c + l2];
				980	// Learn weights hidden -> output
				981	for (c = 0; c < layer1_size; c++)
				982	syn1[c + l2] += g * syn0[c + l1];
				983	if (cap == 1)
				984	for (c = 0; c < layer1_size; c++)
				985	capParam(syn1, c + l2);
				986	}
				987	// NEGATIVE SAMPLING
				988	if (negative > 0)
				989	for (d = 0; d < negative + 1; d++) {
				990	if (d == 0) {
				991	target = word;
				992	label = 1;
				993	} else {
				994	next_random = next_random
				995	* (unsigned long long) 25214903917 + 11;
				996	if (word_to_group != NULL
				997	&& word_to_group[word] != -1) {
				998	target = word;
				999	while (target == word) {
				1000	target =
				1001	group_to_table[word_to_group[word]
				1002	* table_size
				1003	+ (next_random >> 16)
				1004	% table_size];
				1005	next_random =
				1006	next_random
				1007	* (unsigned long long) 25214903917
				1008	+ 11;
				1009	}
				1010	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1011	} else {
				1012	target = table[(next_random >> 16)
				1013	% table_size];
				1014	}
				1015	if (target == 0)
				1016	target = next_random % (vocab_size - 1) + 1;
				1017	if (target == word)
				1018	continue;
				1019	label = 0;
				1020	}
				1021	l2 = target * layer1_size;
				1022	f = 0;
				1023	for (c = 0; c < layer1_size; c++)
				1024	f += syn0[c + l1] * syn1neg[c + l2];
				1025	if (f > MAX_EXP)
				1026	g = (label - 1) * alpha;
				1027	else if (f < -MAX_EXP)
				1028	g = (label - 0) * alpha;
				1029	else
				1030	g =
				1031	(label
				1032	- expTable[(int) ((f + MAX_EXP)
				1033	* (EXP_TABLE_SIZE
				1034	/ MAX_EXP / 2))])
				1035	* alpha;
				1036	for (c = 0; c < layer1_size; c++)
				1037	neu1e[c] += g * syn1neg[c + l2];
				1038	for (c = 0; c < layer1_size; c++)
				1039	syn1neg[c + l2] += g * syn0[c + l1];
				1040	if (cap == 1)
				1041	for (c = 0; c < layer1_size; c++)
				1042	capParam(syn1neg, c + l2);
				1043	}
				1044	//Noise Contrastive Estimation
				1045	if (nce > 0)
				1046	for (d = 0; d < nce + 1; d++) {
				1047	if (d == 0) {
				1048	target = word;
				1049	label = 1;
				1050	} else {
				1051	next_random = next_random
				1052	* (unsigned long long) 25214903917 + 11;
				1053	if (word_to_group != NULL
				1054	&& word_to_group[word] != -1) {
				1055	target = word;
				1056	while (target == word) {
				1057	target =
				1058	group_to_table[word_to_group[word]
				1059	* table_size
				1060	+ (next_random >> 16)
				1061	% table_size];
				1062	next_random =
				1063	next_random
				1064	* (unsigned long long) 25214903917
				1065	+ 11;
				1066	}
				1067	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1068	} else {
				1069	target = table[(next_random >> 16)
				1070	% table_size];
				1071	}
				1072	if (target == 0)
				1073	target = next_random % (vocab_size - 1) + 1;
				1074	if (target == word)
				1075	continue;
				1076	label = 0;
				1077	}
				1078	l2 = target * layer1_size;
				1079	f = 0;
				1080	for (c = 0; c < layer1_size; c++)
				1081	f += syn0[c + l1] * syn1nce[c + l2];
				1082	if (f > MAX_EXP)
				1083	g = (label - 1) * alpha;
				1084	else if (f < -MAX_EXP)
				1085	g = (label - 0) * alpha;
				1086	else {
				1087	f = exp(f);
				1088	g = (label
				1089	- f
				1090	/ (noise_distribution[target]
				1091	* nce + f)) * alpha;
				1092	}
				1093	for (c = 0; c < layer1_size; c++)
				1094	neu1e[c] += g * syn1nce[c + l2];
				1095	for (c = 0; c < layer1_size; c++)
				1096	syn1nce[c + l2] += g * syn0[c + l1];
				1097	if (cap == 1)
				1098	for (c = 0; c < layer1_size; c++)
				1099	capParam(syn1nce, c + l2);
				1100	}
				1101	// Learn weights input -> hidden
				1102	for (c = 0; c < layer1_size; c++)
				1103	syn0[c + l1] += neu1e[c];
				1104	}
				1105	} else if (type == 2) { //train the cwindow architecture
				1106	// in -> hidden
				1107	cw = 0;
				1108	for (a = 0; a < window * 2 + 1; a++)
				1109	if (a != window) {
				1110	c = sentence_position - window + a;
				1111	if (c < 0)
				1112	continue;
				1113	if (c >= sentence_length)
				1114	continue;
				1115	last_word = sen[c];
				1116	if (last_word == -1)
				1117	continue;
				1118	window_offset = a * layer1_size;
				1119	if (a > window)
				1120	window_offset -= layer1_size;
				1121	for (c = 0; c < layer1_size; c++)
				1122	neu1[c + window_offset] += syn0[c
				1123	+ last_word * layer1_size];
				1124	cw++;
				1125	}
				1126	if (cw) {
				1127	if (hs)
				1128	for (d = 0; d < vocab[word].codelen; d++) {
				1129	f = 0;
				1130	l2 = vocab[word].point[d] * window_layer_size;
				1131	// Propagate hidden -> output
				1132	for (c = 0; c < window_layer_size; c++)
				1133	f += neu1[c] * syn1_window[c + l2];
				1134	if (f <= -MAX_EXP)
				1135	continue;
				1136	else if (f >= MAX_EXP)
				1137	continue;
				1138	else
				1139	f = expTable[(int) ((f + MAX_EXP)
				1140	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1141	// 'g' is the gradient multiplied by the learning rate
				1142	g = (1 - vocab[word].code[d] - f) * alpha;
				1143	// Propagate errors output -> hidden
				1144	for (c = 0; c < window_layer_size; c++)
				1145	neu1e[c] += g * syn1_window[c + l2];
				1146	// Learn weights hidden -> output
				1147	for (c = 0; c < window_layer_size; c++)
				1148	syn1_window[c + l2] += g * neu1[c];
				1149	if (cap == 1)
				1150	for (c = 0; c < window_layer_size; c++)
				1151	capParam(syn1_window, c + l2);
				1152	}
				1153	// NEGATIVE SAMPLING
				1154	if (negative > 0)
				1155	for (d = 0; d < negative + 1; d++) {
				1156	if (d == 0) {
				1157	target = word;
				1158	label = 1;
				1159	} else {
				1160	next_random = next_random
				1161	* (unsigned long long) 25214903917 + 11;
				1162	if (word_to_group != NULL
				1163	&& word_to_group[word] != -1) {
				1164	target = word;
				1165	while (target == word) {
				1166	target = group_to_table[word_to_group[word]
				1167	* table_size
				1168	+ (next_random >> 16) % table_size];
				1169	next_random = next_random
				1170	* (unsigned long long) 25214903917
				1171	+ 11;
				1172	}
				1173	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1174	} else {
				1175	target =
				1176	table[(next_random >> 16) % table_size];
				1177	}
				1178	if (target == 0)
				1179	target = next_random % (vocab_size - 1) + 1;
				1180	if (target == word)
				1181	continue;
				1182	label = 0;
				1183	}
				1184	l2 = target * window_layer_size;
				1185	f = 0;
				1186	for (c = 0; c < window_layer_size; c++)
				1187	f += neu1[c] * syn1neg_window[c + l2];
				1188	if (f > MAX_EXP)
				1189	g = (label - 1) * alpha;
				1190	else if (f < -MAX_EXP)
				1191	g = (label - 0) * alpha;
				1192	else
				1193	g = (label
				1194	- expTable[(int) ((f + MAX_EXP)
				1195	* (EXP_TABLE_SIZE / MAX_EXP / 2))])
				1196	* alpha;
				1197	for (c = 0; c < window_layer_size; c++)
				1198	neu1e[c] += g * syn1neg_window[c + l2];
				1199	for (c = 0; c < window_layer_size; c++)
				1200	syn1neg_window[c + l2] += g * neu1[c];
				1201	if (cap == 1)
				1202	for (c = 0; c < window_layer_size; c++)
				1203	capParam(syn1neg_window, c + l2);
				1204	}
				1205	// Noise Contrastive Estimation
				1206	if (nce > 0)
				1207	for (d = 0; d < nce + 1; d++) {
				1208	if (d == 0) {
				1209	target = word;
				1210	label = 1;
				1211	} else {
				1212	next_random = next_random
				1213	* (unsigned long long) 25214903917 + 11;
				1214	if (word_to_group != NULL
				1215	&& word_to_group[word] != -1) {
				1216	target = word;
				1217	while (target == word) {
				1218	target = group_to_table[word_to_group[word]
				1219	* table_size
				1220	+ (next_random >> 16) % table_size];
				1221	next_random = next_random
				1222	* (unsigned long long) 25214903917
				1223	+ 11;
				1224	}
				1225	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1226	} else {
				1227	target =
				1228	table[(next_random >> 16) % table_size];
				1229	}
				1230	if (target == 0)
				1231	target = next_random % (vocab_size - 1) + 1;
				1232	if (target == word)
				1233	continue;
				1234	label = 0;
				1235	}
				1236	l2 = target * window_layer_size;
				1237	f = 0;
				1238	for (c = 0; c < window_layer_size; c++)
				1239	f += neu1[c] * syn1nce_window[c + l2];
				1240	if (f > MAX_EXP)
				1241	g = (label - 1) * alpha;
				1242	else if (f < -MAX_EXP)
				1243	g = (label - 0) * alpha;
				1244	else {
				1245	f = exp(f);
				1246	g =
				1247	(label
				1248	- f
				1249	/ (noise_distribution[target]
				1250	* nce + f)) * alpha;
				1251	}
				1252	for (c = 0; c < window_layer_size; c++)
				1253	neu1e[c] += g * syn1nce_window[c + l2];
				1254	for (c = 0; c < window_layer_size; c++)
				1255	syn1nce_window[c + l2] += g * neu1[c];
				1256	if (cap == 1)
				1257	for (c = 0; c < window_layer_size; c++)
				1258	capParam(syn1nce_window, c + l2);
				1259	}
				1260	// hidden -> in
				1261	for (a = 0; a < window * 2 + 1; a++)
				1262	if (a != window) {
				1263	c = sentence_position - window + a;
				1264	if (c < 0)
				1265	continue;
				1266	if (c >= sentence_length)
				1267	continue;
				1268	last_word = sen[c];
				1269	if (last_word == -1)
				1270	continue;
				1271	window_offset = a * layer1_size;
				1272	if (a > window)
				1273	window_offset -= layer1_size;
				1274	for (c = 0; c < layer1_size; c++)
				1275	syn0[c + last_word * layer1_size] += neu1e[c
				1276	+ window_offset];
				1277	}
				1278	}
				1279	} else if (type == 3) { //train structured skip-gram
				1280	for (a = 0; a < window * 2 + 1; a++)
				1281	if (a != window) {
				1282	c = sentence_position - window + a;
				1283	if (c < 0)
				1284	continue;
				1285	if (c >= sentence_length)
				1286	continue;
				1287	last_word = sen[c];
				1288	if (last_word == -1)
				1289	continue;
				1290	l1 = last_word * layer1_size;
				1291	window_offset = a * layer1_size;
				1292	if (a > window)
				1293	window_offset -= layer1_size;
				1294	for (c = 0; c < layer1_size; c++)
				1295	neu1e[c] = 0;
				1296	// HIERARCHICAL SOFTMAX
				1297	if (hs)
				1298	for (d = 0; d < vocab[word].codelen; d++) {
				1299	f = 0;
				1300	l2 = vocab[word].point[d] * window_layer_size;
				1301	// Propagate hidden -> output
				1302	for (c = 0; c < layer1_size; c++)
				1303	f += syn0[c + l1]
				1304	* syn1_window[c + l2 + window_offset];
				1305	if (f <= -MAX_EXP)
				1306	continue;
				1307	else if (f >= MAX_EXP)
				1308	continue;
				1309	else
				1310	f = expTable[(int) ((f + MAX_EXP)
				1311	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1312	// 'g' is the gradient multiplied by the learning rate
				1313	g = (1 - vocab[word].code[d] - f) * alpha;
				1314	// Propagate errors output -> hidden
				1315	for (c = 0; c < layer1_size; c++)
				1316	neu1e[c] += g
				1317	* syn1_window[c + l2 + window_offset];
				1318	// Learn weights hidden -> output
				1319	for (c = 0; c < layer1_size; c++)
				1320	syn1[c + l2 + window_offset] += g
				1321	* syn0[c + l1];
				1322	if (cap == 1)
				1323	for (c = 0; c < layer1_size; c++)
				1324	capParam(syn1, c + l2 + window_offset);
				1325	}
				1326	// NEGATIVE SAMPLING
				1327	if (negative > 0)
				1328	for (d = 0; d < negative + 1; d++) {
				1329	if (d == 0) {
				1330	target = word;
				1331	label = 1;
				1332	} else {
				1333	next_random = next_random
				1334	* (unsigned long long) 25214903917 + 11;
				1335	if (word_to_group != NULL
				1336	&& word_to_group[word] != -1) {
				1337	target = word;
				1338	while (target == word) {
				1339	target =
				1340	group_to_table[word_to_group[word]
				1341	* table_size
				1342	+ (next_random >> 16)
				1343	% table_size];
				1344	next_random =
				1345	next_random
				1346	* (unsigned long long) 25214903917
				1347	+ 11;
				1348	}
				1349	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1350	} else {
				1351	target = table[(next_random >> 16)
				1352	% table_size];
				1353	}
				1354	if (target == 0)
				1355	target = next_random % (vocab_size - 1) + 1;
				1356	if (target == word)
				1357	continue;
				1358	label = 0;
				1359	}
				1360	l2 = target * window_layer_size;
				1361	f = 0;
				1362	for (c = 0; c < layer1_size; c++)
				1363	f +=
				1364	syn0[c + l1]
				1365	* syn1neg_window[c + l2
				1366	+ window_offset];
				1367	if (f > MAX_EXP)
				1368	g = (label - 1) * alpha;
				1369	else if (f < -MAX_EXP)
				1370	g = (label - 0) * alpha;
				1371	else
				1372	g =
				1373	(label
				1374	- expTable[(int) ((f + MAX_EXP)
				1375	* (EXP_TABLE_SIZE
				1376	/ MAX_EXP / 2))])
				1377	* alpha;
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1378	if(debug_mode > 2 && ((long long) id) == 0) {
				1379	printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
				1380	printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
				1381	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1382	for (c = 0; c < layer1_size; c++)
				1383	neu1e[c] +=
				1384	g
				1385	* syn1neg_window[c + l2
				1386	+ window_offset];
				1387	for (c = 0; c < layer1_size; c++)
				1388	syn1neg_window[c + l2 + window_offset] += g
				1389	* syn0[c + l1];
				1390	if (cap == 1)
				1391	for (c = 0; c < layer1_size; c++)
				1392	capParam(syn1neg_window,
				1393	c + l2 + window_offset);
				1394	}
				1395	// Noise Constrastive Estimation
				1396	if (nce > 0)
				1397	for (d = 0; d < nce + 1; d++) {
				1398	if (d == 0) {
				1399	target = word;
				1400	label = 1;
				1401	} else {
				1402	next_random = next_random
				1403	* (unsigned long long) 25214903917 + 11;
				1404	if (word_to_group != NULL
				1405	&& word_to_group[word] != -1) {
				1406	target = word;
				1407	while (target == word) {
				1408	target =
				1409	group_to_table[word_to_group[word]
				1410	* table_size
				1411	+ (next_random >> 16)
				1412	% table_size];
				1413	next_random =
				1414	next_random
				1415	* (unsigned long long) 25214903917
				1416	+ 11;
				1417	}
				1418	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1419	} else {
				1420	target = table[(next_random >> 16)
				1421	% table_size];
				1422	}
				1423	if (target == 0)
				1424	target = next_random % (vocab_size - 1) + 1;
				1425	if (target == word)
				1426	continue;
				1427	label = 0;
				1428	}
				1429	l2 = target * window_layer_size;
				1430	f = 0;
				1431	for (c = 0; c < layer1_size; c++)
				1432	f +=
				1433	syn0[c + l1]
				1434	* syn1nce_window[c + l2
				1435	+ window_offset];
				1436	if (f > MAX_EXP)
				1437	g = (label - 1) * alpha;
				1438	else if (f < -MAX_EXP)
				1439	g = (label - 0) * alpha;
				1440	else {
				1441	f = exp(f);
				1442	g = (label
				1443	- f
				1444	/ (noise_distribution[target]
				1445	* nce + f)) * alpha;
				1446	}
				1447	for (c = 0; c < layer1_size; c++)
				1448	neu1e[c] +=
				1449	g
				1450	* syn1nce_window[c + l2
				1451	+ window_offset];
				1452	for (c = 0; c < layer1_size; c++)
				1453	syn1nce_window[c + l2 + window_offset] += g
				1454	* syn0[c + l1];
				1455	if (cap == 1)
				1456	for (c = 0; c < layer1_size; c++)
				1457	capParam(syn1nce_window,
				1458	c + l2 + window_offset);
				1459	}
				1460	// Learn weights input -> hidden
				1461	for (c = 0; c < layer1_size; c++) {
				1462	syn0[c + l1] += neu1e[c];
				1463	if (syn0[c + l1] > 50)
				1464	syn0[c + l1] = 50;
				1465	if (syn0[c + l1] < -50)
				1466	syn0[c + l1] = -50;
				1467	}
				1468	}
				1469	} else if (type == 4) { //training senna
				1470	// in -> hidden
				1471	cw = 0;
				1472	for (a = 0; a < window * 2 + 1; a++)
				1473	if (a != window) {
				1474	c = sentence_position - window + a;
				1475	if (c < 0)
				1476	continue;
				1477	if (c >= sentence_length)
				1478	continue;
				1479	last_word = sen[c];
				1480	if (last_word == -1)
				1481	continue;
				1482	window_offset = a * layer1_size;
				1483	if (a > window)
				1484	window_offset -= layer1_size;
				1485	for (c = 0; c < layer1_size; c++)
				1486	neu1[c + window_offset] += syn0[c
				1487	+ last_word * layer1_size];
				1488	cw++;
				1489	}
				1490	if (cw) {
				1491	for (a = 0; a < window_hidden_size; a++) {
				1492	c = a * window_layer_size;
				1493	for (b = 0; b < window_layer_size; b++) {
				1494	neu2[a] += syn_window_hidden[c + b] * neu1[b];
				1495	}
				1496	}
				1497	if (hs)
				1498	for (d = 0; d < vocab[word].codelen; d++) {
				1499	f = 0;
				1500	l2 = vocab[word].point[d] * window_hidden_size;
				1501	// Propagate hidden -> output
				1502	for (c = 0; c < window_hidden_size; c++)
				1503	f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
				1504	if (f <= -MAX_EXP)
				1505	continue;
				1506	else if (f >= MAX_EXP)
				1507	continue;
				1508	else
				1509	f = expTable[(int) ((f + MAX_EXP)
				1510	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1511	// 'g' is the gradient multiplied by the learning rate
				1512	g = (1 - vocab[word].code[d] - f) * alpha;
				1513	// Propagate errors output -> hidden
				1514	for (c = 0; c < window_hidden_size; c++)
				1515	neu2e[c] += dHardTanh(neu2[c], g) * g
				1516	* syn_hidden_word[c + l2];
				1517	// Learn weights hidden -> output
				1518	for (c = 0; c < window_hidden_size; c++)
				1519	syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
				1520	* neu2[c];
				1521	}
				1522	// NEGATIVE SAMPLING
				1523	if (negative > 0)
				1524	for (d = 0; d < negative + 1; d++) {
				1525	if (d == 0) {
				1526	target = word;
				1527	label = 1;
				1528	} else {
				1529	next_random = next_random
				1530	* (unsigned long long) 25214903917 + 11;
				1531	if (word_to_group != NULL
				1532	&& word_to_group[word] != -1) {
				1533	target = word;
				1534	while (target == word) {
				1535	target = group_to_table[word_to_group[word]
				1536	* table_size
				1537	+ (next_random >> 16) % table_size];
				1538	next_random = next_random
				1539	* (unsigned long long) 25214903917
				1540	+ 11;
				1541	}
				1542	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1543	} else {
				1544	target =
				1545	table[(next_random >> 16) % table_size];
				1546	}
				1547	if (target == 0)
				1548	target = next_random % (vocab_size - 1) + 1;
				1549	if (target == word)
				1550	continue;
				1551	label = 0;
				1552	}
				1553	l2 = target * window_hidden_size;
				1554	f = 0;
				1555	for (c = 0; c < window_hidden_size; c++)
				1556	f += hardTanh(neu2[c])
				1557	* syn_hidden_word_neg[c + l2];
				1558	if (f > MAX_EXP)
				1559	g = (label - 1) * alpha / negative;
				1560	else if (f < -MAX_EXP)
				1561	g = (label - 0) * alpha / negative;
				1562	else
				1563	g = (label
				1564	- expTable[(int) ((f + MAX_EXP)
				1565	* (EXP_TABLE_SIZE / MAX_EXP / 2))])
				1566	* alpha / negative;
				1567	for (c = 0; c < window_hidden_size; c++)
				1568	neu2e[c] += dHardTanh(neu2[c], g) * g
				1569	* syn_hidden_word_neg[c + l2];
				1570	for (c = 0; c < window_hidden_size; c++)
				1571	syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
				1572	* g * neu2[c];
				1573	}
				1574	for (a = 0; a < window_hidden_size; a++)
				1575	for (b = 0; b < window_layer_size; b++)
				1576	neu1e[b] += neu2e[a]
				1577	* syn_window_hidden[a * window_layer_size + b];
				1578	for (a = 0; a < window_hidden_size; a++)
				1579	for (b = 0; b < window_layer_size; b++)
				1580	syn_window_hidden[a * window_layer_size + b] += neu2e[a]
				1581	* neu1[b];
				1582	// hidden -> in
				1583	for (a = 0; a < window * 2 + 1; a++)
				1584	if (a != window) {
				1585	c = sentence_position - window + a;
				1586	if (c < 0)
				1587	continue;
				1588	if (c >= sentence_length)
				1589	continue;
				1590	last_word = sen[c];
				1591	if (last_word == -1)
				1592	continue;
				1593	window_offset = a * layer1_size;
				1594	if (a > window)
				1595	window_offset -= layer1_size;
				1596	for (c = 0; c < layer1_size; c++)
				1597	syn0[c + last_word * layer1_size] += neu1e[c
				1598	+ window_offset];
				1599	}
				1600	}
				1601	} else {
				1602	printf("unknown type %i", type);
				1603	exit(0);
				1604	}
				1605	sentence_position++;
				1606	if (sentence_position >= sentence_length) {
				1607	sentence_length = 0;
				1608	continue;
				1609	}
				1610	}
				1611	fclose(fi);
				1612	free(neu1);
				1613	free(neu1e);
				1614	pthread_exit(NULL);
				1615	}
				1616
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1617	void ShowCollocations() {
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1618	long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1619	real f, max_f, maxmax_f;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1620	real *target_sums, bestf[MAX_CC], worstbest;
				1621	long besti[MAX_CC];
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1622	int N = 10, bestp[MAX_CC];
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1623	a = posix_memalign((void *) &target_sums, 128, vocab_size sizeof(real));
				1624
				1625	for (d = cc; d < vocab_size; d++) {
				1626	for (b = 0; b < vocab_size; b++)
				1627	target_sums[b]=0;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1628	for (b = 0; b < N; b++)
				1629	bestf[b]=-1;
				1630	worstbest = -1;
				1631
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1632	maxmax_f = -1;
				1633	maxmax_target = 0;
Marc Kupietz	0a664c1	2016-03-18 13:18:22 +0100	[diff] [blame]	1634	for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1635	if (a != window) {
				1636	max_f = -1;
				1637	window_offset = a * layer1_size;
				1638	if (a > window)
				1639	window_offset -= layer1_size;
				1640	for(target = 0; target < vocab_size; target ++) {
				1641	if(target == d)
				1642	continue;
				1643	f = 0;
				1644	for (c = 0; c < layer1_size; c++)
				1645	f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
				1646	if (f < -MAX_EXP)
				1647	continue;
				1648	else if (f > MAX_EXP)
				1649	continue;
				1650	else
				1651	f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1652	if(f > max_f) {
				1653	max_f = f;
				1654	max_target = target;
				1655	}
Marc Kupietz	0fb5d61	2016-03-18 11:01:21 +0100	[diff] [blame]	1656	target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1657	if(f > worstbest) {
				1658	for (b = 0; b < N; b++) {
				1659	if (f > bestf[b]) {
				1660	for (e = N - 1; e > b; e--) {
				1661	bestf[e] = bestf[e - 1];
				1662	besti[e] = besti[e - 1];
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1663	bestp[e] = bestp[e - 1];
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1664	}
				1665	bestf[b] = f;
				1666	besti[b] = target;
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1667	bestp[b] = window-a;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1668	break;
				1669	}
				1670	}
				1671	worstbest = bestf[N-1];
				1672	}
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1673	}
				1674	printf("%s (%.2f) ", vocab[max_target].word, max_f);
				1675	if(max_f > maxmax_f) {
				1676	maxmax_f = max_f;
				1677	maxmax_target = max_target;
				1678	}
				1679	} else {
				1680	printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
				1681	}
				1682	}
				1683	max_f = -1;
				1684	for (b = 0; b < vocab_size; b++) {
				1685	if(target_sums[b] > max_f) {
				1686	max_f = target_sums[b];
				1687	max_target = b;
				1688	}
				1689	}
				1690	printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz	0fb5d61	2016-03-18 11:01:21 +0100	[diff] [blame]	1691	vocab[max_target].word, max_f,
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1692	vocab[maxmax_target].word, maxmax_f);
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1693	for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1694	printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1695	printf("\n");
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1696	}
				1697	}
				1698
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1699	void TrainModel() {
				1700	long a, b, c, d;
				1701	FILE *fo;
				1702	pthread_t pt = (pthread_t ) malloc(num_threads * sizeof(pthread_t));
				1703	printf("Starting training using file %s\n", train_file);
				1704	starting_alpha = alpha;
				1705	if (read_vocab_file[0] != 0)
				1706	ReadVocab();
				1707	else
				1708	LearnVocabFromTrainFile();
				1709	if (save_vocab_file[0] != 0)
				1710	SaveVocab();
				1711	if (output_file[0] == 0)
				1712	return;
				1713	InitNet();
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1714	if(cc > 0)
				1715	ShowCollocations();
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1716	if (negative > 0 \|\| nce > 0)
				1717	InitUnigramTable();
				1718	if (negative_classes_file[0] != 0)
				1719	InitClassUnigramTable();
				1720	start = clock();
				1721	for (a = 0; a < num_threads; a++)
				1722	pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
				1723	for (a = 0; a < num_threads; a++)
				1724	pthread_join(pt[a], NULL);
				1725	fo = fopen(output_file, "wb");
				1726	if (classes == 0) {
				1727	// Save the word vectors
				1728	fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
				1729	for (a = 0; a < vocab_size; a++) {
				1730	fprintf(fo, "%s ", vocab[a].word);
				1731	if (binary)
				1732	for (b = 0; b < layer1_size; b++)
				1733	fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
				1734	else
				1735	for (b = 0; b < layer1_size; b++)
				1736	fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
				1737	fprintf(fo, "\n");
				1738	}
				1739	} else {
				1740	// Run K-means on the word vectors
				1741	int clcn = classes, iter = 10, closeid;
				1742	int centcn = (int ) malloc(classes * sizeof(int));
				1743	int cl = (int ) calloc(vocab_size, sizeof(int));
				1744	real closev, x;
				1745	real cent = (real ) calloc(classes * layer1_size, sizeof(real));
				1746	for (a = 0; a < vocab_size; a++)
				1747	cl[a] = a % clcn;
				1748	for (a = 0; a < iter; a++) {
				1749	for (b = 0; b < clcn * layer1_size; b++)
				1750	cent[b] = 0;
				1751	for (b = 0; b < clcn; b++)
				1752	centcn[b] = 1;
				1753	for (c = 0; c < vocab_size; c++) {
				1754	for (d = 0; d < layer1_size; d++)
				1755	cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
				1756	centcn[cl[c]]++;
				1757	}
				1758	for (b = 0; b < clcn; b++) {
				1759	closev = 0;
				1760	for (c = 0; c < layer1_size; c++) {
				1761	cent[layer1_size * b + c] /= centcn[b];
				1762	closev += cent[layer1_size * b + c]
				1763	* cent[layer1_size * b + c];
				1764	}
				1765	closev = sqrt(closev);
				1766	for (c = 0; c < layer1_size; c++)
				1767	cent[layer1_size * b + c] /= closev;
				1768	}
				1769	for (c = 0; c < vocab_size; c++) {
				1770	closev = -10;
				1771	closeid = 0;
				1772	for (d = 0; d < clcn; d++) {
				1773	x = 0;
				1774	for (b = 0; b < layer1_size; b++)
				1775	x += cent[layer1_size * d + b]
				1776	* syn0[c * layer1_size + b];
				1777	if (x > closev) {
				1778	closev = x;
				1779	closeid = d;
				1780	}
				1781	}
				1782	cl[c] = closeid;
				1783	}
				1784	}
				1785	// Save the K-means classes
				1786	for (a = 0; a < vocab_size; a++)
				1787	fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
				1788	free(centcn);
				1789	free(cent);
				1790	free(cl);
				1791	}
				1792	fclose(fo);
				1793	if (save_net_file[0] != 0)
				1794	SaveNet();
				1795	}
				1796
				1797	int ArgPos(char str, int argc, char *argv) {
				1798	int a;
				1799	for (a = 1; a < argc; a++)
				1800	if (!strcmp(str, argv[a])) {
				1801	if (a == argc - 1) {
				1802	printf("Argument missing for %s\n", str);
				1803	exit(1);
				1804	}
				1805	return a;
				1806	}
				1807	return -1;
				1808	}
				1809
				1810	int main(int argc, char **argv) {
				1811	int i;
				1812	if (argc == 1) {
				1813	printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
				1814	printf("Options:\n");
				1815	printf("Parameters for training:\n");
				1816	printf("\t-train <file>\n");
				1817	printf("\t\tUse text data from <file> to train the model\n");
				1818	printf("\t-output <file>\n");
				1819	printf(
				1820	"\t\tUse <file> to save the resulting word vectors / word clusters\n");
				1821	printf("\t-size <int>\n");
				1822	printf("\t\tSet size of word vectors; default is 100\n");
				1823	printf("\t-window <int>\n");
				1824	printf("\t\tSet max skip length between words; default is 5\n");
				1825	printf("\t-sample <float>\n");
				1826	printf(
				1827	"\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
				1828	printf(
				1829	"\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
				1830	printf("\t-hs <int>\n");
				1831	printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
				1832	printf("\t-negative <int>\n");
				1833	printf(
				1834	"\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
				1835	printf("\t-negative-classes <file>\n");
				1836	printf("\t\tNegative classes to sample from\n");
				1837	printf("\t-nce <int>\n");
				1838	printf(
				1839	"\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
				1840	printf("\t-threads <int>\n");
				1841	printf("\t\tUse <int> threads (default 12)\n");
				1842	printf("\t-iter <int>\n");
				1843	printf("\t\tRun more training iterations (default 5)\n");
				1844	printf("\t-min-count <int>\n");
				1845	printf(
				1846	"\t\tThis will discard words that appear less than <int> times; default is 5\n");
				1847	printf("\t-alpha <float>\n");
				1848	printf(
				1849	"\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
				1850	printf("\t-classes <int>\n");
				1851	printf(
				1852	"\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
				1853	printf("\t-debug <int>\n");
				1854	printf(
				1855	"\t\tSet the debug mode (default = 2 = more info during training)\n");
				1856	printf("\t-binary <int>\n");
				1857	printf(
				1858	"\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
				1859	printf("\t-save-vocab <file>\n");
				1860	printf("\t\tThe vocabulary will be saved to <file>\n");
				1861	printf("\t-read-vocab <file>\n");
				1862	printf(
				1863	"\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
				1864	printf("\t-read-net <file>\n");
				1865	printf(
				1866	"\t\tThe net parameters will be read from <file>, not initialized randomly\n");
				1867	printf("\t-save-net <file>\n");
				1868	printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1869	printf("\t-show-cc <int>\n");
				1870	printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1871	printf("\t-type <int>\n");
				1872	printf(
				1873	"\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
				1874	printf("\t-cap <int>\n");
				1875	printf(
				1876	"\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
				1877	printf("\nExamples:\n");
				1878	printf(
				1879	"./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
				1880	return 0;
				1881	}
				1882	output_file[0] = 0;
				1883	save_vocab_file[0] = 0;
				1884	read_vocab_file[0] = 0;
				1885	save_net_file[0] = 0;
				1886	read_net_file[0] = 0;
				1887	negative_classes_file[0] = 0;
				1888	if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
				1889	layer1_size = atoi(argv[i + 1]);
				1890	if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
				1891	strcpy(train_file, argv[i + 1]);
				1892	if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
				1893	strcpy(save_vocab_file, argv[i + 1]);
				1894	if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
				1895	strcpy(read_vocab_file, argv[i + 1]);
				1896	if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
				1897	strcpy(save_net_file, argv[i + 1]);
				1898	if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
				1899	strcpy(read_net_file, argv[i + 1]);
				1900	if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
				1901	debug_mode = atoi(argv[i + 1]);
				1902	if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
				1903	binary = atoi(argv[i + 1]);
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1904	if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
				1905	cc = atoi(argv[i + 1]);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1906	if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
				1907	type = atoi(argv[i + 1]);
				1908	if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
				1909	strcpy(output_file, argv[i + 1]);
				1910	if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
				1911	window = atoi(argv[i + 1]);
				1912	if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
				1913	sample = atof(argv[i + 1]);
				1914	if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
				1915	hs = atoi(argv[i + 1]);
				1916	if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
				1917	negative = atoi(argv[i + 1]);
				1918	if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
				1919	strcpy(negative_classes_file, argv[i + 1]);
				1920	if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
				1921	nce = atoi(argv[i + 1]);
				1922	if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
				1923	num_threads = atoi(argv[i + 1]);
				1924	if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
				1925	iter = atoi(argv[i + 1]);
				1926	if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
				1927	min_count = atoi(argv[i + 1]);
				1928	if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
				1929	classes = atoi(argv[i + 1]);
				1930	if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
				1931	cap = atoi(argv[i + 1]);
				1932	if (type == 0 \|\| type == 2 \|\| type == 4)
				1933	alpha = 0.05;
				1934	if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
				1935	alpha = atof(argv[i + 1]);
				1936	vocab = (struct vocab_word *) calloc(vocab_max_size,
				1937	sizeof(struct vocab_word));
				1938	vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
				1939	expTable = (real ) malloc((EXP_TABLE_SIZE + 1) sizeof(real));
				1940	for (i = 0; i < EXP_TABLE_SIZE; i++) {
				1941	expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
				1942	expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
				1943	}
				1944	TrainModel();
				1945	return 0;
				1946	}
				1947