1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
27const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
40struct vocab_word *vocab;
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
42 num_threads = 12, min_reduce = 1;
43int *vocab_hash;
44long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
45long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
46 classes = 0;
47real alpha = 0.025, starting_alpha, sample = 1e-3;
48real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
49clock_t start;
50
51real *syn1_window, *syn1neg_window, *syn1nce_window;
52int w_offset, window_layer_size;
53
54int window_hidden_size = 500;
55real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
56 *syn_hidden_word_nce;
57
58int hs = 0, negative = 5;
59const int table_size = 1e8;
60int *table;
61
62long cc = 0;
63
64// contrastive negative sampling
65char negative_classes_file[MAX_STRING];
66int *word_to_group;
67int *group_to_table; //group_size*table_size
68int class_number;
69
70//nce
71real* noise_distribution;
72int nce = 0;
73
74//param caps
75real CAP_VALUE = 50;
76int cap = 0;
77
78void capParam(real* array, int index) {
79 if (array[index] > CAP_VALUE)
80 array[index] = CAP_VALUE;
81 else if (array[index] < -CAP_VALUE)
82 array[index] = -CAP_VALUE;
83}
84
85real hardTanh(real x) {
86 if (x >= 1) {
87 return 1;
88 } else if (x <= -1) {
89 return -1;
90 } else {
91 return x;
92 }
93}
94
95real dHardTanh(real x, real g) {
96 if (x > 1 && g > 0) {
97 return 0;
98 }
99 if (x < -1 && g < 0) {
100 return 0;
101 }
102 return 1;
103}
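// Illustrative note: these two helpers are only used by the senna-style
// architecture (type 4). hardTanh clips a hidden activation to [-1, 1], and
// dHardTanh gates the gradient, returning 0 when the unit is saturated in the
// direction it would be pushed further, e.g. dHardTanh(1.2, 0.3) == 0 while
// dHardTanh(1.2, -0.3) == 1.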
104
105void InitUnigramTable() {
106 int a, i;
107 long long train_words_pow = 0;
108 real d1, power = 0.75;
109 table = (int *) malloc(table_size * sizeof(int));
110 for (a = 0; a < vocab_size; a++)
111 train_words_pow += pow(vocab[a].cn, power);
112 i = 0;
113 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
114 for (a = 0; a < table_size; a++) {
115 table[a] = i;
116 if (a / (real) table_size > d1) {
117 i++;
118 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
119 }
120 if (i >= vocab_size)
121 i = vocab_size - 1;
122 }
123
124 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
125 for (a = 0; a < vocab_size; a++)
126 noise_distribution[a] = pow(vocab[a].cn, power)
127 / (real) train_words_pow;
128}
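// Usage sketch (illustrative, mirrors the sampling code in TrainModelThread):
// a negative sample is drawn in proportion to cn^0.75 by indexing the table
// with a pseudo-random number, e.g.
//   next_random = next_random * (unsigned long long) 25214903917 + 11;
//   target = table[(next_random >> 16) % table_size];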
129
130// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
131void ReadWord(char *word, FILE *fin) {
132 int a = 0, ch;
133 while (!feof(fin)) {
134 ch = fgetc(fin);
135 if (ch == 13)
136 continue;
137 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
138 if (a > 0) {
139 if (ch == '\n')
140 ungetc(ch, fin);
141 break;
142 }
143 if (ch == '\n') {
144 strcpy(word, (char *) "</s>");
145 return;
146 } else
147 continue;
148 }
149 word[a] = ch;
150 a++;
151 if (a >= MAX_STRING - 1)
152 a--; // Truncate words that are too long
153 }
154 word[a] = 0;
155}
156
157// Returns hash value of a word
158int GetWordHash(char *word) {
159 unsigned long long a, hash = 0;
160 for (a = 0; a < strlen(word); a++)
161 hash = hash * 257 + word[a];
162 hash = hash % vocab_hash_size;
163 return hash;
164}
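// Collisions are resolved by open addressing with linear probing: SearchVocab
// and AddWordToVocab below both step through vocab_hash with
//   hash = (hash + 1) % vocab_hash_size;
// until they hit the requested word or an empty (-1) slot.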
165
166// Returns position of a word in the vocabulary; if the word is not found, returns -1
167int SearchVocab(char *word) {
168 unsigned int hash = GetWordHash(word);
169 while (1) {
170 if (vocab_hash[hash] == -1)
171 return -1;
172 if (!strcmp(word, vocab[vocab_hash[hash]].word))
173 return vocab_hash[hash];
174 hash = (hash + 1) % vocab_hash_size;
175 }
176 return -1;
177}
178
179// Reads a word and returns its index in the vocabulary
180int ReadWordIndex(FILE *fin) {
181 char word[MAX_STRING];
182 ReadWord(word, fin);
183 if (feof(fin))
184 return -1;
185 return SearchVocab(word);
186}
187
188// Adds a word to the vocabulary
189int AddWordToVocab(char *word) {
190 unsigned int hash, length = strlen(word) + 1;
191 if (length > MAX_STRING)
192 length = MAX_STRING;
193 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
194 strcpy(vocab[vocab_size].word, word);
195 vocab[vocab_size].cn = 0;
196 vocab_size++;
197 // Reallocate memory if needed
198 if (vocab_size + 2 >= vocab_max_size) {
199 vocab_max_size += 1000;
200 vocab = (struct vocab_word *) realloc(vocab,
201 vocab_max_size * sizeof(struct vocab_word));
202 }
203 hash = GetWordHash(word);
204 while (vocab_hash[hash] != -1)
205 hash = (hash + 1) % vocab_hash_size;
206 vocab_hash[hash] = vocab_size - 1;
207 return vocab_size - 1;
208}
209
210// Used later for sorting by word counts
211int VocabCompare(const void *a, const void *b) {
212 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
213}
214
215// Sorts the vocabulary by frequency using word counts
216void SortVocab() {
217 int a, size;
218 unsigned int hash;
219 // Sort the vocabulary and keep </s> at the first position
220 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
221 for (a = 0; a < vocab_hash_size; a++)
222 vocab_hash[a] = -1;
223 size = vocab_size;
224 train_words = 0;
225 for (a = 0; a < size; a++) {
226 // Words occurring less than min_count times will be discarded from the vocab
227 if ((vocab[a].cn < min_count) && (a != 0)) {
228 vocab_size--;
229 free(vocab[a].word);
230 } else {
231 // Hash will be recomputed, as it is no longer valid after sorting
232 hash = GetWordHash(vocab[a].word);
233 while (vocab_hash[hash] != -1)
234 hash = (hash + 1) % vocab_hash_size;
235 vocab_hash[hash] = a;
236 train_words += vocab[a].cn;
237 }
238 }
239 vocab = (struct vocab_word *) realloc(vocab,
240 (vocab_size + 1) * sizeof(struct vocab_word));
241 // Allocate memory for the binary tree construction
242 for (a = 0; a < vocab_size; a++) {
243 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
244 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
245 }
246}
247
248// Reduces the vocabulary by removing infrequent tokens
249void ReduceVocab() {
250 int a, b = 0;
251 unsigned int hash;
252 for (a = 0; a < vocab_size; a++)
253 if (vocab[a].cn > min_reduce) {
254 vocab[b].cn = vocab[a].cn;
255 vocab[b].word = vocab[a].word;
256 b++;
257 } else
258 free(vocab[a].word);
259 vocab_size = b;
260 for (a = 0; a < vocab_hash_size; a++)
261 vocab_hash[a] = -1;
262 for (a = 0; a < vocab_size; a++) {
263 // Hash will be recomputed, as it is no longer valid
264 hash = GetWordHash(vocab[a].word);
265 while (vocab_hash[hash] != -1)
266 hash = (hash + 1) % vocab_hash_size;
267 vocab_hash[hash] = a;
268 }
269 fflush(stdout);
270 min_reduce++;
271}
272
273// Create binary Huffman tree using the word counts
274// Frequent words will have short unique binary codes
275void CreateBinaryTree() {
276 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
277 char code[MAX_CODE_LENGTH];
278 long long *count = (long long *) calloc(vocab_size * 2 + 1,
279 sizeof(long long));
280 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
281 sizeof(long long));
282 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
283 sizeof(long long));
284 for (a = 0; a < vocab_size; a++)
285 count[a] = vocab[a].cn;
286 for (a = vocab_size; a < vocab_size * 2; a++)
287 count[a] = 1e15;
288 pos1 = vocab_size - 1;
289 pos2 = vocab_size;
290 // The following algorithm constructs the Huffman tree by adding one node at a time
291 for (a = 0; a < vocab_size - 1; a++) {
292 // First, find two smallest nodes 'min1, min2'
293 if (pos1 >= 0) {
294 if (count[pos1] < count[pos2]) {
295 min1i = pos1;
296 pos1--;
297 } else {
298 min1i = pos2;
299 pos2++;
300 }
301 } else {
302 min1i = pos2;
303 pos2++;
304 }
305 if (pos1 >= 0) {
306 if (count[pos1] < count[pos2]) {
307 min2i = pos1;
308 pos1--;
309 } else {
310 min2i = pos2;
311 pos2++;
312 }
313 } else {
314 min2i = pos2;
315 pos2++;
316 }
317 count[vocab_size + a] = count[min1i] + count[min2i];
318 parent_node[min1i] = vocab_size + a;
319 parent_node[min2i] = vocab_size + a;
320 binary[min2i] = 1;
321 }
322 // Now assign binary code to each vocabulary word
323 for (a = 0; a < vocab_size; a++) {
324 b = a;
325 i = 0;
326 while (1) {
327 code[i] = binary[b];
328 point[i] = b;
329 i++;
330 b = parent_node[b];
331 if (b == vocab_size * 2 - 2)
332 break;
333 }
334 vocab[a].codelen = i;
335 vocab[a].point[0] = vocab_size - 2;
336 for (b = 0; b < i; b++) {
337 vocab[a].code[i - b - 1] = code[b];
338 vocab[a].point[i - b] = point[b] - vocab_size;
339 }
340 }
341 free(count);
342 free(binary);
343 free(parent_node);
344}
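// Worked example (illustrative): for counts {"the": 5, "of": 2, "cat": 1} the
// two smallest nodes "of" and "cat" are merged first (combined count 3), which
// is then merged with "the" (count 8); "the" ends up with a 1-bit code and
// "of"/"cat" with 2-bit codes, so more frequent words get shorter codes.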
345
346void LearnVocabFromTrainFile() {
347 char word[MAX_STRING];
348 FILE *fin;
349 long long a, i;
350 for (a = 0; a < vocab_hash_size; a++)
351 vocab_hash[a] = -1;
352 fin = fopen(train_file, "rb");
353 if (fin == NULL) {
354 printf("ERROR: training data file not found!\n");
355 exit(1);
356 }
357 vocab_size = 0;
358 AddWordToVocab((char *) "</s>");
359 while (1) {
360 ReadWord(word, fin);
361 if (feof(fin))
362 break;
363 train_words++;
364 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
365 printf("%lldK%c", train_words / 1000, 13);
366 fflush(stdout);
367 }
368 i = SearchVocab(word);
369 if (i == -1) {
370 a = AddWordToVocab(word);
371 vocab[a].cn = 1;
372 } else
373 vocab[i].cn++;
374 if (vocab_size > vocab_hash_size * 0.7)
375 ReduceVocab();
376 }
377 SortVocab();
378 if (debug_mode > 0) {
379 printf("Vocab size: %lld\n", vocab_size);
380 printf("Words in train file: %lld\n", train_words);
381 }
382 file_size = ftell(fin);
383 fclose(fin);
384}
385
386void SaveVocab() {
387 long long i;
388 FILE *fo = fopen(save_vocab_file, "wb");
389 for (i = 0; i < vocab_size; i++)
390 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
391 fclose(fo);
392}
393
394void ReadVocab() {
395 long long a, i = 0;
396 char c;
397 char word[MAX_STRING];
398 FILE *fin = fopen(read_vocab_file, "rb");
399 if (fin == NULL) {
400 printf("Vocabulary file not found\n");
401 exit(1);
402 }
403 for (a = 0; a < vocab_hash_size; a++)
404 vocab_hash[a] = -1;
405 vocab_size = 0;
406 while (1) {
407 ReadWord(word, fin);
408 if (feof(fin))
409 break;
410 a = AddWordToVocab(word);
411 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
412 i++;
413 }
414 SortVocab();
415 if (debug_mode > 0) {
416 printf("Vocab size: %lld\n", vocab_size);
417 printf("Words in train file: %lld\n", train_words);
418 }
419 fin = fopen(train_file, "rb");
420 if (fin == NULL) {
421 printf("ERROR: training data file not found!\n");
422 exit(1);
423 }
424 fseek(fin, 0, SEEK_END);
425 file_size = ftell(fin);
426 fclose(fin);
427}
428
429void InitClassUnigramTable() {
430 long long a, c;
431 printf("loading class unigrams \n");
432 FILE *fin = fopen(negative_classes_file, "rb");
433 if (fin == NULL) {
434 printf("ERROR: class file not found!\n");
435 exit(1);
436 }
437 word_to_group = (int *) malloc(vocab_size * sizeof(int));
438 for (a = 0; a < vocab_size; a++)
439 word_to_group[a] = -1;
440 char class[MAX_STRING];
441 char prev_class[MAX_STRING];
442 prev_class[0] = 0;
443 char word[MAX_STRING];
444 class_number = -1;
445 while (1) {
446 if (feof(fin))
447 break;
448 ReadWord(class, fin);
449 ReadWord(word, fin);
450 int word_index = SearchVocab(word);
451 if (word_index != -1) {
452 if (strcmp(class, prev_class) != 0) {
453 class_number++;
454 strcpy(prev_class, class);
455 }
456 word_to_group[word_index] = class_number;
457 }
458 ReadWord(word, fin);
459 }
460 class_number++;
461 fclose(fin);
462
463 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
464 long long train_words_pow = 0;
465 real d1, power = 0.75;
466
467 for (c = 0; c < class_number; c++) {
468 long long offset = c * table_size;
469 train_words_pow = 0;
470 for (a = 0; a < vocab_size; a++)
471 if (word_to_group[a] == c)
472 train_words_pow += pow(vocab[a].cn, power);
473 int i = 0;
474 while (i < vocab_size && word_to_group[i] != c)
475 i++;
476 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
477 for (a = 0; a < table_size; a++) {
478 //printf("index %lld , word %d\n", a, i);
479 group_to_table[offset + a] = i;
480 if (a / (real) table_size > d1) {
481 i++;
482 while (i < vocab_size && word_to_group[i] != c)
483 i++;
484 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
485 }
486 if (i >= vocab_size)
487 while (word_to_group[i] != c && i >= 0)
488 i--;
489 }
490 }
491}
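// Usage sketch (illustrative): when -negative-classes is given, the training
// threads draw negatives from the unigram table of the current word's own
// class, exactly as in
//   target = group_to_table[word_to_group[word] * table_size
//           + (next_random >> 16) % table_size];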
492
493void SaveNet() {
494 if(type != 3 || negative <= 0) {
495 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
496 return;
497 }
498
499 FILE *fnet = fopen(save_net_file, "wb");
500 if (fnet == NULL) {
501 printf("Net parameter file not found\n");
502 exit(1);
503 }
504 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
505 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
506 fclose(fnet);
507}
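// File layout written above and expected back by -read-net in InitNet:
// vocab_size * layer1_size reals for syn0, immediately followed by
// vocab_size * window_layer_size reals for syn1neg_window, with no header.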
508
509void InitNet() {
510 long long a, b;
511 unsigned long long next_random = 1;
512 long long read;
513
514 window_layer_size = layer1_size * window * 2;
515 a = posix_memalign((void **) &syn0, 128,
516 (long long) vocab_size * layer1_size * sizeof(real));
517 if (syn0 == NULL) {
518 printf("Memory allocation failed\n");
519 exit(1);
520 }
521
522 if (hs) {
523 a = posix_memalign((void **) &syn1, 128,
524 (long long) vocab_size * layer1_size * sizeof(real));
525 if (syn1 == NULL) {
526 printf("Memory allocation failed\n");
527 exit(1);
528 }
529 a = posix_memalign((void **) &syn1_window, 128,
530 (long long) vocab_size * window_layer_size * sizeof(real));
531 if (syn1_window == NULL) {
532 printf("Memory allocation failed\n");
533 exit(1);
534 }
535 a = posix_memalign((void **) &syn_hidden_word, 128,
536 (long long) vocab_size * window_hidden_size * sizeof(real));
537 if (syn_hidden_word == NULL) {
538 printf("Memory allocation failed\n");
539 exit(1);
540 }
541
542 for (a = 0; a < vocab_size; a++)
543 for (b = 0; b < layer1_size; b++)
544 syn1[a * layer1_size + b] = 0;
545 for (a = 0; a < vocab_size; a++)
546 for (b = 0; b < window_layer_size; b++)
547 syn1_window[a * window_layer_size + b] = 0;
548 for (a = 0; a < vocab_size; a++)
549 for (b = 0; b < window_hidden_size; b++)
550 syn_hidden_word[a * window_hidden_size + b] = 0;
551 }
552 if (negative > 0) {
553 if(type == 0) {
554 a = posix_memalign((void **) &syn1neg, 128,
555 (long long) vocab_size * layer1_size * sizeof(real));
556 if (syn1neg == NULL) {
557 printf("Memory allocation failed\n");
558 exit(1);
559 }
560 for (a = 0; a < vocab_size; a++)
561 for (b = 0; b < layer1_size; b++)
562 syn1neg[a * layer1_size + b] = 0;
563 } else if (type == 3) {
564 a = posix_memalign((void **) &syn1neg_window, 128,
565 (long long) vocab_size * window_layer_size * sizeof(real));
566 if (syn1neg_window == NULL) {
567 printf("Memory allocation failed\n");
568 exit(1);
569 }
570 for (a = 0; a < vocab_size; a++)
571 for (b = 0; b < window_layer_size; b++)
572 syn1neg_window[a * window_layer_size + b] = 0;
573 } else if (type == 4) {
574 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
575 (long long) vocab_size * window_hidden_size * sizeof(real));
576 if (syn_hidden_word_neg == NULL) {
577 printf("Memory allocation failed\n");
578 exit(1);
579 }
580 for (a = 0; a < vocab_size; a++)
581 for (b = 0; b < window_hidden_size; b++)
582 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
583 }
584 }
585 if (nce > 0) {
586 a = posix_memalign((void **) &syn1nce, 128,
587 (long long) vocab_size * layer1_size * sizeof(real));
588 if (syn1nce == NULL) {
589 printf("Memory allocation failed\n");
590 exit(1);
591 }
592 a = posix_memalign((void **) &syn1nce_window, 128,
593 (long long) vocab_size * window_layer_size * sizeof(real));
594 if (syn1nce_window == NULL) {
595 printf("Memory allocation failed\n");
596 exit(1);
597 }
598 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
599 (long long) vocab_size * window_hidden_size * sizeof(real));
600 if (syn_hidden_word_nce == NULL) {
601 printf("Memory allocation failed\n");
602 exit(1);
603 }
604
605 for (a = 0; a < vocab_size; a++)
606 for (b = 0; b < layer1_size; b++)
607 syn1nce[a * layer1_size + b] = 0;
608 for (a = 0; a < vocab_size; a++)
609 for (b = 0; b < window_layer_size; b++)
610 syn1nce_window[a * window_layer_size + b] = 0;
611 for (a = 0; a < vocab_size; a++)
612 for (b = 0; b < window_hidden_size; b++)
613 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
614 }
615
616 if(type == 4) {
617 a = posix_memalign((void **) &syn_window_hidden, 128,
618 window_hidden_size * window_layer_size * sizeof(real));
619 if (syn_window_hidden == NULL) {
620 printf("Memory allocation failed\n");
621 exit(1);
622 }
623 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
624 next_random = next_random * (unsigned long long) 25214903917 + 11;
625 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
626 - 0.5) / (window_hidden_size * window_layer_size);
627 }
628 }
629
630 if (read_net_file[0] == 0) {
631 for (a = 0; a < vocab_size; a++)
632 for (b = 0; b < layer1_size; b++) {
633 next_random = next_random * (unsigned long long) 25214903917
634 + 11;
635 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
636 / (real) 65536) - 0.5) / layer1_size;
637 }
638 } else if(type == 3 && negative > 0) {
639 FILE *fnet = fopen(read_net_file, "rb");
640 if (fnet == NULL) {
641 printf("Net parameter file not found\n");
642 exit(1);
643 }
644 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
645 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
646 if(read != vocab_size * layer1_size) {
647 fprintf(stderr, "read-net failed %lld\n", read);
648 exit(-1);
649 }
650 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
651 if(read != (long long) vocab_size * window_layer_size) {
652 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
653 (long long) vocab_size * window_layer_size);
654 exit(-1);
655 }
656 fgetc(fnet);
657 if(!feof(fnet)) {
658 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
659 exit(-1);
660 }
661 fclose(fnet);
662 } else {
663 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
664 exit(-1);
665 }
666
667 CreateBinaryTree();
668}
669
670void *TrainModelThread(void *id) {
671 long long a, b, d, cw, word, last_word, sentence_length = 0,
672 sentence_position = 0;
673 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
674 long long l1, l2, c, target, label, local_iter = iter;
675 unsigned long long next_random = (long long) id;
676 real f, g;
677 clock_t now;
678 int input_len_1 = layer1_size;
679 int window_offset = -1;
680 if (type == 2 || type == 4) {
681 input_len_1 = window_layer_size;
682 }
683 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
684 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
685
686 int input_len_2 = 0;
687 if (type == 4) {
688 input_len_2 = window_hidden_size;
689 }
690 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
691 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
692
693 FILE *fi = fopen(train_file, "rb");
694 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
695 while (1) {
696 if (word_count - last_word_count > 10000) {
697 word_count_actual += word_count - last_word_count;
698 last_word_count = word_count;
699 if ((debug_mode > 1)) {
700 now = clock();
701 printf(
702 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
703 13, alpha,
704 word_count_actual / (real) (iter * train_words + 1)
705 * 100,
706 word_count_actual
707 / ((real) (now - start + 1)
708 / (real) CLOCKS_PER_SEC * 1000));
709 fflush(stdout);
710 }
711 alpha = starting_alpha
712 * (1 - word_count_actual / (real) (iter * train_words + 1));
713 if (alpha < starting_alpha * 0.0001)
714 alpha = starting_alpha * 0.0001;
715 }
716 if (sentence_length == 0) {
717 while (1) {
718 word = ReadWordIndex(fi);
719 if (feof(fi))
720 break;
721 if (word == -1)
722 continue;
723 word_count++;
724 if (word == 0)
725 break;
726 // The subsampling randomly discards frequent words while keeping the ranking the same
727 if (sample > 0) {
728 real ran = (sqrt(vocab[word].cn / (sample * train_words))
729 + 1) * (sample * train_words) / vocab[word].cn;
730 next_random = next_random * (unsigned long long) 25214903917
731 + 11;
732 if (ran < (next_random & 0xFFFF) / (real) 65536)
733 continue;
734 }
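// Illustrative numbers (not from the original comments): with sample = 1e-3
// and train_words = 1e8, a word with cn = 1e6 (1% of all tokens) gets
// ran = (sqrt(10) + 1) * 0.1, roughly 0.42, i.e. it is kept with about 42%
// probability, while words rarer than about 2.6 * sample * train_words
// occurrences always have ran >= 1 and are kept.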
735 sen[sentence_length] = word;
736 sentence_length++;
737 if (sentence_length >= MAX_SENTENCE_LENGTH)
738 break;
739 }
740 sentence_position = 0;
741 }
742 if (feof(fi) || (word_count > train_words / num_threads)) {
743 word_count_actual += word_count - last_word_count;
744 local_iter--;
745 if (local_iter == 0)
746 break;
747 word_count = 0;
748 last_word_count = 0;
749 sentence_length = 0;
750 fseek(fi, file_size / (long long) num_threads * (long long) id,
751 SEEK_SET);
752 continue;
753 }
754 word = sen[sentence_position];
755 if (word == -1)
756 continue;
757 for (c = 0; c < input_len_1; c++)
758 neu1[c] = 0;
759 for (c = 0; c < input_len_1; c++)
760 neu1e[c] = 0;
761 for (c = 0; c < input_len_2; c++)
762 neu2[c] = 0;
763 for (c = 0; c < input_len_2; c++)
764 neu2e[c] = 0;
765 next_random = next_random * (unsigned long long) 25214903917 + 11;
766 b = next_random % window;
767 if (type == 0) { //train the cbow architecture
768 // in -> hidden
769 cw = 0;
770 for (a = b; a < window * 2 + 1 - b; a++)
771 if (a != window) {
772 c = sentence_position - window + a;
773 if (c < 0)
774 continue;
775 if (c >= sentence_length)
776 continue;
777 last_word = sen[c];
778 if (last_word == -1)
779 continue;
780 for (c = 0; c < layer1_size; c++)
781 neu1[c] += syn0[c + last_word * layer1_size];
782 cw++;
783 }
784 if (cw) {
785 for (c = 0; c < layer1_size; c++)
786 neu1[c] /= cw;
787 if (hs)
788 for (d = 0; d < vocab[word].codelen; d++) {
789 f = 0;
790 l2 = vocab[word].point[d] * layer1_size;
791 // Propagate hidden -> output
792 for (c = 0; c < layer1_size; c++)
793 f += neu1[c] * syn1[c + l2];
794 if (f <= -MAX_EXP)
795 continue;
796 else if (f >= MAX_EXP)
797 continue;
798 else
799 f = expTable[(int) ((f + MAX_EXP)
800 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
801 // 'g' is the gradient multiplied by the learning rate
802 g = (1 - vocab[word].code[d] - f) * alpha;
803 // Propagate errors output -> hidden
804 for (c = 0; c < layer1_size; c++)
805 neu1e[c] += g * syn1[c + l2];
806 // Learn weights hidden -> output
807 for (c = 0; c < layer1_size; c++)
808 syn1[c + l2] += g * neu1[c];
809 if (cap == 1)
810 for (c = 0; c < layer1_size; c++)
811 capParam(syn1, c + l2);
812 }
813 // NEGATIVE SAMPLING
814 if (negative > 0)
815 for (d = 0; d < negative + 1; d++) {
816 if (d == 0) {
817 target = word;
818 label = 1;
819 } else {
820 next_random = next_random
821 * (unsigned long long) 25214903917 + 11;
822 if (word_to_group != NULL
823 && word_to_group[word] != -1) {
824 target = word;
825 while (target == word) {
826 target = group_to_table[word_to_group[word]
827 * table_size
828 + (next_random >> 16) % table_size];
829 next_random = next_random
830 * (unsigned long long) 25214903917
831 + 11;
832 }
833 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
834 } else {
835 target =
836 table[(next_random >> 16) % table_size];
837 }
838 if (target == 0)
839 target = next_random % (vocab_size - 1) + 1;
840 if (target == word)
841 continue;
842 label = 0;
843 }
844 l2 = target * layer1_size;
845 f = 0;
846 for (c = 0; c < layer1_size; c++)
847 f += neu1[c] * syn1neg[c + l2];
848 if (f > MAX_EXP)
849 g = (label - 1) * alpha;
850 else if (f < -MAX_EXP)
851 g = (label - 0) * alpha;
852 else
853 g = (label
854 - expTable[(int) ((f + MAX_EXP)
855 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
856 * alpha;
857 for (c = 0; c < layer1_size; c++)
858 neu1e[c] += g * syn1neg[c + l2];
859 for (c = 0; c < layer1_size; c++)
860 syn1neg[c + l2] += g * neu1[c];
861 if (cap == 1)
862 for (c = 0; c < layer1_size; c++)
863 capParam(syn1neg, c + l2);
864 }
865 // Noise Contrastive Estimation
866 if (nce > 0)
867 for (d = 0; d < nce + 1; d++) {
868 if (d == 0) {
869 target = word;
870 label = 1;
871 } else {
872 next_random = next_random
873 * (unsigned long long) 25214903917 + 11;
874 if (word_to_group != NULL
875 && word_to_group[word] != -1) {
876 target = word;
877 while (target == word) {
878 target = group_to_table[word_to_group[word]
879 * table_size
880 + (next_random >> 16) % table_size];
881 next_random = next_random
882 * (unsigned long long) 25214903917
883 + 11;
884 }
885 } else {
886 target =
887 table[(next_random >> 16) % table_size];
888 }
889 if (target == 0)
890 target = next_random % (vocab_size - 1) + 1;
891 if (target == word)
892 continue;
893 label = 0;
894 }
895 l2 = target * layer1_size;
896 f = 0;
897
898 for (c = 0; c < layer1_size; c++)
899 f += neu1[c] * syn1nce[c + l2];
900 if (f > MAX_EXP)
901 g = (label - 1) * alpha;
902 else if (f < -MAX_EXP)
903 g = (label - 0) * alpha;
904 else {
905 f = exp(f);
906 g =
907 (label
908 - f
909 / (noise_distribution[target]
910 * nce + f)) * alpha;
911 }
912 for (c = 0; c < layer1_size; c++)
913 neu1e[c] += g * syn1nce[c + l2];
914 for (c = 0; c < layer1_size; c++)
915 syn1nce[c + l2] += g * neu1[c];
916 if (cap == 1)
917 for (c = 0; c < layer1_size; c++)
918 capParam(syn1nce, c + l2);
919 }
920 // hidden -> in
921 for (a = b; a < window * 2 + 1 - b; a++)
922 if (a != window) {
923 c = sentence_position - window + a;
924 if (c < 0)
925 continue;
926 if (c >= sentence_length)
927 continue;
928 last_word = sen[c];
929 if (last_word == -1)
930 continue;
931 for (c = 0; c < layer1_size; c++)
932 syn0[c + last_word * layer1_size] += neu1e[c];
933 }
934 }
935 } else if (type == 1) { //train skip-gram
936 for (a = b; a < window * 2 + 1 - b; a++)
937 if (a != window) {
938 c = sentence_position - window + a;
939 if (c < 0)
940 continue;
941 if (c >= sentence_length)
942 continue;
943 last_word = sen[c];
944 if (last_word == -1)
945 continue;
946 l1 = last_word * layer1_size;
947 for (c = 0; c < layer1_size; c++)
948 neu1e[c] = 0;
949 // HIERARCHICAL SOFTMAX
950 if (hs)
951 for (d = 0; d < vocab[word].codelen; d++) {
952 f = 0;
953 l2 = vocab[word].point[d] * layer1_size;
954 // Propagate hidden -> output
955 for (c = 0; c < layer1_size; c++)
956 f += syn0[c + l1] * syn1[c + l2];
957 if (f <= -MAX_EXP)
958 continue;
959 else if (f >= MAX_EXP)
960 continue;
961 else
962 f = expTable[(int) ((f + MAX_EXP)
963 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
964 // 'g' is the gradient multiplied by the learning rate
965 g = (1 - vocab[word].code[d] - f) * alpha;
966 // Propagate errors output -> hidden
967 for (c = 0; c < layer1_size; c++)
968 neu1e[c] += g * syn1[c + l2];
969 // Learn weights hidden -> output
970 for (c = 0; c < layer1_size; c++)
971 syn1[c + l2] += g * syn0[c + l1];
972 if (cap == 1)
973 for (c = 0; c < layer1_size; c++)
974 capParam(syn1, c + l2);
975 }
976 // NEGATIVE SAMPLING
977 if (negative > 0)
978 for (d = 0; d < negative + 1; d++) {
979 if (d == 0) {
980 target = word;
981 label = 1;
982 } else {
983 next_random = next_random
984 * (unsigned long long) 25214903917 + 11;
985 if (word_to_group != NULL
986 && word_to_group[word] != -1) {
987 target = word;
988 while (target == word) {
989 target =
990 group_to_table[word_to_group[word]
991 * table_size
992 + (next_random >> 16)
993 % table_size];
994 next_random =
995 next_random
996 * (unsigned long long) 25214903917
997 + 11;
998 }
999 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1000 } else {
1001 target = table[(next_random >> 16)
1002 % table_size];
1003 }
1004 if (target == 0)
1005 target = next_random % (vocab_size - 1) + 1;
1006 if (target == word)
1007 continue;
1008 label = 0;
1009 }
1010 l2 = target * layer1_size;
1011 f = 0;
1012 for (c = 0; c < layer1_size; c++)
1013 f += syn0[c + l1] * syn1neg[c + l2];
1014 if (f > MAX_EXP)
1015 g = (label - 1) * alpha;
1016 else if (f < -MAX_EXP)
1017 g = (label - 0) * alpha;
1018 else
1019 g =
1020 (label
1021 - expTable[(int) ((f + MAX_EXP)
1022 * (EXP_TABLE_SIZE
1023 / MAX_EXP / 2))])
1024 * alpha;
1025 for (c = 0; c < layer1_size; c++)
1026 neu1e[c] += g * syn1neg[c + l2];
1027 for (c = 0; c < layer1_size; c++)
1028 syn1neg[c + l2] += g * syn0[c + l1];
1029 if (cap == 1)
1030 for (c = 0; c < layer1_size; c++)
1031 capParam(syn1neg, c + l2);
1032 }
1033 //Noise Contrastive Estimation
1034 if (nce > 0)
1035 for (d = 0; d < nce + 1; d++) {
1036 if (d == 0) {
1037 target = word;
1038 label = 1;
1039 } else {
1040 next_random = next_random
1041 * (unsigned long long) 25214903917 + 11;
1042 if (word_to_group != NULL
1043 && word_to_group[word] != -1) {
1044 target = word;
1045 while (target == word) {
1046 target =
1047 group_to_table[word_to_group[word]
1048 * table_size
1049 + (next_random >> 16)
1050 % table_size];
1051 next_random =
1052 next_random
1053 * (unsigned long long) 25214903917
1054 + 11;
1055 }
1056 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1057 } else {
1058 target = table[(next_random >> 16)
1059 % table_size];
1060 }
1061 if (target == 0)
1062 target = next_random % (vocab_size - 1) + 1;
1063 if (target == word)
1064 continue;
1065 label = 0;
1066 }
1067 l2 = target * layer1_size;
1068 f = 0;
1069 for (c = 0; c < layer1_size; c++)
1070 f += syn0[c + l1] * syn1nce[c + l2];
1071 if (f > MAX_EXP)
1072 g = (label - 1) * alpha;
1073 else if (f < -MAX_EXP)
1074 g = (label - 0) * alpha;
1075 else {
1076 f = exp(f);
1077 g = (label
1078 - f
1079 / (noise_distribution[target]
1080 * nce + f)) * alpha;
1081 }
1082 for (c = 0; c < layer1_size; c++)
1083 neu1e[c] += g * syn1nce[c + l2];
1084 for (c = 0; c < layer1_size; c++)
1085 syn1nce[c + l2] += g * syn0[c + l1];
1086 if (cap == 1)
1087 for (c = 0; c < layer1_size; c++)
1088 capParam(syn1nce, c + l2);
1089 }
1090 // Learn weights input -> hidden
1091 for (c = 0; c < layer1_size; c++)
1092 syn0[c + l1] += neu1e[c];
1093 }
1094 } else if (type == 2) { //train the cwindow architecture
1095 // in -> hidden
1096 cw = 0;
1097 for (a = 0; a < window * 2 + 1; a++)
1098 if (a != window) {
1099 c = sentence_position - window + a;
1100 if (c < 0)
1101 continue;
1102 if (c >= sentence_length)
1103 continue;
1104 last_word = sen[c];
1105 if (last_word == -1)
1106 continue;
1107 window_offset = a * layer1_size;
1108 if (a > window)
1109 window_offset -= layer1_size;
1110 for (c = 0; c < layer1_size; c++)
1111 neu1[c + window_offset] += syn0[c
1112 + last_word * layer1_size];
1113 cw++;
1114 }
1115 if (cw) {
1116 if (hs)
1117 for (d = 0; d < vocab[word].codelen; d++) {
1118 f = 0;
1119 l2 = vocab[word].point[d] * window_layer_size;
1120 // Propagate hidden -> output
1121 for (c = 0; c < window_layer_size; c++)
1122 f += neu1[c] * syn1_window[c + l2];
1123 if (f <= -MAX_EXP)
1124 continue;
1125 else if (f >= MAX_EXP)
1126 continue;
1127 else
1128 f = expTable[(int) ((f + MAX_EXP)
1129 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1130 // 'g' is the gradient multiplied by the learning rate
1131 g = (1 - vocab[word].code[d] - f) * alpha;
1132 // Propagate errors output -> hidden
1133 for (c = 0; c < window_layer_size; c++)
1134 neu1e[c] += g * syn1_window[c + l2];
1135 // Learn weights hidden -> output
1136 for (c = 0; c < window_layer_size; c++)
1137 syn1_window[c + l2] += g * neu1[c];
1138 if (cap == 1)
1139 for (c = 0; c < window_layer_size; c++)
1140 capParam(syn1_window, c + l2);
1141 }
1142 // NEGATIVE SAMPLING
1143 if (negative > 0)
1144 for (d = 0; d < negative + 1; d++) {
1145 if (d == 0) {
1146 target = word;
1147 label = 1;
1148 } else {
1149 next_random = next_random
1150 * (unsigned long long) 25214903917 + 11;
1151 if (word_to_group != NULL
1152 && word_to_group[word] != -1) {
1153 target = word;
1154 while (target == word) {
1155 target = group_to_table[word_to_group[word]
1156 * table_size
1157 + (next_random >> 16) % table_size];
1158 next_random = next_random
1159 * (unsigned long long) 25214903917
1160 + 11;
1161 }
1162 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1163 } else {
1164 target =
1165 table[(next_random >> 16) % table_size];
1166 }
1167 if (target == 0)
1168 target = next_random % (vocab_size - 1) + 1;
1169 if (target == word)
1170 continue;
1171 label = 0;
1172 }
1173 l2 = target * window_layer_size;
1174 f = 0;
1175 for (c = 0; c < window_layer_size; c++)
1176 f += neu1[c] * syn1neg_window[c + l2];
1177 if (f > MAX_EXP)
1178 g = (label - 1) * alpha;
1179 else if (f < -MAX_EXP)
1180 g = (label - 0) * alpha;
1181 else
1182 g = (label
1183 - expTable[(int) ((f + MAX_EXP)
1184 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1185 * alpha;
1186 for (c = 0; c < window_layer_size; c++)
1187 neu1e[c] += g * syn1neg_window[c + l2];
1188 for (c = 0; c < window_layer_size; c++)
1189 syn1neg_window[c + l2] += g * neu1[c];
1190 if (cap == 1)
1191 for (c = 0; c < window_layer_size; c++)
1192 capParam(syn1neg_window, c + l2);
1193 }
1194 // Noise Contrastive Estimation
1195 if (nce > 0)
1196 for (d = 0; d < nce + 1; d++) {
1197 if (d == 0) {
1198 target = word;
1199 label = 1;
1200 } else {
1201 next_random = next_random
1202 * (unsigned long long) 25214903917 + 11;
1203 if (word_to_group != NULL
1204 && word_to_group[word] != -1) {
1205 target = word;
1206 while (target == word) {
1207 target = group_to_table[word_to_group[word]
1208 * table_size
1209 + (next_random >> 16) % table_size];
1210 next_random = next_random
1211 * (unsigned long long) 25214903917
1212 + 11;
1213 }
1214 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1215 } else {
1216 target =
1217 table[(next_random >> 16) % table_size];
1218 }
1219 if (target == 0)
1220 target = next_random % (vocab_size - 1) + 1;
1221 if (target == word)
1222 continue;
1223 label = 0;
1224 }
1225 l2 = target * window_layer_size;
1226 f = 0;
1227 for (c = 0; c < window_layer_size; c++)
1228 f += neu1[c] * syn1nce_window[c + l2];
1229 if (f > MAX_EXP)
1230 g = (label - 1) * alpha;
1231 else if (f < -MAX_EXP)
1232 g = (label - 0) * alpha;
1233 else {
1234 f = exp(f);
1235 g =
1236 (label
1237 - f
1238 / (noise_distribution[target]
1239 * nce + f)) * alpha;
1240 }
1241 for (c = 0; c < window_layer_size; c++)
1242 neu1e[c] += g * syn1nce_window[c + l2];
1243 for (c = 0; c < window_layer_size; c++)
1244 syn1nce_window[c + l2] += g * neu1[c];
1245 if (cap == 1)
1246 for (c = 0; c < window_layer_size; c++)
1247 capParam(syn1nce_window, c + l2);
1248 }
1249 // hidden -> in
1250 for (a = 0; a < window * 2 + 1; a++)
1251 if (a != window) {
1252 c = sentence_position - window + a;
1253 if (c < 0)
1254 continue;
1255 if (c >= sentence_length)
1256 continue;
1257 last_word = sen[c];
1258 if (last_word == -1)
1259 continue;
1260 window_offset = a * layer1_size;
1261 if (a > window)
1262 window_offset -= layer1_size;
1263 for (c = 0; c < layer1_size; c++)
1264 syn0[c + last_word * layer1_size] += neu1e[c
1265 + window_offset];
1266 }
1267 }
1268 } else if (type == 3) { //train structured skip-gram
1269 for (a = 0; a < window * 2 + 1; a++)
1270 if (a != window) {
1271 c = sentence_position - window + a;
1272 if (c < 0)
1273 continue;
1274 if (c >= sentence_length)
1275 continue;
1276 last_word = sen[c];
1277 if (last_word == -1)
1278 continue;
1279 l1 = last_word * layer1_size;
1280 window_offset = a * layer1_size;
1281 if (a > window)
1282 window_offset -= layer1_size;
1283 for (c = 0; c < layer1_size; c++)
1284 neu1e[c] = 0;
1285 // HIERARCHICAL SOFTMAX
1286 if (hs)
1287 for (d = 0; d < vocab[word].codelen; d++) {
1288 f = 0;
1289 l2 = vocab[word].point[d] * window_layer_size;
1290 // Propagate hidden -> output
1291 for (c = 0; c < layer1_size; c++)
1292 f += syn0[c + l1]
1293 * syn1_window[c + l2 + window_offset];
1294 if (f <= -MAX_EXP)
1295 continue;
1296 else if (f >= MAX_EXP)
1297 continue;
1298 else
1299 f = expTable[(int) ((f + MAX_EXP)
1300 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1301 // 'g' is the gradient multiplied by the learning rate
1302 g = (1 - vocab[word].code[d] - f) * alpha;
1303 // Propagate errors output -> hidden
1304 for (c = 0; c < layer1_size; c++)
1305 neu1e[c] += g
1306 * syn1_window[c + l2 + window_offset];
1307 // Learn weights hidden -> output
1308 for (c = 0; c < layer1_size; c++)
1309 syn1_window[c + l2 + window_offset] += g
1310 * syn0[c + l1];
1311 if (cap == 1)
1312 for (c = 0; c < layer1_size; c++)
1313 capParam(syn1_window, c + l2 + window_offset);
1314 }
1315 // NEGATIVE SAMPLING
1316 if (negative > 0)
1317 for (d = 0; d < negative + 1; d++) {
1318 if (d == 0) {
1319 target = word;
1320 label = 1;
1321 } else {
1322 next_random = next_random
1323 * (unsigned long long) 25214903917 + 11;
1324 if (word_to_group != NULL
1325 && word_to_group[word] != -1) {
1326 target = word;
1327 while (target == word) {
1328 target =
1329 group_to_table[word_to_group[word]
1330 * table_size
1331 + (next_random >> 16)
1332 % table_size];
1333 next_random =
1334 next_random
1335 * (unsigned long long) 25214903917
1336 + 11;
1337 }
1338 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1339 } else {
1340 target = table[(next_random >> 16)
1341 % table_size];
1342 }
1343 if (target == 0)
1344 target = next_random % (vocab_size - 1) + 1;
1345 if (target == word)
1346 continue;
1347 label = 0;
1348 }
1349 l2 = target * window_layer_size;
1350 f = 0;
1351 for (c = 0; c < layer1_size; c++)
1352 f +=
1353 syn0[c + l1]
1354 * syn1neg_window[c + l2
1355 + window_offset];
1356 if (f > MAX_EXP)
1357 g = (label - 1) * alpha;
1358 else if (f < -MAX_EXP)
1359 g = (label - 0) * alpha;
1360 else
1361 g =
1362 (label
1363 - expTable[(int) ((f + MAX_EXP)
1364 * (EXP_TABLE_SIZE
1365 / MAX_EXP / 2))])
1366 * alpha;
1367 if(debug_mode > 2 && ((long long) id) == 0) {
1368 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1369 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1370 }
1371 for (c = 0; c < layer1_size; c++)
1372 neu1e[c] +=
1373 g
1374 * syn1neg_window[c + l2
1375 + window_offset];
1376 for (c = 0; c < layer1_size; c++)
1377 syn1neg_window[c + l2 + window_offset] += g
1378 * syn0[c + l1];
1379 if (cap == 1)
1380 for (c = 0; c < layer1_size; c++)
1381 capParam(syn1neg_window,
1382 c + l2 + window_offset);
1383 }
1384 // Noise Contrastive Estimation
1385 if (nce > 0)
1386 for (d = 0; d < nce + 1; d++) {
1387 if (d == 0) {
1388 target = word;
1389 label = 1;
1390 } else {
1391 next_random = next_random
1392 * (unsigned long long) 25214903917 + 11;
1393 if (word_to_group != NULL
1394 && word_to_group[word] != -1) {
1395 target = word;
1396 while (target == word) {
1397 target =
1398 group_to_table[word_to_group[word]
1399 * table_size
1400 + (next_random >> 16)
1401 % table_size];
1402 next_random =
1403 next_random
1404 * (unsigned long long) 25214903917
1405 + 11;
1406 }
1407 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1408 } else {
1409 target = table[(next_random >> 16)
1410 % table_size];
1411 }
1412 if (target == 0)
1413 target = next_random % (vocab_size - 1) + 1;
1414 if (target == word)
1415 continue;
1416 label = 0;
1417 }
1418 l2 = target * window_layer_size;
1419 f = 0;
1420 for (c = 0; c < layer1_size; c++)
1421 f +=
1422 syn0[c + l1]
1423 * syn1nce_window[c + l2
1424 + window_offset];
1425 if (f > MAX_EXP)
1426 g = (label - 1) * alpha;
1427 else if (f < -MAX_EXP)
1428 g = (label - 0) * alpha;
1429 else {
1430 f = exp(f);
1431 g = (label
1432 - f
1433 / (noise_distribution[target]
1434 * nce + f)) * alpha;
1435 }
1436 for (c = 0; c < layer1_size; c++)
1437 neu1e[c] +=
1438 g
1439 * syn1nce_window[c + l2
1440 + window_offset];
1441 for (c = 0; c < layer1_size; c++)
1442 syn1nce_window[c + l2 + window_offset] += g
1443 * syn0[c + l1];
1444 if (cap == 1)
1445 for (c = 0; c < layer1_size; c++)
1446 capParam(syn1nce_window,
1447 c + l2 + window_offset);
1448 }
1449 // Learn weights input -> hidden
1450 for (c = 0; c < layer1_size; c++) {
1451 syn0[c + l1] += neu1e[c];
1452 if (syn0[c + l1] > 50)
1453 syn0[c + l1] = 50;
1454 if (syn0[c + l1] < -50)
1455 syn0[c + l1] = -50;
1456 }
1457 }
1458 } else if (type == 4) { //training senna
1459 // in -> hidden
1460 cw = 0;
1461 for (a = 0; a < window * 2 + 1; a++)
1462 if (a != window) {
1463 c = sentence_position - window + a;
1464 if (c < 0)
1465 continue;
1466 if (c >= sentence_length)
1467 continue;
1468 last_word = sen[c];
1469 if (last_word == -1)
1470 continue;
1471 window_offset = a * layer1_size;
1472 if (a > window)
1473 window_offset -= layer1_size;
1474 for (c = 0; c < layer1_size; c++)
1475 neu1[c + window_offset] += syn0[c
1476 + last_word * layer1_size];
1477 cw++;
1478 }
1479 if (cw) {
1480 for (a = 0; a < window_hidden_size; a++) {
1481 c = a * window_layer_size;
1482 for (b = 0; b < window_layer_size; b++) {
1483 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1484 }
1485 }
1486 if (hs)
1487 for (d = 0; d < vocab[word].codelen; d++) {
1488 f = 0;
1489 l2 = vocab[word].point[d] * window_hidden_size;
1490 // Propagate hidden -> output
1491 for (c = 0; c < window_hidden_size; c++)
1492 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1493 if (f <= -MAX_EXP)
1494 continue;
1495 else if (f >= MAX_EXP)
1496 continue;
1497 else
1498 f = expTable[(int) ((f + MAX_EXP)
1499 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1500 // 'g' is the gradient multiplied by the learning rate
1501 g = (1 - vocab[word].code[d] - f) * alpha;
1502 // Propagate errors output -> hidden
1503 for (c = 0; c < window_hidden_size; c++)
1504 neu2e[c] += dHardTanh(neu2[c], g) * g
1505 * syn_hidden_word[c + l2];
1506 // Learn weights hidden -> output
1507 for (c = 0; c < window_hidden_size; c++)
1508 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1509 * neu2[c];
1510 }
1511 // NEGATIVE SAMPLING
1512 if (negative > 0)
1513 for (d = 0; d < negative + 1; d++) {
1514 if (d == 0) {
1515 target = word;
1516 label = 1;
1517 } else {
1518 next_random = next_random
1519 * (unsigned long long) 25214903917 + 11;
1520 if (word_to_group != NULL
1521 && word_to_group[word] != -1) {
1522 target = word;
1523 while (target == word) {
1524 target = group_to_table[word_to_group[word]
1525 * table_size
1526 + (next_random >> 16) % table_size];
1527 next_random = next_random
1528 * (unsigned long long) 25214903917
1529 + 11;
1530 }
1531 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1532 } else {
1533 target =
1534 table[(next_random >> 16) % table_size];
1535 }
1536 if (target == 0)
1537 target = next_random % (vocab_size - 1) + 1;
1538 if (target == word)
1539 continue;
1540 label = 0;
1541 }
1542 l2 = target * window_hidden_size;
1543 f = 0;
1544 for (c = 0; c < window_hidden_size; c++)
1545 f += hardTanh(neu2[c])
1546 * syn_hidden_word_neg[c + l2];
1547 if (f > MAX_EXP)
1548 g = (label - 1) * alpha / negative;
1549 else if (f < -MAX_EXP)
1550 g = (label - 0) * alpha / negative;
1551 else
1552 g = (label
1553 - expTable[(int) ((f + MAX_EXP)
1554 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1555 * alpha / negative;
1556 for (c = 0; c < window_hidden_size; c++)
1557 neu2e[c] += dHardTanh(neu2[c], g) * g
1558 * syn_hidden_word_neg[c + l2];
1559 for (c = 0; c < window_hidden_size; c++)
1560 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1561 * g * neu2[c];
1562 }
1563 for (a = 0; a < window_hidden_size; a++)
1564 for (b = 0; b < window_layer_size; b++)
1565 neu1e[b] += neu2e[a]
1566 * syn_window_hidden[a * window_layer_size + b];
1567 for (a = 0; a < window_hidden_size; a++)
1568 for (b = 0; b < window_layer_size; b++)
1569 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1570 * neu1[b];
1571 // hidden -> in
1572 for (a = 0; a < window * 2 + 1; a++)
1573 if (a != window) {
1574 c = sentence_position - window + a;
1575 if (c < 0)
1576 continue;
1577 if (c >= sentence_length)
1578 continue;
1579 last_word = sen[c];
1580 if (last_word == -1)
1581 continue;
1582 window_offset = a * layer1_size;
1583 if (a > window)
1584 window_offset -= layer1_size;
1585 for (c = 0; c < layer1_size; c++)
1586 syn0[c + last_word * layer1_size] += neu1e[c
1587 + window_offset];
1588 }
1589 }
1590 } else {
1591 printf("unknown type %i", type);
1592 exit(0);
1593 }
1594 sentence_position++;
1595 if (sentence_position >= sentence_length) {
1596 sentence_length = 0;
1597 continue;
1598 }
1599 }
1600 fclose(fi);
1601 free(neu1);
1602 free(neu1e);
1603 pthread_exit(NULL);
1604}
1605
1606void ShowCollocations() {
1607 long a, b, c, d, window_offset, target, max_target=0, maxmax_target;
1608 real f, max_f, maxmax_f;
1609 real *target_sums;
1610 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1611
1612 for (d = cc; d < vocab_size; d++) {
1613 for (b = 0; b < vocab_size; b++)
1614 target_sums[b]=0;
1615 maxmax_f = -1;
1616 maxmax_target = 0;
1617 for (a = window * 2; a >= 0; a--) {
1618 if (a != window) {
1619 max_f = -1;
1620 window_offset = a * layer1_size;
1621 if (a > window)
1622 window_offset -= layer1_size;
1623 for(target = 0; target < vocab_size; target ++) {
1624 if(target == d)
1625 continue;
1626 f = 0;
1627 for (c = 0; c < layer1_size; c++)
1628 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1629 if (f < -MAX_EXP)
1630 continue;
1631 else if (f > MAX_EXP)
1632 continue;
1633 else
1634 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1635 if(f > max_f) {
1636 max_f = f;
1637 max_target = target;
1638 }
1639 target_sums[target] += (1-target_sums[target]) * f;
1640 }
1641 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1642 if(max_f > maxmax_f) {
1643 maxmax_f = max_f;
1644 maxmax_target = max_target;
1645 }
1646 } else {
1647 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1648 }
1649 }
1650 max_f = -1;
1651 for (b = 0; b < vocab_size; b++) {
1652 if(target_sums[b] > max_f) {
1653 max_f = target_sums[b];
1654 max_target = b;
1655 }
1656 }
1657 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1658 vocab[max_target].word, max_f,
1659 vocab[maxmax_target].word, maxmax_f);
1660 }
1661}
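// Example invocation (illustrative; file names are placeholders): -show-cc
// needs a vocabulary and a type-3 net trained with negative sampling, e.g.
//   ./word2vec -train corpus.txt -read-vocab vocab.txt -read-net net.bin
//       -output /dev/null -type 3 -negative 5 -show-cc 100
// which prints, for every word from rank 100 onwards, its strongest collocator
// at each window position.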
1662
1663void TrainModel() {
1664 long a, b, c, d;
1665 FILE *fo;
1666 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1667 printf("Starting training using file %s\n", train_file);
1668 starting_alpha = alpha;
1669 if (read_vocab_file[0] != 0)
1670 ReadVocab();
1671 else
1672 LearnVocabFromTrainFile();
1673 if (save_vocab_file[0] != 0)
1674 SaveVocab();
1675 if (output_file[0] == 0)
1676 return;
1677 InitNet();
1678 if(cc > 0)
1679 ShowCollocations();
1680 if (negative > 0 || nce > 0)
1681 InitUnigramTable();
1682 if (negative_classes_file[0] != 0)
1683 InitClassUnigramTable();
1684 start = clock();
1685 for (a = 0; a < num_threads; a++)
1686 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1687 for (a = 0; a < num_threads; a++)
1688 pthread_join(pt[a], NULL);
1689 fo = fopen(output_file, "wb");
1690 if (classes == 0) {
1691 // Save the word vectors
1692 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1693 for (a = 0; a < vocab_size; a++) {
1694 fprintf(fo, "%s ", vocab[a].word);
1695 if (binary)
1696 for (b = 0; b < layer1_size; b++)
1697 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1698 else
1699 for (b = 0; b < layer1_size; b++)
1700 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1701 fprintf(fo, "\n");
1702 }
1703 } else {
1704 // Run K-means on the word vectors
1705 int clcn = classes, iter = 10, closeid;
1706 int *centcn = (int *) malloc(classes * sizeof(int));
1707 int *cl = (int *) calloc(vocab_size, sizeof(int));
1708 real closev, x;
1709 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1710 for (a = 0; a < vocab_size; a++)
1711 cl[a] = a % clcn;
1712 for (a = 0; a < iter; a++) {
1713 for (b = 0; b < clcn * layer1_size; b++)
1714 cent[b] = 0;
1715 for (b = 0; b < clcn; b++)
1716 centcn[b] = 1;
1717 for (c = 0; c < vocab_size; c++) {
1718 for (d = 0; d < layer1_size; d++)
1719 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1720 centcn[cl[c]]++;
1721 }
1722 for (b = 0; b < clcn; b++) {
1723 closev = 0;
1724 for (c = 0; c < layer1_size; c++) {
1725 cent[layer1_size * b + c] /= centcn[b];
1726 closev += cent[layer1_size * b + c]
1727 * cent[layer1_size * b + c];
1728 }
1729 closev = sqrt(closev);
1730 for (c = 0; c < layer1_size; c++)
1731 cent[layer1_size * b + c] /= closev;
1732 }
1733 for (c = 0; c < vocab_size; c++) {
1734 closev = -10;
1735 closeid = 0;
1736 for (d = 0; d < clcn; d++) {
1737 x = 0;
1738 for (b = 0; b < layer1_size; b++)
1739 x += cent[layer1_size * d + b]
1740 * syn0[c * layer1_size + b];
1741 if (x > closev) {
1742 closev = x;
1743 closeid = d;
1744 }
1745 }
1746 cl[c] = closeid;
1747 }
1748 }
1749 // Save the K-means classes
1750 for (a = 0; a < vocab_size; a++)
1751 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1752 free(centcn);
1753 free(cent);
1754 free(cl);
1755 }
1756 fclose(fo);
1757 if (save_net_file[0] != 0)
1758 SaveNet();
1759}
1760
1761int ArgPos(char *str, int argc, char **argv) {
1762 int a;
1763 for (a = 1; a < argc; a++)
1764 if (!strcmp(str, argv[a])) {
1765 if (a == argc - 1) {
1766 printf("Argument missing for %s\n", str);
1767 exit(1);
1768 }
1769 return a;
1770 }
1771 return -1;
1772}
1773
1774int main(int argc, char **argv) {
1775 int i;
1776 if (argc == 1) {
1777 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1778 printf("Options:\n");
1779 printf("Parameters for training:\n");
1780 printf("\t-train <file>\n");
1781 printf("\t\tUse text data from <file> to train the model\n");
1782 printf("\t-output <file>\n");
1783 printf(
1784 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1785 printf("\t-size <int>\n");
1786 printf("\t\tSet size of word vectors; default is 100\n");
1787 printf("\t-window <int>\n");
1788 printf("\t\tSet max skip length between words; default is 5\n");
1789 printf("\t-sample <float>\n");
1790 printf(
1791 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1792 printf(
1793 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1794 printf("\t-hs <int>\n");
1795 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1796 printf("\t-negative <int>\n");
1797 printf(
1798 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1799 printf("\t-negative-classes <file>\n");
1800 printf("\t\tNegative classes to sample from\n");
1801 printf("\t-nce <int>\n");
1802 printf(
1803 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1804 printf("\t-threads <int>\n");
1805 printf("\t\tUse <int> threads (default 12)\n");
1806 printf("\t-iter <int>\n");
1807 printf("\t\tRun more training iterations (default 5)\n");
1808 printf("\t-min-count <int>\n");
1809 printf(
1810 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1811 printf("\t-alpha <float>\n");
1812 printf(
1813 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1814 printf("\t-classes <int>\n");
1815 printf(
1816 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1817 printf("\t-debug <int>\n");
1818 printf(
1819 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1820 printf("\t-binary <int>\n");
1821 printf(
1822 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1823 printf("\t-save-vocab <file>\n");
1824 printf("\t\tThe vocabulary will be saved to <file>\n");
1825 printf("\t-read-vocab <file>\n");
1826 printf(
1827 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1828 printf("\t-read-net <file>\n");
1829 printf(
1830 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1831 printf("\t-save-net <file>\n");
1832 printf("\t\tThe net parameters will be saved to <file>\n");
1833 printf("\t-show-cc <int>\n");
1834 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
1835 printf("\t-type <int>\n");
1836 printf(
1837 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1838 printf("\t-cap <int>\n");
1839 printf(
1840 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1841 printf("\nExamples:\n");
1842 printf(
1843 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1844 return 0;
1845 }
1846 output_file[0] = 0;
1847 save_vocab_file[0] = 0;
1848 read_vocab_file[0] = 0;
1849 save_net_file[0] = 0;
1850 read_net_file[0] = 0;
1851 negative_classes_file[0] = 0;
1852 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1853 layer1_size = atoi(argv[i + 1]);
1854 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1855 strcpy(train_file, argv[i + 1]);
1856 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1857 strcpy(save_vocab_file, argv[i + 1]);
1858 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1859 strcpy(read_vocab_file, argv[i + 1]);
1860 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1861 strcpy(save_net_file, argv[i + 1]);
1862 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1863 strcpy(read_net_file, argv[i + 1]);
1864 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1865 debug_mode = atoi(argv[i + 1]);
1866 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1867 binary = atoi(argv[i + 1]);
1868 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
1869 cc = atoi(argv[i + 1]);
1870 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1871 type = atoi(argv[i + 1]);
1872 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1873 strcpy(output_file, argv[i + 1]);
1874 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1875 window = atoi(argv[i + 1]);
1876 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1877 sample = atof(argv[i + 1]);
1878 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1879 hs = atoi(argv[i + 1]);
1880 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1881 negative = atoi(argv[i + 1]);
1882 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1883 strcpy(negative_classes_file, argv[i + 1]);
1884 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1885 nce = atoi(argv[i + 1]);
1886 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1887 num_threads = atoi(argv[i + 1]);
1888 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1889 iter = atoi(argv[i + 1]);
1890 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1891 min_count = atoi(argv[i + 1]);
1892 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1893 classes = atoi(argv[i + 1]);
1894 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1895 cap = atoi(argv[i + 1]);
1896 if (type == 0 || type == 2 || type == 4)
1897 alpha = 0.05;
1898 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1899 alpha = atof(argv[i + 1]);
1900 vocab = (struct vocab_word *) calloc(vocab_max_size,
1901 sizeof(struct vocab_word));
1902 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1903 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1904 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1905 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1906 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1907 }
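// Illustrative values (MAX_EXP = 6, EXP_TABLE_SIZE = 1000 as defined above):
// expTable[i] holds sigmoid((i / 1000.0 * 2 - 1) * 6), so expTable[0] is about
// sigmoid(-6) = 0.0025, expTable[500] = sigmoid(0) = 0.5, and expTable[999] is
// about sigmoid(6) = 0.9975; the training loops look these values up instead
// of calling exp() for every dot product.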
1908 TrainModel();
1909 return 0;
1910}
1911