1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CODE_LENGTH 40
26
27const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
28
29typedef float real; // Precision of float numbers
30
31struct vocab_word {
32 long long cn;
33 int *point;
34 char *word, *code, codelen;
35};
36
37char train_file[MAX_STRING], output_file[MAX_STRING];
38char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
39char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
40struct vocab_word *vocab;
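// type selects the architecture: 0 = CBOW, 1 = skip-gram, 2 = CWindow, 3 = structured skip-gram, 4 = SENNA-style (see -type in the usage text)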
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
42 num_threads = 12, min_reduce = 1;
43int *vocab_hash;
44long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
45long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
46 classes = 0;
47real alpha = 0.025, starting_alpha, sample = 1e-3;
48real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
49clock_t start;
50
51real *syn1_window, *syn1neg_window, *syn1nce_window;
52int w_offset, window_layer_size;
53
54int window_hidden_size = 500;
55real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
56 *syn_hidden_word_nce;
57
58int hs = 0, negative = 5;
59const int table_size = 1e8;
60int *table;
61
62long cc = 0; // word rank at which -show-cc starts listing collocations
63
64// contrastive negative sampling
65char negative_classes_file[MAX_STRING];
66int *word_to_group;
67int *group_to_table; //group_size*table_size
68int class_number;
69
70//nce
71real* noise_distribution;
72int nce = 0;
73
74//param caps
75real CAP_VALUE = 50;
76int cap = 0;
77
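// Clamp a single parameter to [-CAP_VALUE, CAP_VALUE]; only applied when the -cap option is enabled.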
78void capParam(real* array, int index) {
79 if (array[index] > CAP_VALUE)
80 array[index] = CAP_VALUE;
81 else if (array[index] < -CAP_VALUE)
82 array[index] = -CAP_VALUE;
83}
84
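// Hard tanh activation and its gradient gate; used by the hidden layer of the type-4 (SENNA-style) model.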
85real hardTanh(real x) {
86 if (x >= 1) {
87 return 1;
88 } else if (x <= -1) {
89 return -1;
90 } else {
91 return x;
92 }
93}
94
95real dHardTanh(real x, real g) {
96 if (x > 1 && g > 0) {
97 return 0;
98 }
99 if (x < -1 && g < 0) {
100 return 0;
101 }
102 return 1;
103}
104
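// Build the unigram table used to draw negative samples: each word receives a share of the table
// proportional to its count raised to the power 0.75. The same distribution is stored in
// noise_distribution for NCE.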
105void InitUnigramTable() {
106 int a, i;
107 long long train_words_pow = 0;
108 real d1, power = 0.75;
109 table = (int *) malloc(table_size * sizeof(int));
110 for (a = 0; a < vocab_size; a++)
111 train_words_pow += pow(vocab[a].cn, power);
112 i = 0;
113 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
114 for (a = 0; a < table_size; a++) {
115 table[a] = i;
116 if (a / (real) table_size > d1) {
117 i++;
118 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
119 }
120 if (i >= vocab_size)
121 i = vocab_size - 1;
122 }
123
124 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
125 for (a = 0; a < vocab_size; a++)
126 noise_distribution[a] = pow(vocab[a].cn, power)
127 / (real) train_words_pow;
128}
129
130// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
131void ReadWord(char *word, FILE *fin) {
132 int a = 0, ch;
133 while (!feof(fin)) {
134 ch = fgetc(fin);
135 if (ch == 13)
136 continue;
137 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
138 if (a > 0) {
139 if (ch == '\n')
140 ungetc(ch, fin);
141 break;
142 }
143 if (ch == '\n') {
144 strcpy(word, (char *) "</s>");
145 return;
146 } else
147 continue;
148 }
149 word[a] = ch;
150 a++;
151 if (a >= MAX_STRING - 1)
152 a--; // Truncate words that are too long
153 }
154 word[a] = 0;
155}
156
157// Returns hash value of a word
158int GetWordHash(char *word) {
159 unsigned long long a, hash = 0;
160 for (a = 0; a < strlen(word); a++)
161 hash = hash * 257 + word[a];
162 hash = hash % vocab_hash_size;
163 return hash;
164}
165
166// Returns position of a word in the vocabulary; if the word is not found, returns -1
167int SearchVocab(char *word) {
168 unsigned int hash = GetWordHash(word);
169 while (1) {
170 if (vocab_hash[hash] == -1)
171 return -1;
172 if (!strcmp(word, vocab[vocab_hash[hash]].word))
173 return vocab_hash[hash];
174 hash = (hash + 1) % vocab_hash_size;
175 }
176 return -1;
177}
178
179// Reads a word and returns its index in the vocabulary
180int ReadWordIndex(FILE *fin) {
181 char word[MAX_STRING];
182 ReadWord(word, fin);
183 if (feof(fin))
184 return -1;
185 return SearchVocab(word);
186}
187
188// Adds a word to the vocabulary
189int AddWordToVocab(char *word) {
190 unsigned int hash, length = strlen(word) + 1;
191 if (length > MAX_STRING)
192 length = MAX_STRING;
193 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
194 strcpy(vocab[vocab_size].word, word);
195 vocab[vocab_size].cn = 0;
196 vocab_size++;
197 // Reallocate memory if needed
198 if (vocab_size + 2 >= vocab_max_size) {
199 vocab_max_size += 1000;
200 vocab = (struct vocab_word *) realloc(vocab,
201 vocab_max_size * sizeof(struct vocab_word));
202 }
203 hash = GetWordHash(word);
204 while (vocab_hash[hash] != -1)
205 hash = (hash + 1) % vocab_hash_size;
206 vocab_hash[hash] = vocab_size - 1;
207 return vocab_size - 1;
208}
209
210// Used later for sorting by word counts
211int VocabCompare(const void *a, const void *b) {
212 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
213}
214
215// Sorts the vocabulary by frequency using word counts
216void SortVocab() {
217 int a, size;
218 unsigned int hash;
219 // Sort the vocabulary and keep </s> at the first position
220 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
221 for (a = 0; a < vocab_hash_size; a++)
222 vocab_hash[a] = -1;
223 size = vocab_size;
224 train_words = 0;
225 for (a = 0; a < size; a++) {
226 // Words occurring less than min_count times will be discarded from the vocab
227 if ((vocab[a].cn < min_count) && (a != 0)) {
228 vocab_size--;
229 free(vocab[a].word);
230 } else {
231 // Hash must be recomputed, since it is no longer valid after sorting
232 hash = GetWordHash(vocab[a].word);
233 while (vocab_hash[hash] != -1)
234 hash = (hash + 1) % vocab_hash_size;
235 vocab_hash[hash] = a;
236 train_words += vocab[a].cn;
237 }
238 }
239 vocab = (struct vocab_word *) realloc(vocab,
240 (vocab_size + 1) * sizeof(struct vocab_word));
241 // Allocate memory for the binary tree construction
242 for (a = 0; a < vocab_size; a++) {
243 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
244 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
245 }
246}
247
248// Reduces the vocabulary by removing infrequent tokens
249void ReduceVocab() {
250 int a, b = 0;
251 unsigned int hash;
252 for (a = 0; a < vocab_size; a++)
253 if (vocab[a].cn > min_reduce) {
254 vocab[b].cn = vocab[a].cn;
255 vocab[b].word = vocab[a].word;
256 b++;
257 } else
258 free(vocab[a].word);
259 vocab_size = b;
260 for (a = 0; a < vocab_hash_size; a++)
261 vocab_hash[a] = -1;
262 for (a = 0; a < vocab_size; a++) {
263 // Hash must be recomputed, since it is no longer valid
264 hash = GetWordHash(vocab[a].word);
265 while (vocab_hash[hash] != -1)
266 hash = (hash + 1) % vocab_hash_size;
267 vocab_hash[hash] = a;
268 }
269 fflush(stdout);
270 min_reduce++;
271}
272
273// Create binary Huffman tree using the word counts
274// Frequent words will have short unique binary codes
275void CreateBinaryTree() {
276 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
277 char code[MAX_CODE_LENGTH];
278 long long *count = (long long *) calloc(vocab_size * 2 + 1,
279 sizeof(long long));
280 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
281 sizeof(long long));
282 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
283 sizeof(long long));
284 for (a = 0; a < vocab_size; a++)
285 count[a] = vocab[a].cn;
286 for (a = vocab_size; a < vocab_size * 2; a++)
287 count[a] = 1e15;
288 pos1 = vocab_size - 1;
289 pos2 = vocab_size;
290 // The following algorithm constructs the Huffman tree by adding one node at a time
291 for (a = 0; a < vocab_size - 1; a++) {
292 // First, find two smallest nodes 'min1, min2'
293 if (pos1 >= 0) {
294 if (count[pos1] < count[pos2]) {
295 min1i = pos1;
296 pos1--;
297 } else {
298 min1i = pos2;
299 pos2++;
300 }
301 } else {
302 min1i = pos2;
303 pos2++;
304 }
305 if (pos1 >= 0) {
306 if (count[pos1] < count[pos2]) {
307 min2i = pos1;
308 pos1--;
309 } else {
310 min2i = pos2;
311 pos2++;
312 }
313 } else {
314 min2i = pos2;
315 pos2++;
316 }
317 count[vocab_size + a] = count[min1i] + count[min2i];
318 parent_node[min1i] = vocab_size + a;
319 parent_node[min2i] = vocab_size + a;
320 binary[min2i] = 1;
321 }
322 // Now assign binary code to each vocabulary word
323 for (a = 0; a < vocab_size; a++) {
324 b = a;
325 i = 0;
326 while (1) {
327 code[i] = binary[b];
328 point[i] = b;
329 i++;
330 b = parent_node[b];
331 if (b == vocab_size * 2 - 2)
332 break;
333 }
334 vocab[a].codelen = i;
335 vocab[a].point[0] = vocab_size - 2;
336 for (b = 0; b < i; b++) {
337 vocab[a].code[i - b - 1] = code[b];
338 vocab[a].point[i - b] = point[b] - vocab_size;
339 }
340 }
341 free(count);
342 free(binary);
343 free(parent_node);
344}
345
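// Build the vocabulary by scanning the training file once; the vocabulary is pruned with
// ReduceVocab() whenever it fills more than 70% of the hash table.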
346void LearnVocabFromTrainFile() {
347 char word[MAX_STRING];
348 FILE *fin;
349 long long a, i;
350 for (a = 0; a < vocab_hash_size; a++)
351 vocab_hash[a] = -1;
352 fin = fopen(train_file, "rb");
353 if (fin == NULL) {
354 printf("ERROR: training data file not found!\n");
355 exit(1);
356 }
357 vocab_size = 0;
358 AddWordToVocab((char *) "</s>");
359 while (1) {
360 ReadWord(word, fin);
361 if (feof(fin))
362 break;
363 train_words++;
364 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
365 printf("%lldK%c", train_words / 1000, 13);
366 fflush(stdout);
367 }
368 i = SearchVocab(word);
369 if (i == -1) {
370 a = AddWordToVocab(word);
371 vocab[a].cn = 1;
372 } else
373 vocab[i].cn++;
374 if (vocab_size > vocab_hash_size * 0.7)
375 ReduceVocab();
376 }
377 SortVocab();
378 if (debug_mode > 0) {
379 printf("Vocab size: %lld\n", vocab_size);
380 printf("Words in train file: %lld\n", train_words);
381 }
382 file_size = ftell(fin);
383 fclose(fin);
384}
385
386void SaveVocab() {
387 long long i;
388 FILE *fo = fopen(save_vocab_file, "wb");
389 for (i = 0; i < vocab_size; i++)
390 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
391 fclose(fo);
392}
393
394void ReadVocab() {
395 long long a, i = 0;
396 char c;
397 char word[MAX_STRING];
398 FILE *fin = fopen(read_vocab_file, "rb");
399 if (fin == NULL) {
400 printf("Vocabulary file not found\n");
401 exit(1);
402 }
403 for (a = 0; a < vocab_hash_size; a++)
404 vocab_hash[a] = -1;
405 vocab_size = 0;
406 while (1) {
407 ReadWord(word, fin);
408 if (feof(fin))
409 break;
410 a = AddWordToVocab(word);
411 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
412 i++;
413 }
414 SortVocab();
415 if (debug_mode > 0) {
416 printf("Vocab size: %lld\n", vocab_size);
417 printf("Words in train file: %lld\n", train_words);
418 }
419 fin = fopen(train_file, "rb");
420 if (fin == NULL) {
421 printf("ERROR: training data file not found!\n");
422 exit(1);
423 }
424 fseek(fin, 0, SEEK_END);
425 file_size = ftell(fin);
426 fclose(fin);
427}
428
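// Read the -negative-classes file and build one unigram table per word class, so that
// negative samples for a word can be drawn from the same class.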
429void InitClassUnigramTable() {
430 long long a, c;
431 printf("loading class unigrams \n");
432 FILE *fin = fopen(negative_classes_file, "rb");
433 if (fin == NULL) {
434 printf("ERROR: class file not found!\n");
435 exit(1);
436 }
437 word_to_group = (int *) malloc(vocab_size * sizeof(int));
438 for (a = 0; a < vocab_size; a++)
439 word_to_group[a] = -1;
440 char class[MAX_STRING];
441 char prev_class[MAX_STRING];
442 prev_class[0] = 0;
443 char word[MAX_STRING];
444 class_number = -1;
445 while (1) {
446 if (feof(fin))
447 break;
448 ReadWord(class, fin);
449 ReadWord(word, fin);
450 int word_index = SearchVocab(word);
451 if (word_index != -1) {
452 if (strcmp(class, prev_class) != 0) {
453 class_number++;
454 strcpy(prev_class, class);
455 }
456 word_to_group[word_index] = class_number;
457 }
458 ReadWord(word, fin);
459 }
460 class_number++;
461 fclose(fin);
462
463 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
464 long long train_words_pow = 0;
465 real d1, power = 0.75;
466
467 for (c = 0; c < class_number; c++) {
468 long long offset = c * table_size;
469 train_words_pow = 0;
470 for (a = 0; a < vocab_size; a++)
471 if (word_to_group[a] == c)
472 train_words_pow += pow(vocab[a].cn, power);
473 int i = 0;
474 while (i < vocab_size && word_to_group[i] != c)
475 i++;
476 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
477 for (a = 0; a < table_size; a++) {
478 //printf("index %lld , word %d\n", a, i);
479 group_to_table[offset + a] = i;
480 if (a / (real) table_size > d1) {
481 i++;
482 while (i < vocab_size && word_to_group[i] != c)
483 i++;
484 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
485 }
486 if (i >= vocab_size)
487 for (i = vocab_size - 1; i >= 0 && word_to_group[i] != c; i--)
488 ; // walk back to the last word that belongs to class c
489 }
490 }
491}
492
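// Write syn0 and syn1neg_window to -save-net; only supported for type 3 with negative sampling.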
493void SaveNet() {
494 if(type != 3 || negative <= 0) {
495 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
496 return;
497 }
498
499 FILE *fnet = fopen(save_net_file, "wb");
500 if (fnet == NULL) {
501 printf("Net parameter file not found\n");
502 exit(1);
503 }
504 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
505 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
506 fclose(fnet);
507}
508
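// Allocate and initialize all network parameters; which output matrices are created depends on
// -type, -hs, -negative and -nce. With -read-net, syn0 and syn1neg_window are loaded from file
// instead of being initialized randomly.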
509void InitNet() {
510 long long a, b;
511 unsigned long long next_random = 1;
512 window_layer_size = layer1_size * window * 2;
513 a = posix_memalign((void **) &syn0, 128,
514 (long long) vocab_size * layer1_size * sizeof(real));
515 if (syn0 == NULL) {
516 printf("Memory allocation failed\n");
517 exit(1);
518 }
519
520 if (hs) {
521 a = posix_memalign((void **) &syn1, 128,
522 (long long) vocab_size * layer1_size * sizeof(real));
523 if (syn1 == NULL) {
524 printf("Memory allocation failed\n");
525 exit(1);
526 }
527 a = posix_memalign((void **) &syn1_window, 128,
528 (long long) vocab_size * window_layer_size * sizeof(real));
529 if (syn1_window == NULL) {
530 printf("Memory allocation failed\n");
531 exit(1);
532 }
533 a = posix_memalign((void **) &syn_hidden_word, 128,
534 (long long) vocab_size * window_hidden_size * sizeof(real));
535 if (syn_hidden_word == NULL) {
536 printf("Memory allocation failed\n");
537 exit(1);
538 }
539
540 for (a = 0; a < vocab_size; a++)
541 for (b = 0; b < layer1_size; b++)
542 syn1[a * layer1_size + b] = 0;
543 for (a = 0; a < vocab_size; a++)
544 for (b = 0; b < window_layer_size; b++)
545 syn1_window[a * window_layer_size + b] = 0;
546 for (a = 0; a < vocab_size; a++)
547 for (b = 0; b < window_hidden_size; b++)
548 syn_hidden_word[a * window_hidden_size + b] = 0;
549 }
550 if (negative > 0) {
551 if(type == 0) {
552 a = posix_memalign((void **) &syn1neg, 128,
553 (long long) vocab_size * layer1_size * sizeof(real));
554 if (syn1neg == NULL) {
555 printf("Memory allocation failed\n");
556 exit(1);
557 }
558 for (a = 0; a < vocab_size; a++)
559 for (b = 0; b < layer1_size; b++)
560 syn1neg[a * layer1_size + b] = 0;
561 } else if (type == 3) {
562 a = posix_memalign((void **) &syn1neg_window, 128,
563 (long long) vocab_size * window_layer_size * sizeof(real));
564 if (syn1neg_window == NULL) {
565 printf("Memory allocation failed\n");
566 exit(1);
567 }
568 for (a = 0; a < vocab_size; a++)
569 for (b = 0; b < window_layer_size; b++)
570 syn1neg_window[a * window_layer_size + b] = 0;
571 } else if (type == 4) {
572 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
573 (long long) vocab_size * window_hidden_size * sizeof(real));
574 if (syn_hidden_word_neg == NULL) {
575 printf("Memory allocation failed\n");
576 exit(1);
577 }
578 for (a = 0; a < vocab_size; a++)
579 for (b = 0; b < window_hidden_size; b++)
580 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
581 }
582 }
583 if (nce > 0) {
584 a = posix_memalign((void **) &syn1nce, 128,
585 (long long) vocab_size * layer1_size * sizeof(real));
586 if (syn1nce == NULL) {
587 printf("Memory allocation failed\n");
588 exit(1);
589 }
590 a = posix_memalign((void **) &syn1nce_window, 128,
591 (long long) vocab_size * window_layer_size * sizeof(real));
592 if (syn1nce_window == NULL) {
593 printf("Memory allocation failed\n");
594 exit(1);
595 }
596 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
597 (long long) vocab_size * window_hidden_size * sizeof(real));
598 if (syn_hidden_word_nce == NULL) {
599 printf("Memory allocation failed\n");
600 exit(1);
601 }
602
603 for (a = 0; a < vocab_size; a++)
604 for (b = 0; b < layer1_size; b++)
605 syn1nce[a * layer1_size + b] = 0;
606 for (a = 0; a < vocab_size; a++)
607 for (b = 0; b < window_layer_size; b++)
608 syn1nce_window[a * window_layer_size + b] = 0;
609 for (a = 0; a < vocab_size; a++)
610 for (b = 0; b < window_hidden_size; b++)
611 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
612 }
613
614 if(type == 4) {
615 a = posix_memalign((void **) &syn_window_hidden, 128,
616 window_hidden_size * window_layer_size * sizeof(real));
617 if (syn_window_hidden == NULL) {
618 printf("Memory allocation failed\n");
619 exit(1);
620 }
621 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
622 next_random = next_random * (unsigned long long) 25214903917 + 11;
623 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
624 - 0.5) / (window_hidden_size * window_layer_size);
625 }
626 }
627
628 if (read_net_file[0] == 0) {
629 for (a = 0; a < vocab_size; a++)
630 for (b = 0; b < layer1_size; b++) {
631 next_random = next_random * (unsigned long long) 25214903917
632 + 11;
633 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
634 / (real) 65536) - 0.5) / layer1_size;
635 }
636 } else if(type == 3 && negative > 0) {
637 FILE *fnet = fopen(read_net_file, "rb");
638 if (fnet == NULL) {
639 printf("Net parameter file not found\n");
640 exit(1);
641 }
642 fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
643 fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
644 fclose(fnet);
645 } else {
646 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
647 exit(-1);
648 }
649
650 CreateBinaryTree();
651}
652
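// One training thread: processes its slice of the training file and applies the update rules
// (hierarchical softmax, negative sampling and/or NCE) for the selected architecture.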
653void *TrainModelThread(void *id) {
654 long long a, b, d, cw, word, last_word, sentence_length = 0,
655 sentence_position = 0;
656 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
657 long long l1, l2, c, target, label, local_iter = iter;
658 unsigned long long next_random = (long long) id;
659 real f, g;
660 clock_t now;
661 int input_len_1 = layer1_size;
662 int window_offset = -1;
663 if (type == 2 || type == 4) {
664 input_len_1 = window_layer_size;
665 }
666 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
667 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
668
669 int input_len_2 = 0;
670 if (type == 4) {
671 input_len_2 = window_hidden_size;
672 }
673 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
674 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
675
676 FILE *fi = fopen(train_file, "rb");
677 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
678 while (1) {
679 if (word_count - last_word_count > 10000) {
680 word_count_actual += word_count - last_word_count;
681 last_word_count = word_count;
682 if ((debug_mode > 1)) {
683 now = clock();
684 printf(
685 "%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
686 13, alpha,
687 word_count_actual / (real) (iter * train_words + 1)
688 * 100,
689 word_count_actual
690 / ((real) (now - start + 1)
691 / (real) CLOCKS_PER_SEC * 1000));
692 fflush(stdout);
693 }
694 alpha = starting_alpha
695 * (1 - word_count_actual / (real) (iter * train_words + 1));
696 if (alpha < starting_alpha * 0.0001)
697 alpha = starting_alpha * 0.0001;
698 }
699 if (sentence_length == 0) {
700 while (1) {
701 word = ReadWordIndex(fi);
702 if (feof(fi))
703 break;
704 if (word == -1)
705 continue;
706 word_count++;
707 if (word == 0)
708 break;
709 // The subsampling randomly discards frequent words while keeping the ranking the same
710 if (sample > 0) {
711 real ran = (sqrt(vocab[word].cn / (sample * train_words))
712 + 1) * (sample * train_words) / vocab[word].cn;
713 next_random = next_random * (unsigned long long) 25214903917
714 + 11;
715 if (ran < (next_random & 0xFFFF) / (real) 65536)
716 continue;
717 }
718 sen[sentence_length] = word;
719 sentence_length++;
720 if (sentence_length >= MAX_SENTENCE_LENGTH)
721 break;
722 }
723 sentence_position = 0;
724 }
725 if (feof(fi) || (word_count > train_words / num_threads)) {
726 word_count_actual += word_count - last_word_count;
727 local_iter--;
728 if (local_iter == 0)
729 break;
730 word_count = 0;
731 last_word_count = 0;
732 sentence_length = 0;
733 fseek(fi, file_size / (long long) num_threads * (long long) id,
734 SEEK_SET);
735 continue;
736 }
737 word = sen[sentence_position];
738 if (word == -1)
739 continue;
740 for (c = 0; c < input_len_1; c++)
741 neu1[c] = 0;
742 for (c = 0; c < input_len_1; c++)
743 neu1e[c] = 0;
744 for (c = 0; c < input_len_2; c++)
745 neu2[c] = 0;
746 for (c = 0; c < input_len_2; c++)
747 neu2e[c] = 0;
748 next_random = next_random * (unsigned long long) 25214903917 + 11;
749 b = next_random % window;
750 if (type == 0) { //train the cbow architecture
751 // in -> hidden
752 cw = 0;
753 for (a = b; a < window * 2 + 1 - b; a++)
754 if (a != window) {
755 c = sentence_position - window + a;
756 if (c < 0)
757 continue;
758 if (c >= sentence_length)
759 continue;
760 last_word = sen[c];
761 if (last_word == -1)
762 continue;
763 for (c = 0; c < layer1_size; c++)
764 neu1[c] += syn0[c + last_word * layer1_size];
765 cw++;
766 }
767 if (cw) {
768 for (c = 0; c < layer1_size; c++)
769 neu1[c] /= cw;
770 if (hs)
771 for (d = 0; d < vocab[word].codelen; d++) {
772 f = 0;
773 l2 = vocab[word].point[d] * layer1_size;
774 // Propagate hidden -> output
775 for (c = 0; c < layer1_size; c++)
776 f += neu1[c] * syn1[c + l2];
777 if (f <= -MAX_EXP)
778 continue;
779 else if (f >= MAX_EXP)
780 continue;
781 else
782 f = expTable[(int) ((f + MAX_EXP)
783 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
784 // 'g' is the gradient multiplied by the learning rate
785 g = (1 - vocab[word].code[d] - f) * alpha;
786 // Propagate errors output -> hidden
787 for (c = 0; c < layer1_size; c++)
788 neu1e[c] += g * syn1[c + l2];
789 // Learn weights hidden -> output
790 for (c = 0; c < layer1_size; c++)
791 syn1[c + l2] += g * neu1[c];
792 if (cap == 1)
793 for (c = 0; c < layer1_size; c++)
794 capParam(syn1, c + l2);
795 }
796 // NEGATIVE SAMPLING
797 if (negative > 0)
798 for (d = 0; d < negative + 1; d++) {
799 if (d == 0) {
800 target = word;
801 label = 1;
802 } else {
803 next_random = next_random
804 * (unsigned long long) 25214903917 + 11;
805 if (word_to_group != NULL
806 && word_to_group[word] != -1) {
807 target = word;
808 while (target == word) {
809 target = group_to_table[word_to_group[word]
810 * table_size
811 + (next_random >> 16) % table_size];
812 next_random = next_random
813 * (unsigned long long) 25214903917
814 + 11;
815 }
816 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
817 } else {
818 target =
819 table[(next_random >> 16) % table_size];
820 }
821 if (target == 0)
822 target = next_random % (vocab_size - 1) + 1;
823 if (target == word)
824 continue;
825 label = 0;
826 }
827 l2 = target * layer1_size;
828 f = 0;
829 for (c = 0; c < layer1_size; c++)
830 f += neu1[c] * syn1neg[c + l2];
831 if (f > MAX_EXP)
832 g = (label - 1) * alpha;
833 else if (f < -MAX_EXP)
834 g = (label - 0) * alpha;
835 else
836 g = (label
837 - expTable[(int) ((f + MAX_EXP)
838 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
839 * alpha;
840 for (c = 0; c < layer1_size; c++)
841 neu1e[c] += g * syn1neg[c + l2];
842 for (c = 0; c < layer1_size; c++)
843 syn1neg[c + l2] += g * neu1[c];
844 if (cap == 1)
845 for (c = 0; c < layer1_size; c++)
846 capParam(syn1neg, c + l2);
847 }
848 // Noise Contrastive Estimation
849 if (nce > 0)
850 for (d = 0; d < nce + 1; d++) {
851 if (d == 0) {
852 target = word;
853 label = 1;
854 } else {
855 next_random = next_random
856 * (unsigned long long) 25214903917 + 11;
857 if (word_to_group != NULL
858 && word_to_group[word] != -1) {
859 target = word;
860 while (target == word) {
861 target = group_to_table[word_to_group[word]
862 * table_size
863 + (next_random >> 16) % table_size];
864 next_random = next_random
865 * (unsigned long long) 25214903917
866 + 11;
867 }
868 } else {
869 target =
870 table[(next_random >> 16) % table_size];
871 }
872 if (target == 0)
873 target = next_random % (vocab_size - 1) + 1;
874 if (target == word)
875 continue;
876 label = 0;
877 }
878 l2 = target * layer1_size;
879 f = 0;
880
881 for (c = 0; c < layer1_size; c++)
882 f += neu1[c] * syn1nce[c + l2];
883 if (f > MAX_EXP)
884 g = (label - 1) * alpha;
885 else if (f < -MAX_EXP)
886 g = (label - 0) * alpha;
887 else {
888 f = exp(f);
889 g =
890 (label
891 - f
892 / (noise_distribution[target]
893 * nce + f)) * alpha;
894 }
895 for (c = 0; c < layer1_size; c++)
896 neu1e[c] += g * syn1nce[c + l2];
897 for (c = 0; c < layer1_size; c++)
898 syn1nce[c + l2] += g * neu1[c];
899 if (cap == 1)
900 for (c = 0; c < layer1_size; c++)
901 capParam(syn1nce, c + l2);
902 }
903 // hidden -> in
904 for (a = b; a < window * 2 + 1 - b; a++)
905 if (a != window) {
906 c = sentence_position - window + a;
907 if (c < 0)
908 continue;
909 if (c >= sentence_length)
910 continue;
911 last_word = sen[c];
912 if (last_word == -1)
913 continue;
914 for (c = 0; c < layer1_size; c++)
915 syn0[c + last_word * layer1_size] += neu1e[c];
916 }
917 }
918 } else if (type == 1) { //train skip-gram
919 for (a = b; a < window * 2 + 1 - b; a++)
920 if (a != window) {
921 c = sentence_position - window + a;
922 if (c < 0)
923 continue;
924 if (c >= sentence_length)
925 continue;
926 last_word = sen[c];
927 if (last_word == -1)
928 continue;
929 l1 = last_word * layer1_size;
930 for (c = 0; c < layer1_size; c++)
931 neu1e[c] = 0;
932 // HIERARCHICAL SOFTMAX
933 if (hs)
934 for (d = 0; d < vocab[word].codelen; d++) {
935 f = 0;
936 l2 = vocab[word].point[d] * layer1_size;
937 // Propagate hidden -> output
938 for (c = 0; c < layer1_size; c++)
939 f += syn0[c + l1] * syn1[c + l2];
940 if (f <= -MAX_EXP)
941 continue;
942 else if (f >= MAX_EXP)
943 continue;
944 else
945 f = expTable[(int) ((f + MAX_EXP)
946 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
947 // 'g' is the gradient multiplied by the learning rate
948 g = (1 - vocab[word].code[d] - f) * alpha;
949 // Propagate errors output -> hidden
950 for (c = 0; c < layer1_size; c++)
951 neu1e[c] += g * syn1[c + l2];
952 // Learn weights hidden -> output
953 for (c = 0; c < layer1_size; c++)
954 syn1[c + l2] += g * syn0[c + l1];
955 if (cap == 1)
956 for (c = 0; c < layer1_size; c++)
957 capParam(syn1, c + l2);
958 }
959 // NEGATIVE SAMPLING
960 if (negative > 0)
961 for (d = 0; d < negative + 1; d++) {
962 if (d == 0) {
963 target = word;
964 label = 1;
965 } else {
966 next_random = next_random
967 * (unsigned long long) 25214903917 + 11;
968 if (word_to_group != NULL
969 && word_to_group[word] != -1) {
970 target = word;
971 while (target == word) {
972 target =
973 group_to_table[word_to_group[word]
974 * table_size
975 + (next_random >> 16)
976 % table_size];
977 next_random =
978 next_random
979 * (unsigned long long) 25214903917
980 + 11;
981 }
982 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
983 } else {
984 target = table[(next_random >> 16)
985 % table_size];
986 }
987 if (target == 0)
988 target = next_random % (vocab_size - 1) + 1;
989 if (target == word)
990 continue;
991 label = 0;
992 }
993 l2 = target * layer1_size;
994 f = 0;
995 for (c = 0; c < layer1_size; c++)
996 f += syn0[c + l1] * syn1neg[c + l2];
997 if (f > MAX_EXP)
998 g = (label - 1) * alpha;
999 else if (f < -MAX_EXP)
1000 g = (label - 0) * alpha;
1001 else
1002 g =
1003 (label
1004 - expTable[(int) ((f + MAX_EXP)
1005 * (EXP_TABLE_SIZE
1006 / MAX_EXP / 2))])
1007 * alpha;
1008 for (c = 0; c < layer1_size; c++)
1009 neu1e[c] += g * syn1neg[c + l2];
1010 for (c = 0; c < layer1_size; c++)
1011 syn1neg[c + l2] += g * syn0[c + l1];
1012 if (cap == 1)
1013 for (c = 0; c < layer1_size; c++)
1014 capParam(syn1neg, c + l2);
1015 }
1016 //Noise Contrastive Estimation
1017 if (nce > 0)
1018 for (d = 0; d < nce + 1; d++) {
1019 if (d == 0) {
1020 target = word;
1021 label = 1;
1022 } else {
1023 next_random = next_random
1024 * (unsigned long long) 25214903917 + 11;
1025 if (word_to_group != NULL
1026 && word_to_group[word] != -1) {
1027 target = word;
1028 while (target == word) {
1029 target =
1030 group_to_table[word_to_group[word]
1031 * table_size
1032 + (next_random >> 16)
1033 % table_size];
1034 next_random =
1035 next_random
1036 * (unsigned long long) 25214903917
1037 + 11;
1038 }
1039 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1040 } else {
1041 target = table[(next_random >> 16)
1042 % table_size];
1043 }
1044 if (target == 0)
1045 target = next_random % (vocab_size - 1) + 1;
1046 if (target == word)
1047 continue;
1048 label = 0;
1049 }
1050 l2 = target * layer1_size;
1051 f = 0;
1052 for (c = 0; c < layer1_size; c++)
1053 f += syn0[c + l1] * syn1nce[c + l2];
1054 if (f > MAX_EXP)
1055 g = (label - 1) * alpha;
1056 else if (f < -MAX_EXP)
1057 g = (label - 0) * alpha;
1058 else {
1059 f = exp(f);
1060 g = (label
1061 - f
1062 / (noise_distribution[target]
1063 * nce + f)) * alpha;
1064 }
1065 for (c = 0; c < layer1_size; c++)
1066 neu1e[c] += g * syn1nce[c + l2];
1067 for (c = 0; c < layer1_size; c++)
1068 syn1nce[c + l2] += g * syn0[c + l1];
1069 if (cap == 1)
1070 for (c = 0; c < layer1_size; c++)
1071 capParam(syn1nce, c + l2);
1072 }
1073 // Learn weights input -> hidden
1074 for (c = 0; c < layer1_size; c++)
1075 syn0[c + l1] += neu1e[c];
1076 }
1077 } else if (type == 2) { //train the cwindow architecture
1078 // in -> hidden
1079 cw = 0;
1080 for (a = 0; a < window * 2 + 1; a++)
1081 if (a != window) {
1082 c = sentence_position - window + a;
1083 if (c < 0)
1084 continue;
1085 if (c >= sentence_length)
1086 continue;
1087 last_word = sen[c];
1088 if (last_word == -1)
1089 continue;
1090 window_offset = a * layer1_size;
1091 if (a > window)
1092 window_offset -= layer1_size;
1093 for (c = 0; c < layer1_size; c++)
1094 neu1[c + window_offset] += syn0[c
1095 + last_word * layer1_size];
1096 cw++;
1097 }
1098 if (cw) {
1099 if (hs)
1100 for (d = 0; d < vocab[word].codelen; d++) {
1101 f = 0;
1102 l2 = vocab[word].point[d] * window_layer_size;
1103 // Propagate hidden -> output
1104 for (c = 0; c < window_layer_size; c++)
1105 f += neu1[c] * syn1_window[c + l2];
1106 if (f <= -MAX_EXP)
1107 continue;
1108 else if (f >= MAX_EXP)
1109 continue;
1110 else
1111 f = expTable[(int) ((f + MAX_EXP)
1112 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1113 // 'g' is the gradient multiplied by the learning rate
1114 g = (1 - vocab[word].code[d] - f) * alpha;
1115 // Propagate errors output -> hidden
1116 for (c = 0; c < window_layer_size; c++)
1117 neu1e[c] += g * syn1_window[c + l2];
1118 // Learn weights hidden -> output
1119 for (c = 0; c < window_layer_size; c++)
1120 syn1_window[c + l2] += g * neu1[c];
1121 if (cap == 1)
1122 for (c = 0; c < window_layer_size; c++)
1123 capParam(syn1_window, c + l2);
1124 }
1125 // NEGATIVE SAMPLING
1126 if (negative > 0)
1127 for (d = 0; d < negative + 1; d++) {
1128 if (d == 0) {
1129 target = word;
1130 label = 1;
1131 } else {
1132 next_random = next_random
1133 * (unsigned long long) 25214903917 + 11;
1134 if (word_to_group != NULL
1135 && word_to_group[word] != -1) {
1136 target = word;
1137 while (target == word) {
1138 target = group_to_table[word_to_group[word]
1139 * table_size
1140 + (next_random >> 16) % table_size];
1141 next_random = next_random
1142 * (unsigned long long) 25214903917
1143 + 11;
1144 }
1145 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1146 } else {
1147 target =
1148 table[(next_random >> 16) % table_size];
1149 }
1150 if (target == 0)
1151 target = next_random % (vocab_size - 1) + 1;
1152 if (target == word)
1153 continue;
1154 label = 0;
1155 }
1156 l2 = target * window_layer_size;
1157 f = 0;
1158 for (c = 0; c < window_layer_size; c++)
1159 f += neu1[c] * syn1neg_window[c + l2];
1160 if (f > MAX_EXP)
1161 g = (label - 1) * alpha;
1162 else if (f < -MAX_EXP)
1163 g = (label - 0) * alpha;
1164 else
1165 g = (label
1166 - expTable[(int) ((f + MAX_EXP)
1167 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1168 * alpha;
1169 for (c = 0; c < window_layer_size; c++)
1170 neu1e[c] += g * syn1neg_window[c + l2];
1171 for (c = 0; c < window_layer_size; c++)
1172 syn1neg_window[c + l2] += g * neu1[c];
1173 if (cap == 1)
1174 for (c = 0; c < window_layer_size; c++)
1175 capParam(syn1neg_window, c + l2);
1176 }
1177 // Noise Contrastive Estimation
1178 if (nce > 0)
1179 for (d = 0; d < nce + 1; d++) {
1180 if (d == 0) {
1181 target = word;
1182 label = 1;
1183 } else {
1184 next_random = next_random
1185 * (unsigned long long) 25214903917 + 11;
1186 if (word_to_group != NULL
1187 && word_to_group[word] != -1) {
1188 target = word;
1189 while (target == word) {
1190 target = group_to_table[word_to_group[word]
1191 * table_size
1192 + (next_random >> 16) % table_size];
1193 next_random = next_random
1194 * (unsigned long long) 25214903917
1195 + 11;
1196 }
1197 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1198 } else {
1199 target =
1200 table[(next_random >> 16) % table_size];
1201 }
1202 if (target == 0)
1203 target = next_random % (vocab_size - 1) + 1;
1204 if (target == word)
1205 continue;
1206 label = 0;
1207 }
1208 l2 = target * window_layer_size;
1209 f = 0;
1210 for (c = 0; c < window_layer_size; c++)
1211 f += neu1[c] * syn1nce_window[c + l2];
1212 if (f > MAX_EXP)
1213 g = (label - 1) * alpha;
1214 else if (f < -MAX_EXP)
1215 g = (label - 0) * alpha;
1216 else {
1217 f = exp(f);
1218 g =
1219 (label
1220 - f
1221 / (noise_distribution[target]
1222 * nce + f)) * alpha;
1223 }
1224 for (c = 0; c < window_layer_size; c++)
1225 neu1e[c] += g * syn1nce_window[c + l2];
1226 for (c = 0; c < window_layer_size; c++)
1227 syn1nce_window[c + l2] += g * neu1[c];
1228 if (cap == 1)
1229 for (c = 0; c < window_layer_size; c++)
1230 capParam(syn1nce_window, c + l2);
1231 }
1232 // hidden -> in
1233 for (a = 0; a < window * 2 + 1; a++)
1234 if (a != window) {
1235 c = sentence_position - window + a;
1236 if (c < 0)
1237 continue;
1238 if (c >= sentence_length)
1239 continue;
1240 last_word = sen[c];
1241 if (last_word == -1)
1242 continue;
1243 window_offset = a * layer1_size;
1244 if (a > window)
1245 window_offset -= layer1_size;
1246 for (c = 0; c < layer1_size; c++)
1247 syn0[c + last_word * layer1_size] += neu1e[c
1248 + window_offset];
1249 }
1250 }
1251 } else if (type == 3) { //train structured skip-gram
1252 for (a = 0; a < window * 2 + 1; a++)
1253 if (a != window) {
1254 c = sentence_position - window + a;
1255 if (c < 0)
1256 continue;
1257 if (c >= sentence_length)
1258 continue;
1259 last_word = sen[c];
1260 if (last_word == -1)
1261 continue;
1262 l1 = last_word * layer1_size;
1263 window_offset = a * layer1_size;
1264 if (a > window)
1265 window_offset -= layer1_size;
1266 for (c = 0; c < layer1_size; c++)
1267 neu1e[c] = 0;
1268 // HIERARCHICAL SOFTMAX
1269 if (hs)
1270 for (d = 0; d < vocab[word].codelen; d++) {
1271 f = 0;
1272 l2 = vocab[word].point[d] * window_layer_size;
1273 // Propagate hidden -> output
1274 for (c = 0; c < layer1_size; c++)
1275 f += syn0[c + l1]
1276 * syn1_window[c + l2 + window_offset];
1277 if (f <= -MAX_EXP)
1278 continue;
1279 else if (f >= MAX_EXP)
1280 continue;
1281 else
1282 f = expTable[(int) ((f + MAX_EXP)
1283 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1284 // 'g' is the gradient multiplied by the learning rate
1285 g = (1 - vocab[word].code[d] - f) * alpha;
1286 // Propagate errors output -> hidden
1287 for (c = 0; c < layer1_size; c++)
1288 neu1e[c] += g
1289 * syn1_window[c + l2 + window_offset];
1290 // Learn weights hidden -> output
1291 for (c = 0; c < layer1_size; c++)
1292 syn1_window[c + l2 + window_offset] += g
1293 * syn0[c + l1];
1294 if (cap == 1)
1295 for (c = 0; c < layer1_size; c++)
1296 capParam(syn1_window, c + l2 + window_offset);
1297 }
1298 // NEGATIVE SAMPLING
1299 if (negative > 0)
1300 for (d = 0; d < negative + 1; d++) {
1301 if (d == 0) {
1302 target = word;
1303 label = 1;
1304 } else {
1305 next_random = next_random
1306 * (unsigned long long) 25214903917 + 11;
1307 if (word_to_group != NULL
1308 && word_to_group[word] != -1) {
1309 target = word;
1310 while (target == word) {
1311 target =
1312 group_to_table[word_to_group[word]
1313 * table_size
1314 + (next_random >> 16)
1315 % table_size];
1316 next_random =
1317 next_random
1318 * (unsigned long long) 25214903917
1319 + 11;
1320 }
1321 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1322 } else {
1323 target = table[(next_random >> 16)
1324 % table_size];
1325 }
1326 if (target == 0)
1327 target = next_random % (vocab_size - 1) + 1;
1328 if (target == word)
1329 continue;
1330 label = 0;
1331 }
1332 l2 = target * window_layer_size;
1333 f = 0;
1334 for (c = 0; c < layer1_size; c++)
1335 f +=
1336 syn0[c + l1]
1337 * syn1neg_window[c + l2
1338 + window_offset];
1339 if (f > MAX_EXP)
1340 g = (label - 1) * alpha;
1341 else if (f < -MAX_EXP)
1342 g = (label - 0) * alpha;
1343 else
1344 g =
1345 (label
1346 - expTable[(int) ((f + MAX_EXP)
1347 * (EXP_TABLE_SIZE
1348 / MAX_EXP / 2))])
1349 * alpha;
1350 if(debug_mode > 2 && ((long long) id) == 0) {
1351 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1352 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1353 }
1354 for (c = 0; c < layer1_size; c++)
1355 neu1e[c] +=
1356 g
1357 * syn1neg_window[c + l2
1358 + window_offset];
1359 for (c = 0; c < layer1_size; c++)
1360 syn1neg_window[c + l2 + window_offset] += g
1361 * syn0[c + l1];
1362 if (cap == 1)
1363 for (c = 0; c < layer1_size; c++)
1364 capParam(syn1neg_window,
1365 c + l2 + window_offset);
1366 }
1367 // Noise Contrastive Estimation
1368 if (nce > 0)
1369 for (d = 0; d < nce + 1; d++) {
1370 if (d == 0) {
1371 target = word;
1372 label = 1;
1373 } else {
1374 next_random = next_random
1375 * (unsigned long long) 25214903917 + 11;
1376 if (word_to_group != NULL
1377 && word_to_group[word] != -1) {
1378 target = word;
1379 while (target == word) {
1380 target =
1381 group_to_table[word_to_group[word]
1382 * table_size
1383 + (next_random >> 16)
1384 % table_size];
1385 next_random =
1386 next_random
1387 * (unsigned long long) 25214903917
1388 + 11;
1389 }
1390 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1391 } else {
1392 target = table[(next_random >> 16)
1393 % table_size];
1394 }
1395 if (target == 0)
1396 target = next_random % (vocab_size - 1) + 1;
1397 if (target == word)
1398 continue;
1399 label = 0;
1400 }
1401 l2 = target * window_layer_size;
1402 f = 0;
1403 for (c = 0; c < layer1_size; c++)
1404 f +=
1405 syn0[c + l1]
1406 * syn1nce_window[c + l2
1407 + window_offset];
1408 if (f > MAX_EXP)
1409 g = (label - 1) * alpha;
1410 else if (f < -MAX_EXP)
1411 g = (label - 0) * alpha;
1412 else {
1413 f = exp(f);
1414 g = (label
1415 - f
1416 / (noise_distribution[target]
1417 * nce + f)) * alpha;
1418 }
1419 for (c = 0; c < layer1_size; c++)
1420 neu1e[c] +=
1421 g
1422 * syn1nce_window[c + l2
1423 + window_offset];
1424 for (c = 0; c < layer1_size; c++)
1425 syn1nce_window[c + l2 + window_offset] += g
1426 * syn0[c + l1];
1427 if (cap == 1)
1428 for (c = 0; c < layer1_size; c++)
1429 capParam(syn1nce_window,
1430 c + l2 + window_offset);
1431 }
1432 // Learn weights input -> hidden
1433 for (c = 0; c < layer1_size; c++) {
1434 syn0[c + l1] += neu1e[c];
1435 if (syn0[c + l1] > 50)
1436 syn0[c + l1] = 50;
1437 if (syn0[c + l1] < -50)
1438 syn0[c + l1] = -50;
1439 }
1440 }
1441 } else if (type == 4) { // train the SENNA-style architecture
1442 // in -> hidden
1443 cw = 0;
1444 for (a = 0; a < window * 2 + 1; a++)
1445 if (a != window) {
1446 c = sentence_position - window + a;
1447 if (c < 0)
1448 continue;
1449 if (c >= sentence_length)
1450 continue;
1451 last_word = sen[c];
1452 if (last_word == -1)
1453 continue;
1454 window_offset = a * layer1_size;
1455 if (a > window)
1456 window_offset -= layer1_size;
1457 for (c = 0; c < layer1_size; c++)
1458 neu1[c + window_offset] += syn0[c
1459 + last_word * layer1_size];
1460 cw++;
1461 }
1462 if (cw) {
1463 for (a = 0; a < window_hidden_size; a++) {
1464 c = a * window_layer_size;
1465 for (b = 0; b < window_layer_size; b++) {
1466 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1467 }
1468 }
1469 if (hs)
1470 for (d = 0; d < vocab[word].codelen; d++) {
1471 f = 0;
1472 l2 = vocab[word].point[d] * window_hidden_size;
1473 // Propagate hidden -> output
1474 for (c = 0; c < window_hidden_size; c++)
1475 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1476 if (f <= -MAX_EXP)
1477 continue;
1478 else if (f >= MAX_EXP)
1479 continue;
1480 else
1481 f = expTable[(int) ((f + MAX_EXP)
1482 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1483 // 'g' is the gradient multiplied by the learning rate
1484 g = (1 - vocab[word].code[d] - f) * alpha;
1485 // Propagate errors output -> hidden
1486 for (c = 0; c < window_hidden_size; c++)
1487 neu2e[c] += dHardTanh(neu2[c], g) * g
1488 * syn_hidden_word[c + l2];
1489 // Learn weights hidden -> output
1490 for (c = 0; c < window_hidden_size; c++)
1491 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1492 * neu2[c];
1493 }
1494 // NEGATIVE SAMPLING
1495 if (negative > 0)
1496 for (d = 0; d < negative + 1; d++) {
1497 if (d == 0) {
1498 target = word;
1499 label = 1;
1500 } else {
1501 next_random = next_random
1502 * (unsigned long long) 25214903917 + 11;
1503 if (word_to_group != NULL
1504 && word_to_group[word] != -1) {
1505 target = word;
1506 while (target == word) {
1507 target = group_to_table[word_to_group[word]
1508 * table_size
1509 + (next_random >> 16) % table_size];
1510 next_random = next_random
1511 * (unsigned long long) 25214903917
1512 + 11;
1513 }
1514 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1515 } else {
1516 target =
1517 table[(next_random >> 16) % table_size];
1518 }
1519 if (target == 0)
1520 target = next_random % (vocab_size - 1) + 1;
1521 if (target == word)
1522 continue;
1523 label = 0;
1524 }
1525 l2 = target * window_hidden_size;
1526 f = 0;
1527 for (c = 0; c < window_hidden_size; c++)
1528 f += hardTanh(neu2[c])
1529 * syn_hidden_word_neg[c + l2];
1530 if (f > MAX_EXP)
1531 g = (label - 1) * alpha / negative;
1532 else if (f < -MAX_EXP)
1533 g = (label - 0) * alpha / negative;
1534 else
1535 g = (label
1536 - expTable[(int) ((f + MAX_EXP)
1537 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1538 * alpha / negative;
1539 for (c = 0; c < window_hidden_size; c++)
1540 neu2e[c] += dHardTanh(neu2[c], g) * g
1541 * syn_hidden_word_neg[c + l2];
1542 for (c = 0; c < window_hidden_size; c++)
1543 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1544 * g * neu2[c];
1545 }
1546 for (a = 0; a < window_hidden_size; a++)
1547 for (b = 0; b < window_layer_size; b++)
1548 neu1e[b] += neu2e[a]
1549 * syn_window_hidden[a * window_layer_size + b];
1550 for (a = 0; a < window_hidden_size; a++)
1551 for (b = 0; b < window_layer_size; b++)
1552 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1553 * neu1[b];
1554 // hidden -> in
1555 for (a = 0; a < window * 2 + 1; a++)
1556 if (a != window) {
1557 c = sentence_position - window + a;
1558 if (c < 0)
1559 continue;
1560 if (c >= sentence_length)
1561 continue;
1562 last_word = sen[c];
1563 if (last_word == -1)
1564 continue;
1565 window_offset = a * layer1_size;
1566 if (a > window)
1567 window_offset -= layer1_size;
1568 for (c = 0; c < layer1_size; c++)
1569 syn0[c + last_word * layer1_size] += neu1e[c
1570 + window_offset];
1571 }
1572 }
1573 } else {
1574 printf("unknown type %i", type);
1575 exit(0);
1576 }
1577 sentence_position++;
1578 if (sentence_position >= sentence_length) {
1579 sentence_length = 0;
1580 continue;
1581 }
1582 }
1583 fclose(fi);
1584 free(neu1);
1585 free(neu1e);
1586 pthread_exit(NULL);
1587}
1588
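// For each word starting at rank cc, print the strongest collocate at every window position under
// the type-3 (structured skip-gram) model, plus summary statistics; requires a net read with -read-net.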
1589void ShowCollocations() {
1590 long a, b, c, d, window_offset, target, max_target=0, maxmax_target;
1591 real f, max_f, maxmax_f;
1592 real *target_sums;
1593 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1594
1595 for (d = cc; d < vocab_size; d++) {
1596 for (b = 0; b < vocab_size; b++)
1597 target_sums[b]=0;
1598 maxmax_f = -1;
1599 maxmax_target = 0;
1600 for (a = 0; a < window * 2 + 1; a++) {
1601 if (a != window) {
1602 max_f = -1;
1603 window_offset = a * layer1_size;
1604 if (a > window)
1605 window_offset -= layer1_size;
1606 for(target = 0; target < vocab_size; target ++) {
1607 if(target == d)
1608 continue;
1609 f = 0;
1610 for (c = 0; c < layer1_size; c++)
1611 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1612 if (f < -MAX_EXP)
1613 continue;
1614 else if (f > MAX_EXP)
1615 continue;
1616 else
1617 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1618 if(f > max_f) {
1619 max_f = f;
1620 max_target = target;
1621 }
1622 target_sums[target] += (1-target_sums[target]) * f;
1623 }
1624 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1625 if(max_f > maxmax_f) {
1626 maxmax_f = max_f;
1627 maxmax_target = max_target;
1628 }
1629 } else {
1630 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1631 }
1632 }
1633 max_f = -1;
1634 for (b = 0; b < vocab_size; b++) {
1635 if(target_sums[b] > max_f) {
1636 max_f = target_sums[b];
1637 max_target = b;
1638 }
1639 }
1640 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1641 vocab[max_target].word, max_f,
1642 vocab[maxmax_target].word, maxmax_f);
1643 }
1644}
1645
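// Top-level driver: build or read the vocabulary, initialize the net, optionally print collocations,
// run the training threads, then write word vectors or K-means classes.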
1646void TrainModel() {
1647 long a, b, c, d;
1648 FILE *fo;
1649 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1650 printf("Starting training using file %s\n", train_file);
1651 starting_alpha = alpha;
1652 if (read_vocab_file[0] != 0)
1653 ReadVocab();
1654 else
1655 LearnVocabFromTrainFile();
1656 if (save_vocab_file[0] != 0)
1657 SaveVocab();
1658 if (output_file[0] == 0)
1659 return;
1660 InitNet();
1661 if(cc > 0)
1662 ShowCollocations();
1663 if (negative > 0 || nce > 0)
1664 InitUnigramTable();
1665 if (negative_classes_file[0] != 0)
1666 InitClassUnigramTable();
1667 start = clock();
1668 for (a = 0; a < num_threads; a++)
1669 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1670 for (a = 0; a < num_threads; a++)
1671 pthread_join(pt[a], NULL);
1672 fo = fopen(output_file, "wb");
1673 if (classes == 0) {
1674 // Save the word vectors
1675 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1676 for (a = 0; a < vocab_size; a++) {
1677 fprintf(fo, "%s ", vocab[a].word);
1678 if (binary)
1679 for (b = 0; b < layer1_size; b++)
1680 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1681 else
1682 for (b = 0; b < layer1_size; b++)
1683 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1684 fprintf(fo, "\n");
1685 }
1686 } else {
1687 // Run K-means on the word vectors
1688 int clcn = classes, iter = 10, closeid;
1689 int *centcn = (int *) malloc(classes * sizeof(int));
1690 int *cl = (int *) calloc(vocab_size, sizeof(int));
1691 real closev, x;
1692 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1693 for (a = 0; a < vocab_size; a++)
1694 cl[a] = a % clcn;
1695 for (a = 0; a < iter; a++) {
1696 for (b = 0; b < clcn * layer1_size; b++)
1697 cent[b] = 0;
1698 for (b = 0; b < clcn; b++)
1699 centcn[b] = 1;
1700 for (c = 0; c < vocab_size; c++) {
1701 for (d = 0; d < layer1_size; d++)
1702 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1703 centcn[cl[c]]++;
1704 }
1705 for (b = 0; b < clcn; b++) {
1706 closev = 0;
1707 for (c = 0; c < layer1_size; c++) {
1708 cent[layer1_size * b + c] /= centcn[b];
1709 closev += cent[layer1_size * b + c]
1710 * cent[layer1_size * b + c];
1711 }
1712 closev = sqrt(closev);
1713 for (c = 0; c < layer1_size; c++)
1714 cent[layer1_size * b + c] /= closev;
1715 }
1716 for (c = 0; c < vocab_size; c++) {
1717 closev = -10;
1718 closeid = 0;
1719 for (d = 0; d < clcn; d++) {
1720 x = 0;
1721 for (b = 0; b < layer1_size; b++)
1722 x += cent[layer1_size * d + b]
1723 * syn0[c * layer1_size + b];
1724 if (x > closev) {
1725 closev = x;
1726 closeid = d;
1727 }
1728 }
1729 cl[c] = closeid;
1730 }
1731 }
1732 // Save the K-means classes
1733 for (a = 0; a < vocab_size; a++)
1734 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1735 free(centcn);
1736 free(cent);
1737 free(cl);
1738 }
1739 fclose(fo);
1740 if (save_net_file[0] != 0)
1741 SaveNet();
1742}
1743
1744int ArgPos(char *str, int argc, char **argv) {
1745 int a;
1746 for (a = 1; a < argc; a++)
1747 if (!strcmp(str, argv[a])) {
1748 if (a == argc - 1) {
1749 printf("Argument missing for %s\n", str);
1750 exit(1);
1751 }
1752 return a;
1753 }
1754 return -1;
1755}
1756
1757int main(int argc, char **argv) {
1758 int i;
1759 if (argc == 1) {
1760 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1761 printf("Options:\n");
1762 printf("Parameters for training:\n");
1763 printf("\t-train <file>\n");
1764 printf("\t\tUse text data from <file> to train the model\n");
1765 printf("\t-output <file>\n");
1766 printf(
1767 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1768 printf("\t-size <int>\n");
1769 printf("\t\tSet size of word vectors; default is 100\n");
1770 printf("\t-window <int>\n");
1771 printf("\t\tSet max skip length between words; default is 5\n");
1772 printf("\t-sample <float>\n");
1773 printf(
1774 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1775 printf(
1776 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1777 printf("\t-hs <int>\n");
1778 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1779 printf("\t-negative <int>\n");
1780 printf(
1781 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1782 printf("\t-negative-classes <file>\n");
1783 printf("\t\tNegative classes to sample from\n");
1784 printf("\t-nce <int>\n");
1785 printf(
1786 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1787 printf("\t-threads <int>\n");
1788 printf("\t\tUse <int> threads (default 12)\n");
1789 printf("\t-iter <int>\n");
1790 printf("\t\tRun more training iterations (default 5)\n");
1791 printf("\t-min-count <int>\n");
1792 printf(
1793 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1794 printf("\t-alpha <float>\n");
1795 printf(
1796 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1797 printf("\t-classes <int>\n");
1798 printf(
1799 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1800 printf("\t-debug <int>\n");
1801 printf(
1802 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1803 printf("\t-binary <int>\n");
1804 printf(
1805 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1806 printf("\t-save-vocab <file>\n");
1807 printf("\t\tThe vocabulary will be saved to <file>\n");
1808 printf("\t-read-vocab <file>\n");
1809 printf(
1810 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1811 printf("\t-read-net <file>\n");
1812 printf(
1813 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1814 printf("\t-save-net <file>\n");
1815 printf("\t\tThe net parameters will be saved to <file>\n");
1816 printf("\t-show-cc <int>\n");
1817 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
1818 printf("\t-type <int>\n");
1819 printf(
1820 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
1821 printf("\t-cap <int>\n");
1822 printf(
1823 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1824 printf("\nExamples:\n");
1825 printf(
1826 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1827 return 0;
1828 }
1829 output_file[0] = 0;
1830 save_vocab_file[0] = 0;
1831 read_vocab_file[0] = 0;
1832 save_net_file[0] = 0;
1833 read_net_file[0] = 0;
1834 negative_classes_file[0] = 0;
1835 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1836 layer1_size = atoi(argv[i + 1]);
1837 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1838 strcpy(train_file, argv[i + 1]);
1839 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1840 strcpy(save_vocab_file, argv[i + 1]);
1841 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1842 strcpy(read_vocab_file, argv[i + 1]);
1843 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1844 strcpy(save_net_file, argv[i + 1]);
1845 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1846 strcpy(read_net_file, argv[i + 1]);
1847 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1848 debug_mode = atoi(argv[i + 1]);
1849 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1850 binary = atoi(argv[i + 1]);
1851 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
1852 cc = atoi(argv[i + 1]);
1853 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1854 type = atoi(argv[i + 1]);
1855 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1856 strcpy(output_file, argv[i + 1]);
1857 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1858 window = atoi(argv[i + 1]);
1859 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1860 sample = atof(argv[i + 1]);
1861 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1862 hs = atoi(argv[i + 1]);
1863 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1864 negative = atoi(argv[i + 1]);
1865 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1866 strcpy(negative_classes_file, argv[i + 1]);
1867 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1868 nce = atoi(argv[i + 1]);
1869 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1870 num_threads = atoi(argv[i + 1]);
1871 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1872 iter = atoi(argv[i + 1]);
1873 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1874 min_count = atoi(argv[i + 1]);
1875 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1876 classes = atoi(argv[i + 1]);
1877 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1878 cap = atoi(argv[i + 1]);
1879 if (type == 0 || type == 2 || type == 4)
1880 alpha = 0.05;
1881 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1882 alpha = atof(argv[i + 1]);
1883 vocab = (struct vocab_word *) calloc(vocab_max_size,
1884 sizeof(struct vocab_word));
1885 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1886 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1887 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1888 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1889 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1890 }
1891 TrainModel();
1892 return 0;
1893}
1894