Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
21#include <pthread.h>
Marc Kupietz613edbf2018-01-11 21:38:03 +010022#include <collocatordb.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010023
24#define MAX_STRING 100
25#define EXP_TABLE_SIZE 1000
26#define MAX_EXP 6
27#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010028#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010029#define MAX_CODE_LENGTH 40
30
31const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
32
33typedef float real; // Precision of float numbers
34
35struct vocab_word {
36 long long cn;
37 int *point;
38 char *word, *code, codelen;
39};
40
41char train_file[MAX_STRING], output_file[MAX_STRING];
42char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
43char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
Marc Kupietze423f732017-12-22 17:57:03 +010044char magic_stop_file[MAX_STRING];
45
Marc Kupietzd6f9c712016-03-16 11:50:56 +010046struct vocab_word *vocab;
47int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietzc2731b22016-07-14 08:56:14 +020048 num_threads = 12, min_reduce = 1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010049int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020050long long *threadPos;
51int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010052long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
53long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
54 classes = 0;
55real alpha = 0.025, starting_alpha, sample = 1e-3;
56real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020057real avgWordLength=0;
Marc Kupietzb366bcd2018-01-11 21:29:41 +010058clock_t start, start_clock;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010059
60real *syn1_window, *syn1neg_window, *syn1nce_window;
61int w_offset, window_layer_size;
62
63int window_hidden_size = 500;
64real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
65 *syn_hidden_word_nce;
66
67int hs = 0, negative = 5;
68const int table_size = 1e8;
69int *table;
70
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010071long cc = 0;
72
Marc Kupietzd6f9c712016-03-16 11:50:56 +010073//contrastive negative sampling
74char negative_classes_file[MAX_STRING];
75int *word_to_group;
76int *group_to_table; //group_size*table_size
77int class_number;
78
79//nce
80real* noise_distribution;
81int nce = 0;
82
83//param caps
84real CAP_VALUE = 50;
85int cap = 0;
86
Marc Kupietz613edbf2018-01-11 21:38:03 +010087COLLOCATORDB *cdb = NULL;
88
Marc Kupietzd6f9c712016-03-16 11:50:56 +010089void capParam(real* array, int index) {
90 if (array[index] > CAP_VALUE)
91 array[index] = CAP_VALUE;
92 else if (array[index] < -CAP_VALUE)
93 array[index] = -CAP_VALUE;
94}
95
96real hardTanh(real x) {
97 if (x >= 1) {
98 return 1;
99 } else if (x <= -1) {
100 return -1;
101 } else {
102 return x;
103 }
104}
105
106real dHardTanh(real x, real g) {
107 if (x > 1 && g > 0) {
108 return 0;
109 }
110 if (x < -1 && g < 0) {
111 return 0;
112 }
113 return 1;
114}
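// Note: hardTanh/dHardTanh are the clipped activation and its gradient gate used by
// the senna-style architecture (type 4): hardTanh clamps activations to [-1, 1], and
// dHardTanh zeroes the gradient only when the unit is saturated AND the gradient would
// push it further out of range. capParam above is the analogous clamp for single
// weights (to +/-CAP_VALUE) when cap == 1.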
115
116void InitUnigramTable() {
117 int a, i;
118 long long train_words_pow = 0;
119 real d1, power = 0.75;
120 table = (int *) malloc(table_size * sizeof(int));
121 for (a = 0; a < vocab_size; a++)
122 train_words_pow += pow(vocab[a].cn, power);
123 i = 0;
124 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
125 for (a = 0; a < table_size; a++) {
126 table[a] = i;
127 if (a / (real) table_size > d1) {
128 i++;
129 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
130 }
131 if (i >= vocab_size)
132 i = vocab_size - 1;
133 }
134
135 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
136 for (a = 0; a < vocab_size; a++)
137 noise_distribution[a] = pow(vocab[a].cn, power)
138 / (real) train_words_pow;
139}
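// Note: the unigram table drives negative sampling. A word w occupies a share of the
// table_size (1e8) slots proportional to cn(w)^0.75, i.e. sampling probability
// P(w) ~ cn(w)^0.75 / sum_v cn(v)^0.75. The 0.75 exponent flattens the distribution:
// for two words with counts 9 and 1, the raw frequencies 0.9/0.1 become roughly
// 0.84/0.16. noise_distribution stores the same probabilities for use by NCE.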
140
141// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
142void ReadWord(char *word, FILE *fin) {
143 int a = 0, ch;
144 while (!feof(fin)) {
145 ch = fgetc(fin);
146 if (ch == 13)
147 continue;
148 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
149 if (a > 0) {
150 if (ch == '\n')
151 ungetc(ch, fin);
152 break;
153 }
154 if (ch == '\n') {
155 strcpy(word, (char *) "</s>");
156 return;
157 } else
158 continue;
159 }
160 word[a] = ch;
161 a++;
162 if (a >= MAX_STRING - 1)
163 a--; // Truncate words that are too long
164 }
165 word[a] = 0;
166}
167
168// Returns hash value of a word
169int GetWordHash(char *word) {
170 unsigned long long a, hash = 0;
171 for (a = 0; a < strlen(word); a++)
172 hash = hash * 257 + word[a];
173 hash = hash % vocab_hash_size;
174 return hash;
175}
176
177// Returns position of a word in the vocabulary; if the word is not found, returns -1
178int SearchVocab(char *word) {
179 unsigned int hash = GetWordHash(word);
180 while (1) {
181 if (vocab_hash[hash] == -1)
182 return -1;
183 if (!strcmp(word, vocab[vocab_hash[hash]].word))
184 return vocab_hash[hash];
185 hash = (hash + 1) % vocab_hash_size;
186 }
187 return -1;
188}
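// Note: vocabulary lookup uses a simple polynomial rolling hash (base 257, modulo
// vocab_hash_size) plus open addressing: on a collision, SearchVocab and
// AddWordToVocab probe linearly (hash + 1) until they hit the word or an empty
// slot (-1). This is why vocab_hash_size (30M) is kept well above the intended
// maximum of ~21M distinct words (a 0.7 load factor).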
189
190// Reads a word and returns its index in the vocabulary
191int ReadWordIndex(FILE *fin) {
192 char word[MAX_STRING];
193 ReadWord(word, fin);
194 if (feof(fin))
195 return -1;
196 return SearchVocab(word);
197}
198
199// Adds a word to the vocabulary
200int AddWordToVocab(char *word) {
201 unsigned int hash, length = strlen(word) + 1;
202 if (length > MAX_STRING)
203 length = MAX_STRING;
204 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
205 strcpy(vocab[vocab_size].word, word);
206 vocab[vocab_size].cn = 0;
207 vocab_size++;
208 // Reallocate memory if needed
209 if (vocab_size + 2 >= vocab_max_size) {
210 vocab_max_size += 1000;
211 vocab = (struct vocab_word *) realloc(vocab,
212 vocab_max_size * sizeof(struct vocab_word));
213 }
214 hash = GetWordHash(word);
215 while (vocab_hash[hash] != -1)
216 hash = (hash + 1) % vocab_hash_size;
217 vocab_hash[hash] = vocab_size - 1;
218 return vocab_size - 1;
219}
220
221// Used later for sorting by word counts
222int VocabCompare(const void *a, const void *b) {
„feldmueller“7f1fc332024-10-21 18:05:57 +0200223 long long freq1 = ((struct vocab_word *) a)->cn;
224 long long freq2 = ((struct vocab_word *) b)->cn;
225 if (freq1 < freq2) return 1;
226 else if (freq1 > freq2) return -1;
227 else return 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100228}
229
230// Sorts the vocabulary by frequency using word counts
231void SortVocab() {
232 int a, size;
233 unsigned int hash;
234 // Sort the vocabulary and keep </s> at the first position
235 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
236 for (a = 0; a < vocab_hash_size; a++)
237 vocab_hash[a] = -1;
238 size = vocab_size;
239 train_words = 0;
240 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200241 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100242 // Words occurring fewer than min_count times will be discarded from the vocab
243 if ((vocab[a].cn < min_count) && (a != 0)) {
244 vocab_size--;
245 free(vocab[a].word);
246 } else {
247 // Hash will be re-computed, as it is no longer valid after sorting
248 hash = GetWordHash(vocab[a].word);
249 while (vocab_hash[hash] != -1)
250 hash = (hash + 1) % vocab_hash_size;
251 vocab_hash[hash] = a;
252 train_words += vocab[a].cn;
253 }
254 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200255 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100256 vocab = (struct vocab_word *) realloc(vocab,
257 (vocab_size + 1) * sizeof(struct vocab_word));
258 // Allocate memory for the binary tree construction
259 for (a = 0; a < vocab_size; a++) {
260 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
261 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
262 }
263}
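// Note: SortVocab orders words by descending count (keeping </s> at index 0), drops
// words with cn < min_count, rebuilds the hash table, and accumulates avgWordLength
// as the count-weighted average of strlen(word)+1 (the +1 accounts for the separating
// whitespace). ReadVocab later divides the training file size by this average to
// estimate the number of tokens.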
264
265// Reduces the vocabulary by removing infrequent tokens
266void ReduceVocab() {
267 int a, b = 0;
268 unsigned int hash;
269 for (a = 0; a < vocab_size; a++)
270 if (vocab[a].cn > min_reduce) {
271 vocab[b].cn = vocab[a].cn;
272 vocab[b].word = vocab[a].word;
273 b++;
274 } else
275 free(vocab[a].word);
276 vocab_size = b;
277 for (a = 0; a < vocab_hash_size; a++)
278 vocab_hash[a] = -1;
279 for (a = 0; a < vocab_size; a++) {
280 // Hash will be re-computed, as it is no longer valid
281 hash = GetWordHash(vocab[a].word);
282 while (vocab_hash[hash] != -1)
283 hash = (hash + 1) % vocab_hash_size;
284 vocab_hash[hash] = a;
285 }
286 fflush(stdout);
287 min_reduce++;
288}
289
290// Create binary Huffman tree using the word counts
291// Frequent words will have short unique binary codes
292void CreateBinaryTree() {
293 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
294 char code[MAX_CODE_LENGTH];
295 long long *count = (long long *) calloc(vocab_size * 2 + 1,
296 sizeof(long long));
297 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
298 sizeof(long long));
299 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
300 sizeof(long long));
301 for (a = 0; a < vocab_size; a++)
302 count[a] = vocab[a].cn;
303 for (a = vocab_size; a < vocab_size * 2; a++)
304 count[a] = 1e15;
305 pos1 = vocab_size - 1;
306 pos2 = vocab_size;
307 // The following algorithm constructs the Huffman tree by adding one node at a time
308 for (a = 0; a < vocab_size - 1; a++) {
309 // First, find two smallest nodes 'min1, min2'
310 if (pos1 >= 0) {
311 if (count[pos1] < count[pos2]) {
312 min1i = pos1;
313 pos1--;
314 } else {
315 min1i = pos2;
316 pos2++;
317 }
318 } else {
319 min1i = pos2;
320 pos2++;
321 }
322 if (pos1 >= 0) {
323 if (count[pos1] < count[pos2]) {
324 min2i = pos1;
325 pos1--;
326 } else {
327 min2i = pos2;
328 pos2++;
329 }
330 } else {
331 min2i = pos2;
332 pos2++;
333 }
334 count[vocab_size + a] = count[min1i] + count[min2i];
335 parent_node[min1i] = vocab_size + a;
336 parent_node[min2i] = vocab_size + a;
337 binary[min2i] = 1;
338 }
339 // Now assign binary code to each vocabulary word
340 for (a = 0; a < vocab_size; a++) {
341 b = a;
342 i = 0;
343 while (1) {
344 code[i] = binary[b];
345 point[i] = b;
346 i++;
347 b = parent_node[b];
348 if (b == vocab_size * 2 - 2)
349 break;
350 }
351 vocab[a].codelen = i;
352 vocab[a].point[0] = vocab_size - 2;
353 for (b = 0; b < i; b++) {
354 vocab[a].code[i - b - 1] = code[b];
355 vocab[a].point[i - b] = point[b] - vocab_size;
356 }
357 }
358 free(count);
359 free(binary);
360 free(parent_node);
361}
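// Note: this is the standard word2vec Huffman construction. count[] holds the leaf
// counts followed by inner-node counts, and the two cheapest remaining nodes are
// merged vocab_size-1 times. Afterwards vocab[a].code is the bit path from the root
// (one 0/1 per inner node) and vocab[a].point lists the inner-node indices on that
// path, so frequent words get codes of only a few bits while rare words may use up
// to MAX_CODE_LENGTH (40) bits.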
362
363void LearnVocabFromTrainFile() {
364 char word[MAX_STRING];
365 FILE *fin;
366 long long a, i;
367 for (a = 0; a < vocab_hash_size; a++)
368 vocab_hash[a] = -1;
369 fin = fopen(train_file, "rb");
370 if (fin == NULL) {
371 printf("ERROR: training data file not found!\n");
372 exit(1);
373 }
374 vocab_size = 0;
375 AddWordToVocab((char *) "</s>");
376 while (1) {
377 ReadWord(word, fin);
378 if (feof(fin))
379 break;
380 train_words++;
381 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
382 printf("%lldK%c", train_words / 1000, 13);
383 fflush(stdout);
384 }
385 i = SearchVocab(word);
386 if (i == -1) {
387 a = AddWordToVocab(word);
388 vocab[a].cn = 1;
389 } else
390 vocab[i].cn++;
391 if (vocab_size > vocab_hash_size * 0.7)
392 ReduceVocab();
393 }
394 SortVocab();
395 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200396 printf("Vocab size: %'lld\n", vocab_size);
397 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100398 }
399 file_size = ftell(fin);
400 fclose(fin);
401}
402
403void SaveVocab() {
404 long long i;
405 FILE *fo = fopen(save_vocab_file, "wb");
406 for (i = 0; i < vocab_size; i++)
407 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
408 fclose(fo);
409}
410
411void ReadVocab() {
412 long long a, i = 0;
413 char c;
414 char word[MAX_STRING];
415 FILE *fin = fopen(read_vocab_file, "rb");
416 if (fin == NULL) {
417 printf("Vocabulary file not found\n");
418 exit(1);
419 }
420 for (a = 0; a < vocab_hash_size; a++)
421 vocab_hash[a] = -1;
422 vocab_size = 0;
423 while (1) {
424 ReadWord(word, fin);
425 if (feof(fin))
426 break;
427 a = AddWordToVocab(word);
428 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
429 i++;
430 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200431 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100432 fin = fopen(train_file, "rb");
433 if (fin == NULL) {
434 printf("ERROR: training data file not found!\n");
435 exit(1);
436 }
437 fseek(fin, 0, SEEK_END);
438 file_size = ftell(fin);
439 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200440 SortVocab();
441 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200442 printf("Vocab size: %'lld\n", vocab_size);
443 printf("Words in vocab's train file: %'lld\n", train_words);
444 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200445 }
Marc Kupietze23c5402016-07-14 11:10:09 +0200446 train_words = file_size / avgWordLength;
447 if(debug_mode > 0)
448 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100449}
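// Note: when the vocabulary is read from a file rather than learned, the exact token
// count of the training corpus is unknown, so it is estimated as
// file_size / avgWordLength. The estimate only steers the learning-rate schedule and
// the progress display; it does not need to be exact.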
450
451void InitClassUnigramTable() {
452 long long a, c;
453 printf("loading class unigrams \n");
454 FILE *fin = fopen(negative_classes_file, "rb");
455 if (fin == NULL) {
456 printf("ERROR: class file not found!\n");
457 exit(1);
458 }
459 word_to_group = (int *) malloc(vocab_size * sizeof(int));
460 for (a = 0; a < vocab_size; a++)
461 word_to_group[a] = -1;
462 char class[MAX_STRING];
463 char prev_class[MAX_STRING];
464 prev_class[0] = 0;
465 char word[MAX_STRING];
466 class_number = -1;
467 while (1) {
468 if (feof(fin))
469 break;
470 ReadWord(class, fin);
471 ReadWord(word, fin);
472 int word_index = SearchVocab(word);
473 if (word_index != -1) {
474 if (strcmp(class, prev_class) != 0) {
475 class_number++;
476 strcpy(prev_class, class);
477 }
478 word_to_group[word_index] = class_number;
479 }
480 ReadWord(word, fin);
481 }
482 class_number++;
483 fclose(fin);
484
485 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
486 long long train_words_pow = 0;
487 real d1, power = 0.75;
488
489 for (c = 0; c < class_number; c++) {
490 long long offset = c * table_size;
491 train_words_pow = 0;
492 for (a = 0; a < vocab_size; a++)
493 if (word_to_group[a] == c)
494 train_words_pow += pow(vocab[a].cn, power);
495 int i = 0;
496 while (word_to_group[i] != c && i < vocab_size)
497 i++;
498 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
499 for (a = 0; a < table_size; a++) {
500 //printf("index %lld , word %d\n", a, i);
501 group_to_table[offset + a] = i;
502 if (a / (real) table_size > d1) {
503 i++;
504 while (word_to_group[i] != c && i < vocab_size)
505 i++;
506 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
507 }
508 if (i >= vocab_size)
509 while (word_to_group[i] != c && i >= 0)
510 i--;
511 }
512 }
513}
514
Marc Kupietz61485ad2023-12-22 16:16:59 +0100515void SaveArgs(unsigned int argc, char **argv) {
Marc Kupietz210b9d52016-04-02 21:48:13 +0200516 unsigned int i;
Marc Kupietz44136742017-12-22 17:52:56 +0100517 char args_file[MAX_STRING];
518 strcpy(args_file, output_file);
Marc Kupietz210b9d52016-04-02 21:48:13 +0200519 strcat(args_file, ".args");
520 FILE *fargs = fopen(args_file, "w");
521 if (fargs == NULL) {
522 printf("Cannot save args to %s.\n", args_file);
523 return;
524 }
525
Marc Kupietz44136742017-12-22 17:52:56 +0100526 for(i=1; i<argc; i++)
527 fprintf(fargs, "%s ", argv[i]);
528
529 fprintf(fargs, "\n");
Marc Kupietz210b9d52016-04-02 21:48:13 +0200530 fclose(fargs);
Marc Kupietz44136742017-12-22 17:52:56 +0100531
Marc Kupietz210b9d52016-04-02 21:48:13 +0200532 return;
533}
534
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100535void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100536 if(type != 3 || negative <= 0) {
537 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
538 return;
539 }
540
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100541 FILE *fnet = fopen(save_net_file, "wb");
542 if (fnet == NULL) {
543 printf("Net parameter file not found\n");
544 exit(1);
545 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100546 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100547 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100548 fclose(fnet);
549}
550
551void InitNet() {
552 long long a, b;
553 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100554 long long read;
555
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100556 window_layer_size = layer1_size * window * 2;
557 a = posix_memalign((void **) &syn0, 128,
558 (long long) vocab_size * layer1_size * sizeof(real));
559 if (syn0 == NULL) {
560 printf("Memory allocation failed\n");
561 exit(1);
562 }
563
564 if (hs) {
565 a = posix_memalign((void **) &syn1, 128,
566 (long long) vocab_size * layer1_size * sizeof(real));
567 if (syn1 == NULL) {
568 printf("Memory allocation failed\n");
569 exit(1);
570 }
571 a = posix_memalign((void **) &syn1_window, 128,
572 (long long) vocab_size * window_layer_size * sizeof(real));
573 if (syn1_window == NULL) {
574 printf("Memory allocation failed\n");
575 exit(1);
576 }
577 a = posix_memalign((void **) &syn_hidden_word, 128,
578 (long long) vocab_size * window_hidden_size * sizeof(real));
579 if (syn_hidden_word == NULL) {
580 printf("Memory allocation failed\n");
581 exit(1);
582 }
583
584 for (a = 0; a < vocab_size; a++)
585 for (b = 0; b < layer1_size; b++)
586 syn1[a * layer1_size + b] = 0;
587 for (a = 0; a < vocab_size; a++)
588 for (b = 0; b < window_layer_size; b++)
589 syn1_window[a * window_layer_size + b] = 0;
590 for (a = 0; a < vocab_size; a++)
591 for (b = 0; b < window_hidden_size; b++)
592 syn_hidden_word[a * window_hidden_size + b] = 0;
593 }
594 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100595 if(type == 0) {
596 a = posix_memalign((void **) &syn1neg, 128,
597 (long long) vocab_size * layer1_size * sizeof(real));
598 if (syn1neg == NULL) {
599 printf("Memory allocation failed\n");
600 exit(1);
601 }
602 for (a = 0; a < vocab_size; a++)
603 for (b = 0; b < layer1_size; b++)
604 syn1neg[a * layer1_size + b] = 0;
605 } else if (type == 3) {
606 a = posix_memalign((void **) &syn1neg_window, 128,
607 (long long) vocab_size * window_layer_size * sizeof(real));
608 if (syn1neg_window == NULL) {
609 printf("Memory allocation failed\n");
610 exit(1);
611 }
612 for (a = 0; a < vocab_size; a++)
613 for (b = 0; b < window_layer_size; b++)
614 syn1neg_window[a * window_layer_size + b] = 0;
615 } else if (type == 4) {
616 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
617 (long long) vocab_size * window_hidden_size * sizeof(real));
618 if (syn_hidden_word_neg == NULL) {
619 printf("Memory allocation failed\n");
620 exit(1);
621 }
622 for (a = 0; a < vocab_size; a++)
623 for (b = 0; b < window_hidden_size; b++)
624 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100625 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100626 }
627 if (nce > 0) {
628 a = posix_memalign((void **) &syn1nce, 128,
629 (long long) vocab_size * layer1_size * sizeof(real));
630 if (syn1nce == NULL) {
631 printf("Memory allocation failed\n");
632 exit(1);
633 }
634 a = posix_memalign((void **) &syn1nce_window, 128,
635 (long long) vocab_size * window_layer_size * sizeof(real));
636 if (syn1nce_window == NULL) {
637 printf("Memory allocation failed\n");
638 exit(1);
639 }
640 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
641 (long long) vocab_size * window_hidden_size * sizeof(real));
642 if (syn_hidden_word_nce == NULL) {
643 printf("Memory allocation failed\n");
644 exit(1);
645 }
646
647 for (a = 0; a < vocab_size; a++)
648 for (b = 0; b < layer1_size; b++)
649 syn1nce[a * layer1_size + b] = 0;
650 for (a = 0; a < vocab_size; a++)
651 for (b = 0; b < window_layer_size; b++)
652 syn1nce_window[a * window_layer_size + b] = 0;
653 for (a = 0; a < vocab_size; a++)
654 for (b = 0; b < window_hidden_size; b++)
655 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
656 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100657
Marc Kupietz1006a272016-03-16 15:50:20 +0100658 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100659 a = posix_memalign((void **) &syn_window_hidden, 128,
660 window_hidden_size * window_layer_size * sizeof(real));
661 if (syn_window_hidden == NULL) {
662 printf("Memory allocation failed\n");
663 exit(1);
664 }
665 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
666 next_random = next_random * (unsigned long long) 25214903917 + 11;
667 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
668 - 0.5) / (window_hidden_size * window_layer_size);
669 }
670 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100671
672 if (read_net_file[0] == 0) {
673 for (a = 0; a < vocab_size; a++)
674 for (b = 0; b < layer1_size; b++) {
675 next_random = next_random * (unsigned long long) 25214903917
676 + 11;
677 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
678 / (real) 65536) - 0.5) / layer1_size;
679 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100680 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100681 FILE *fnet = fopen(read_net_file, "rb");
682 if (fnet == NULL) {
683 printf("Net parameter file not found\n");
684 exit(1);
685 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100686 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
687 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
688 if(read != vocab_size * layer1_size) {
689 fprintf(stderr, "read-net failed %lld\n", read);
690 exit(-1);
691 }
692 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
693 if(read != (long long) vocab_size * window_layer_size) {
694 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
695 (long long) sizeof(real) * vocab_size * window_layer_size);
696 exit(-1);
697 }
698 fgetc(fnet);
699 if(!feof(fnet)) {
700 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
701 exit(-1);
702 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100703 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100704 } else {
705 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
706 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100707 }
708
709 CreateBinaryTree();
710}
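// Note: syn0 (the input embeddings) is initialised uniformly in
// (-0.5/layer1_size, 0.5/layer1_size] using the same LCG as the original word2vec
// (next_random = next_random * 25214903917 + 11), while all output matrices
// (syn1*, syn_hidden_*) start at zero. For the window architectures the output side
// has window_layer_size = 2 * window * layer1_size columns per word, one
// layer1_size-wide block per relative position. read-net/save-net bypass this
// initialisation, but only for type 3 with negative sampling.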
711
Marc Kupietz202723e2016-07-14 09:12:00 +0200712char *currentDateTime(char *buf, real offset) {
713 time_t t;
714 time(&t);
715 t += (long) offset;
716 struct tm tstruct;
717 tstruct = *localtime(&t);
718 strftime(buf, 80, "%c", &tstruct);
719 return buf;
720}
721
722void *MonitorThread(void *id) {
723 char *timebuf = malloc(80);;
724 int i, n=num_threads;
725 long long sum;
726 sleep(1);
727 while(n > 0) {
728 sleep(1);
729 sum = n = 0;
730 for(i=0; i < num_threads; i++) {
731 if(threadPos[i] >= 0) {
732 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
733 n++;
734 } else {
735 sum += iter * file_size / num_threads;
736 }
737 }
738 if(n == 0)
739 break;
740 real finished_portion = (real) sum / (float) (file_size * iter);
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100741 long long now = time(NULL);
742 long long elapsed = (now - start);
743 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
Marc Kupietz202723e2016-07-14 09:12:00 +0200744
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100745 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
Marc Kupietz202723e2016-07-14 09:12:00 +0200746 alpha,
747 finished_portion * 100,
Marc Kupietzb366bcd2018-01-11 21:29:41 +0100748 (float) sum / elapsed / 1000,
Marc Kupietz202723e2016-07-14 09:12:00 +0200749 elapsed,
750 ttg,
751 currentDateTime(timebuf, ttg)
752 );
753 fflush(stdout);
754 }
755 pthread_exit(NULL);
756}
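// Note: the monitor thread estimates overall progress from threadPos/threadIters.
// For each worker it adds the bytes of already finished iterations
// ((iter - threadIters[i]) * file_size / num_threads) plus the offset inside the
// current slice, and divides the sum by file_size * iter. TE (time elapsed),
// TTG (time to go) and ETA are then a simple linear extrapolation of wall-clock time.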
757
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100758void *TrainModelThread(void *id) {
759 long long a, b, d, cw, word, last_word, sentence_length = 0,
760 sentence_position = 0;
761 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
762 long long l1, l2, c, target, label, local_iter = iter;
763 unsigned long long next_random = (long long) id;
764 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100765 int input_len_1 = layer1_size;
766 int window_offset = -1;
767 if (type == 2 || type == 4) {
768 input_len_1 = window_layer_size;
769 }
770 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
771 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200772 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100773
774 int input_len_2 = 0;
775 if (type == 4) {
776 input_len_2 = window_hidden_size;
777 }
778 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
779 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
780
781 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200782 long long start_pos = file_size / (long long) num_threads * (long long) id;
783 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
784 long long current_pos = start_pos;
785 long long last_pos = start_pos;
786 fseek(fi, start_pos, SEEK_SET);
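	// Each thread works on its own byte slice [start_pos, end_pos) of the training
	// file. Note: fseek() will usually land in the middle of a word; as in the
	// original word2vec this is tolerated, since at most one token per slice
	// boundary is read incorrectly.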
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100787 while (1) {
Marc Kupietz202723e2016-07-14 09:12:00 +0200788 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100789 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200790 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100791 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100792 alpha = starting_alpha
793 * (1 - word_count_actual / (real) (iter * train_words + 1));
794 if (alpha < starting_alpha * 0.0001)
795 alpha = starting_alpha * 0.0001;
796 }
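		// Note: the learning rate decays linearly with progress, as in the original
		// word2vec:
		//   alpha = starting_alpha * (1 - word_count_actual / (iter * train_words + 1)),
		// floored at 0.0001 * starting_alpha.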
797 if (sentence_length == 0) {
798 while (1) {
799 word = ReadWordIndex(fi);
800 if (feof(fi))
801 break;
802 if (word == -1)
803 continue;
804 word_count++;
805 if (word == 0)
806 break;
807 // The subsampling randomly discards frequent words while keeping the ranking same
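				// Note: with t = sample and f = cn/train_words (the word's relative
				// frequency), the formula below keeps an occurrence with probability
				//   p_keep = sqrt(t/f) + t/f   (capped at 1).
				// Example: t = 1e-3 and a word covering 1% of the corpus gives
				// sqrt(0.1) + 0.1 ~ 0.42, so roughly 58% of its occurrences are skipped.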
808 if (sample > 0) {
809 real ran = (sqrt(vocab[word].cn / (sample * train_words))
810 + 1) * (sample * train_words) / vocab[word].cn;
811 next_random = next_random * (unsigned long long) 25214903917
812 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100813 if (ran < (next_random & 0xFFFF) / (real) 65536) {
814 if(type == 3) // in structured skipgrams
815 word = -2; // keep the window position correct
816 else
817 continue;
818 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100819 }
820 sen[sentence_length] = word;
821 sentence_length++;
822 if (sentence_length >= MAX_SENTENCE_LENGTH)
823 break;
824 }
825 sentence_position = 0;
826 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200827 current_pos = threadPos[(long) id] = ftell(fi);
828 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100829 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200830 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100831 local_iter--;
832 if (local_iter == 0)
833 break;
Marc Kupietze423f732017-12-22 17:57:03 +0100834 if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
835 printf("Magic stop file %s found. Stopping traing ...\n", magic_stop_file);
836 break;
837 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100838 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200839 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100840 last_word_count = 0;
841 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200842 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100843 continue;
844 }
845 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200846 while (word == -2 && sentence_position<sentence_length)
847 word = sen[++sentence_position];
848 if (sentence_position>=sentence_length) {
849 sentence_length=0;
850 continue;
851 }
852 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100853 continue;
854 for (c = 0; c < input_len_1; c++)
855 neu1[c] = 0;
856 for (c = 0; c < input_len_1; c++)
857 neu1e[c] = 0;
858 for (c = 0; c < input_len_2; c++)
859 neu2[c] = 0;
860 for (c = 0; c < input_len_2; c++)
861 neu2e[c] = 0;
862 next_random = next_random * (unsigned long long) 25214903917 + 11;
863 b = next_random % window;
864 if (type == 0) { //train the cbow architecture
865 // in -> hidden
866 cw = 0;
867 for (a = b; a < window * 2 + 1 - b; a++)
868 if (a != window) {
869 c = sentence_position - window + a;
870 if (c < 0)
871 continue;
872 if (c >= sentence_length)
873 continue;
874 last_word = sen[c];
875 if (last_word == -1)
876 continue;
877 for (c = 0; c < layer1_size; c++)
878 neu1[c] += syn0[c + last_word * layer1_size];
879 cw++;
880 }
881 if (cw) {
882 for (c = 0; c < layer1_size; c++)
883 neu1[c] /= cw;
884 if (hs)
885 for (d = 0; d < vocab[word].codelen; d++) {
886 f = 0;
887 l2 = vocab[word].point[d] * layer1_size;
888 // Propagate hidden -> output
889 for (c = 0; c < layer1_size; c++)
890 f += neu1[c] * syn1[c + l2];
891 if (f <= -MAX_EXP)
892 continue;
893 else if (f >= MAX_EXP)
894 continue;
895 else
896 f = expTable[(int) ((f + MAX_EXP)
897 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
898 // 'g' is the gradient multiplied by the learning rate
899 g = (1 - vocab[word].code[d] - f) * alpha;
900 // Propagate errors output -> hidden
901 for (c = 0; c < layer1_size; c++)
902 neu1e[c] += g * syn1[c + l2];
903 // Learn weights hidden -> output
904 for (c = 0; c < layer1_size; c++)
905 syn1[c + l2] += g * neu1[c];
906 if (cap == 1)
907 for (c = 0; c < layer1_size; c++)
908 capParam(syn1, c + l2);
909 }
910 // NEGATIVE SAMPLING
911 if (negative > 0)
912 for (d = 0; d < negative + 1; d++) {
913 if (d == 0) {
914 target = word;
915 label = 1;
916 } else {
917 next_random = next_random
918 * (unsigned long long) 25214903917 + 11;
919 if (word_to_group != NULL
920 && word_to_group[word] != -1) {
921 target = word;
922 while (target == word) {
923 target = group_to_table[word_to_group[word]
924 * table_size
925 + (next_random >> 16) % table_size];
926 next_random = next_random
927 * (unsigned long long) 25214903917
928 + 11;
929 }
930 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
931 } else {
932 target =
933 table[(next_random >> 16) % table_size];
934 }
935 if (target == 0)
936 target = next_random % (vocab_size - 1) + 1;
937 if (target == word)
938 continue;
939 label = 0;
940 }
941 l2 = target * layer1_size;
942 f = 0;
943 for (c = 0; c < layer1_size; c++)
944 f += neu1[c] * syn1neg[c + l2];
945 if (f > MAX_EXP)
946 g = (label - 1) * alpha;
947 else if (f < -MAX_EXP)
948 g = (label - 0) * alpha;
949 else
950 g = (label
951 - expTable[(int) ((f + MAX_EXP)
952 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
953 * alpha;
954 for (c = 0; c < layer1_size; c++)
955 neu1e[c] += g * syn1neg[c + l2];
956 for (c = 0; c < layer1_size; c++)
957 syn1neg[c + l2] += g * neu1[c];
958 if (cap == 1)
959 for (c = 0; c < layer1_size; c++)
960 capParam(syn1neg, c + l2);
961 }
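				// Note: for every (target, label) pair the update is plain logistic
				// regression: g = (label - sigmoid(f)) * alpha. sigmoid(f) is looked up
				// in expTable, which is filled in main() in the original word2vec (not
				// part of this excerpt) over (-MAX_EXP, MAX_EXP) in EXP_TABLE_SIZE steps;
				// values outside that range are clamped to 0 or 1, which is why the
				// f > MAX_EXP and f < -MAX_EXP branches exist.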
962 // Noise Contrastive Estimation
963 if (nce > 0)
964 for (d = 0; d < nce + 1; d++) {
965 if (d == 0) {
966 target = word;
967 label = 1;
968 } else {
969 next_random = next_random
970 * (unsigned long long) 25214903917 + 11;
971 if (word_to_group != NULL
972 && word_to_group[word] != -1) {
973 target = word;
974 while (target == word) {
975 target = group_to_table[word_to_group[word]
976 * table_size
977 + (next_random >> 16) % table_size];
978 next_random = next_random
979 * (unsigned long long) 25214903917
980 + 11;
981 }
982 } else {
983 target =
984 table[(next_random >> 16) % table_size];
985 }
986 if (target == 0)
987 target = next_random % (vocab_size - 1) + 1;
988 if (target == word)
989 continue;
990 label = 0;
991 }
992 l2 = target * layer1_size;
993 f = 0;
994
995 for (c = 0; c < layer1_size; c++)
996 f += neu1[c] * syn1nce[c + l2];
997 if (f > MAX_EXP)
998 g = (label - 1) * alpha;
999 else if (f < -MAX_EXP)
1000 g = (label - 0) * alpha;
1001 else {
1002 f = exp(f);
1003 g =
1004 (label
1005 - f
1006 / (noise_distribution[target]
1007 * nce + f)) * alpha;
1008 }
1009 for (c = 0; c < layer1_size; c++)
1010 neu1e[c] += g * syn1nce[c + l2];
1011 for (c = 0; c < layer1_size; c++)
1012 syn1nce[c + l2] += g * neu1[c];
1013 if (cap == 1)
1014 for (c = 0; c < layer1_size; c++)
1015 capParam(syn1nce, c + l2);
1016 }
1017 // hidden -> in
1018 for (a = b; a < window * 2 + 1 - b; a++)
1019 if (a != window) {
1020 c = sentence_position - window + a;
1021 if (c < 0)
1022 continue;
1023 if (c >= sentence_length)
1024 continue;
1025 last_word = sen[c];
1026 if (last_word == -1)
1027 continue;
1028 for (c = 0; c < layer1_size; c++)
1029 syn0[c + last_word * layer1_size] += neu1e[c];
1030 }
1031 }
1032 } else if (type == 1) { //train skip-gram
1033 for (a = b; a < window * 2 + 1 - b; a++)
1034 if (a != window) {
1035 c = sentence_position - window + a;
1036 if (c < 0)
1037 continue;
1038 if (c >= sentence_length)
1039 continue;
1040 last_word = sen[c];
1041 if (last_word == -1)
1042 continue;
1043 l1 = last_word * layer1_size;
1044 for (c = 0; c < layer1_size; c++)
1045 neu1e[c] = 0;
1046 // HIERARCHICAL SOFTMAX
1047 if (hs)
1048 for (d = 0; d < vocab[word].codelen; d++) {
1049 f = 0;
1050 l2 = vocab[word].point[d] * layer1_size;
1051 // Propagate hidden -> output
1052 for (c = 0; c < layer1_size; c++)
1053 f += syn0[c + l1] * syn1[c + l2];
1054 if (f <= -MAX_EXP)
1055 continue;
1056 else if (f >= MAX_EXP)
1057 continue;
1058 else
1059 f = expTable[(int) ((f + MAX_EXP)
1060 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1061 // 'g' is the gradient multiplied by the learning rate
1062 g = (1 - vocab[word].code[d] - f) * alpha;
1063 // Propagate errors output -> hidden
1064 for (c = 0; c < layer1_size; c++)
1065 neu1e[c] += g * syn1[c + l2];
1066 // Learn weights hidden -> output
1067 for (c = 0; c < layer1_size; c++)
1068 syn1[c + l2] += g * syn0[c + l1];
1069 if (cap == 1)
1070 for (c = 0; c < layer1_size; c++)
1071 capParam(syn1, c + l2);
1072 }
1073 // NEGATIVE SAMPLING
1074 if (negative > 0)
1075 for (d = 0; d < negative + 1; d++) {
1076 if (d == 0) {
1077 target = word;
1078 label = 1;
1079 } else {
1080 next_random = next_random
1081 * (unsigned long long) 25214903917 + 11;
1082 if (word_to_group != NULL
1083 && word_to_group[word] != -1) {
1084 target = word;
1085 while (target == word) {
1086 target =
1087 group_to_table[word_to_group[word]
1088 * table_size
1089 + (next_random >> 16)
1090 % table_size];
1091 next_random =
1092 next_random
1093 * (unsigned long long) 25214903917
1094 + 11;
1095 }
1096 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1097 } else {
1098 target = table[(next_random >> 16)
1099 % table_size];
1100 }
1101 if (target == 0)
1102 target = next_random % (vocab_size - 1) + 1;
1103 if (target == word)
1104 continue;
1105 label = 0;
1106 }
1107 l2 = target * layer1_size;
1108 f = 0;
1109 for (c = 0; c < layer1_size; c++)
1110 f += syn0[c + l1] * syn1neg[c + l2];
1111 if (f > MAX_EXP)
1112 g = (label - 1) * alpha;
1113 else if (f < -MAX_EXP)
1114 g = (label - 0) * alpha;
1115 else
1116 g =
1117 (label
1118 - expTable[(int) ((f + MAX_EXP)
1119 * (EXP_TABLE_SIZE
1120 / MAX_EXP / 2))])
1121 * alpha;
1122 for (c = 0; c < layer1_size; c++)
1123 neu1e[c] += g * syn1neg[c + l2];
1124 for (c = 0; c < layer1_size; c++)
1125 syn1neg[c + l2] += g * syn0[c + l1];
1126 if (cap == 1)
1127 for (c = 0; c < layer1_size; c++)
1128 capParam(syn1neg, c + l2);
1129 }
1130 //Noise Contrastive Estimation
1131 if (nce > 0)
1132 for (d = 0; d < nce + 1; d++) {
1133 if (d == 0) {
1134 target = word;
1135 label = 1;
1136 } else {
1137 next_random = next_random
1138 * (unsigned long long) 25214903917 + 11;
1139 if (word_to_group != NULL
1140 && word_to_group[word] != -1) {
1141 target = word;
1142 while (target == word) {
1143 target =
1144 group_to_table[word_to_group[word]
1145 * table_size
1146 + (next_random >> 16)
1147 % table_size];
1148 next_random =
1149 next_random
1150 * (unsigned long long) 25214903917
1151 + 11;
1152 }
1153 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1154 } else {
1155 target = table[(next_random >> 16)
1156 % table_size];
1157 }
1158 if (target == 0)
1159 target = next_random % (vocab_size - 1) + 1;
1160 if (target == word)
1161 continue;
1162 label = 0;
1163 }
1164 l2 = target * layer1_size;
1165 f = 0;
1166 for (c = 0; c < layer1_size; c++)
1167 f += syn0[c + l1] * syn1nce[c + l2];
1168 if (f > MAX_EXP)
1169 g = (label - 1) * alpha;
1170 else if (f < -MAX_EXP)
1171 g = (label - 0) * alpha;
1172 else {
1173 f = exp(f);
1174 g = (label
1175 - f
1176 / (noise_distribution[target]
1177 * nce + f)) * alpha;
1178 }
1179 for (c = 0; c < layer1_size; c++)
1180 neu1e[c] += g * syn1nce[c + l2];
1181 for (c = 0; c < layer1_size; c++)
1182 syn1nce[c + l2] += g * syn0[c + l1];
1183 if (cap == 1)
1184 for (c = 0; c < layer1_size; c++)
1185 capParam(syn1nce, c + l2);
1186 }
1187 // Learn weights input -> hidden
1188 for (c = 0; c < layer1_size; c++)
1189 syn0[c + l1] += neu1e[c];
1190 }
1191 } else if (type == 2) { //train the cwindow architecture
1192 // in -> hidden
1193 cw = 0;
1194 for (a = 0; a < window * 2 + 1; a++)
1195 if (a != window) {
1196 c = sentence_position - window + a;
1197 if (c < 0)
1198 continue;
1199 if (c >= sentence_length)
1200 continue;
1201 last_word = sen[c];
1202 if (last_word == -1)
1203 continue;
1204 window_offset = a * layer1_size;
1205 if (a > window)
1206 window_offset -= layer1_size;
1207 for (c = 0; c < layer1_size; c++)
1208 neu1[c + window_offset] += syn0[c
1209 + last_word * layer1_size];
1210 cw++;
1211 }
1212 if (cw) {
1213 if (hs)
1214 for (d = 0; d < vocab[word].codelen; d++) {
1215 f = 0;
1216 l2 = vocab[word].point[d] * window_layer_size;
1217 // Propagate hidden -> output
1218 for (c = 0; c < window_layer_size; c++)
1219 f += neu1[c] * syn1_window[c + l2];
1220 if (f <= -MAX_EXP)
1221 continue;
1222 else if (f >= MAX_EXP)
1223 continue;
1224 else
1225 f = expTable[(int) ((f + MAX_EXP)
1226 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1227 // 'g' is the gradient multiplied by the learning rate
1228 g = (1 - vocab[word].code[d] - f) * alpha;
1229 // Propagate errors output -> hidden
1230 for (c = 0; c < window_layer_size; c++)
1231 neu1e[c] += g * syn1_window[c + l2];
1232 // Learn weights hidden -> output
1233 for (c = 0; c < window_layer_size; c++)
1234 syn1_window[c + l2] += g * neu1[c];
1235 if (cap == 1)
1236 for (c = 0; c < window_layer_size; c++)
1237 capParam(syn1_window, c + l2);
1238 }
1239 // NEGATIVE SAMPLING
1240 if (negative > 0)
1241 for (d = 0; d < negative + 1; d++) {
1242 if (d == 0) {
1243 target = word;
1244 label = 1;
1245 } else {
1246 next_random = next_random
1247 * (unsigned long long) 25214903917 + 11;
1248 if (word_to_group != NULL
1249 && word_to_group[word] != -1) {
1250 target = word;
1251 while (target == word) {
1252 target = group_to_table[word_to_group[word]
1253 * table_size
1254 + (next_random >> 16) % table_size];
1255 next_random = next_random
1256 * (unsigned long long) 25214903917
1257 + 11;
1258 }
1259 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1260 } else {
1261 target =
1262 table[(next_random >> 16) % table_size];
1263 }
1264 if (target == 0)
1265 target = next_random % (vocab_size - 1) + 1;
1266 if (target == word)
1267 continue;
1268 label = 0;
1269 }
1270 l2 = target * window_layer_size;
1271 f = 0;
1272 for (c = 0; c < window_layer_size; c++)
1273 f += neu1[c] * syn1neg_window[c + l2];
1274 if (f > MAX_EXP)
1275 g = (label - 1) * alpha;
1276 else if (f < -MAX_EXP)
1277 g = (label - 0) * alpha;
1278 else
1279 g = (label
1280 - expTable[(int) ((f + MAX_EXP)
1281 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1282 * alpha;
1283 for (c = 0; c < window_layer_size; c++)
1284 neu1e[c] += g * syn1neg_window[c + l2];
1285 for (c = 0; c < window_layer_size; c++)
1286 syn1neg_window[c + l2] += g * neu1[c];
1287 if (cap == 1)
1288 for (c = 0; c < window_layer_size; c++)
1289 capParam(syn1neg_window, c + l2);
1290 }
1291 // Noise Contrastive Estimation
1292 if (nce > 0)
1293 for (d = 0; d < nce + 1; d++) {
1294 if (d == 0) {
1295 target = word;
1296 label = 1;
1297 } else {
1298 next_random = next_random
1299 * (unsigned long long) 25214903917 + 11;
1300 if (word_to_group != NULL
1301 && word_to_group[word] != -1) {
1302 target = word;
1303 while (target == word) {
1304 target = group_to_table[word_to_group[word]
1305 * table_size
1306 + (next_random >> 16) % table_size];
1307 next_random = next_random
1308 * (unsigned long long) 25214903917
1309 + 11;
1310 }
1311 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1312 } else {
1313 target =
1314 table[(next_random >> 16) % table_size];
1315 }
1316 if (target == 0)
1317 target = next_random % (vocab_size - 1) + 1;
1318 if (target == word)
1319 continue;
1320 label = 0;
1321 }
1322 l2 = target * window_layer_size;
1323 f = 0;
1324 for (c = 0; c < window_layer_size; c++)
1325 f += neu1[c] * syn1nce_window[c + l2];
1326 if (f > MAX_EXP)
1327 g = (label - 1) * alpha;
1328 else if (f < -MAX_EXP)
1329 g = (label - 0) * alpha;
1330 else {
1331 f = exp(f);
1332 g =
1333 (label
1334 - f
1335 / (noise_distribution[target]
1336 * nce + f)) * alpha;
1337 }
1338 for (c = 0; c < window_layer_size; c++)
1339 neu1e[c] += g * syn1nce_window[c + l2];
1340 for (c = 0; c < window_layer_size; c++)
1341 syn1nce_window[c + l2] += g * neu1[c];
1342 if (cap == 1)
1343 for (c = 0; c < window_layer_size; c++)
1344 capParam(syn1nce_window, c + l2);
1345 }
1346 // hidden -> in
1347 for (a = 0; a < window * 2 + 1; a++)
1348 if (a != window) {
1349 c = sentence_position - window + a;
1350 if (c < 0)
1351 continue;
1352 if (c >= sentence_length)
1353 continue;
1354 last_word = sen[c];
1355 if (last_word == -1)
1356 continue;
1357 window_offset = a * layer1_size;
1358 if (a > window)
1359 window_offset -= layer1_size;
1360 for (c = 0; c < layer1_size; c++)
1361 syn0[c + last_word * layer1_size] += neu1e[c
1362 + window_offset];
1363 }
1364 }
1365 } else if (type == 3) { //train structured skip-gram
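			// Note: structured skip-gram keeps a separate output block per relative
			// window position: window_offset selects one layer1_size-wide slice out of
			// the 2*window slices of syn1neg_window (positions to the right of the
			// centre are shifted down by one block, because a == window is the centre
			// word itself). This per-position output matrix is what makes the model
			// order-aware.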
1366 for (a = 0; a < window * 2 + 1; a++)
1367 if (a != window) {
1368 c = sentence_position - window + a;
1369 if (c < 0)
1370 continue;
1371 if (c >= sentence_length)
1372 continue;
1373 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001374 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001375 continue;
1376 l1 = last_word * layer1_size;
1377 window_offset = a * layer1_size;
1378 if (a > window)
1379 window_offset -= layer1_size;
1380 for (c = 0; c < layer1_size; c++)
1381 neu1e[c] = 0;
1382 // HIERARCHICAL SOFTMAX
1383 if (hs)
1384 for (d = 0; d < vocab[word].codelen; d++) {
1385 f = 0;
1386 l2 = vocab[word].point[d] * window_layer_size;
1387 // Propagate hidden -> output
1388 for (c = 0; c < layer1_size; c++)
1389 f += syn0[c + l1]
1390 * syn1_window[c + l2 + window_offset];
1391 if (f <= -MAX_EXP)
1392 continue;
1393 else if (f >= MAX_EXP)
1394 continue;
1395 else
1396 f = expTable[(int) ((f + MAX_EXP)
1397 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1398 // 'g' is the gradient multiplied by the learning rate
1399 g = (1 - vocab[word].code[d] - f) * alpha;
1400 // Propagate errors output -> hidden
1401 for (c = 0; c < layer1_size; c++)
1402 neu1e[c] += g
1403 * syn1_window[c + l2 + window_offset];
1404 // Learn weights hidden -> output
1405 for (c = 0; c < layer1_size; c++)
1406 syn1_window[c + l2 + window_offset] += g
1407 * syn0[c + l1];
1408 if (cap == 1)
1409 for (c = 0; c < layer1_size; c++)
1410 capParam(syn1_window, c + l2 + window_offset);
1411 }
1412 // NEGATIVE SAMPLING
1413 if (negative > 0)
1414 for (d = 0; d < negative + 1; d++) {
1415 if (d == 0) {
1416 target = word;
1417 label = 1;
1418 } else {
1419 next_random = next_random
1420 * (unsigned long long) 25214903917 + 11;
1421 if (word_to_group != NULL
1422 && word_to_group[word] != -1) {
1423 target = word;
1424 while (target == word) {
1425 target =
1426 group_to_table[word_to_group[word]
1427 * table_size
1428 + (next_random >> 16)
1429 % table_size];
1430 next_random =
1431 next_random
1432 * (unsigned long long) 25214903917
1433 + 11;
1434 }
1435 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1436 } else {
1437 target = table[(next_random >> 16)
1438 % table_size];
1439 }
1440 if (target == 0)
1441 target = next_random % (vocab_size - 1) + 1;
1442 if (target == word)
1443 continue;
1444 label = 0;
1445 }
1446 l2 = target * window_layer_size;
1447 f = 0;
1448 for (c = 0; c < layer1_size; c++)
1449 f +=
1450 syn0[c + l1]
1451 * syn1neg_window[c + l2
1452 + window_offset];
1453 if (f > MAX_EXP)
1454 g = (label - 1) * alpha;
1455 else if (f < -MAX_EXP)
1456 g = (label - 0) * alpha;
1457 else
1458 g =
1459 (label
1460 - expTable[(int) ((f + MAX_EXP)
1461 * (EXP_TABLE_SIZE
1462 / MAX_EXP / 2))])
1463 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001464 if(debug_mode > 2 && ((long long) id) == 0) {
1465 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1466 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1467 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001468 for (c = 0; c < layer1_size; c++)
1469 neu1e[c] +=
1470 g
1471 * syn1neg_window[c + l2
1472 + window_offset];
1473 for (c = 0; c < layer1_size; c++)
1474 syn1neg_window[c + l2 + window_offset] += g
1475 * syn0[c + l1];
1476 if (cap == 1)
1477 for (c = 0; c < layer1_size; c++)
1478 capParam(syn1neg_window,
1479 c + l2 + window_offset);
1480 }
1481 // Noise Contrastive Estimation
1482 if (nce > 0)
1483 for (d = 0; d < nce + 1; d++) {
1484 if (d == 0) {
1485 target = word;
1486 label = 1;
1487 } else {
1488 next_random = next_random
1489 * (unsigned long long) 25214903917 + 11;
1490 if (word_to_group != NULL
1491 && word_to_group[word] != -1) {
1492 target = word;
1493 while (target == word) {
1494 target =
1495 group_to_table[word_to_group[word]
1496 * table_size
1497 + (next_random >> 16)
1498 % table_size];
1499 next_random =
1500 next_random
1501 * (unsigned long long) 25214903917
1502 + 11;
1503 }
1504 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1505 } else {
1506 target = table[(next_random >> 16)
1507 % table_size];
1508 }
1509 if (target == 0)
1510 target = next_random % (vocab_size - 1) + 1;
1511 if (target == word)
1512 continue;
1513 label = 0;
1514 }
1515 l2 = target * window_layer_size;
1516 f = 0;
1517 for (c = 0; c < layer1_size; c++)
1518 f +=
1519 syn0[c + l1]
1520 * syn1nce_window[c + l2
1521 + window_offset];
1522 if (f > MAX_EXP)
1523 g = (label - 1) * alpha;
1524 else if (f < -MAX_EXP)
1525 g = (label - 0) * alpha;
1526 else {
1527 f = exp(f);
1528 g = (label
1529 - f
1530 / (noise_distribution[target]
1531 * nce + f)) * alpha;
1532 }
1533 for (c = 0; c < layer1_size; c++)
1534 neu1e[c] +=
1535 g
1536 * syn1nce_window[c + l2
1537 + window_offset];
1538 for (c = 0; c < layer1_size; c++)
1539 syn1nce_window[c + l2 + window_offset] += g
1540 * syn0[c + l1];
1541 if (cap == 1)
1542 for (c = 0; c < layer1_size; c++)
1543 capParam(syn1nce_window,
1544 c + l2 + window_offset);
1545 }
1546 // Learn weights input -> hidden
1547 for (c = 0; c < layer1_size; c++) {
1548 syn0[c + l1] += neu1e[c];
1549 if (syn0[c + l1] > 50)
1550 syn0[c + l1] = 50;
1551 if (syn0[c + l1] < -50)
1552 syn0[c + l1] = -50;
1553 }
1554 }
1555 } else if (type == 4) { //training senna
1556 // in -> hidden
1557 cw = 0;
1558 for (a = 0; a < window * 2 + 1; a++)
1559 if (a != window) {
1560 c = sentence_position - window + a;
1561 if (c < 0)
1562 continue;
1563 if (c >= sentence_length)
1564 continue;
1565 last_word = sen[c];
1566 if (last_word == -1)
1567 continue;
1568 window_offset = a * layer1_size;
1569 if (a > window)
1570 window_offset -= layer1_size;
1571 for (c = 0; c < layer1_size; c++)
1572 neu1[c + window_offset] += syn0[c
1573 + last_word * layer1_size];
1574 cw++;
1575 }
1576 if (cw) {
1577 for (a = 0; a < window_hidden_size; a++) {
1578 c = a * window_layer_size;
1579 for (b = 0; b < window_layer_size; b++) {
1580 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1581 }
1582 }
1583 if (hs)
1584 for (d = 0; d < vocab[word].codelen; d++) {
1585 f = 0;
1586 l2 = vocab[word].point[d] * window_hidden_size;
1587 // Propagate hidden -> output
1588 for (c = 0; c < window_hidden_size; c++)
1589 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1590 if (f <= -MAX_EXP)
1591 continue;
1592 else if (f >= MAX_EXP)
1593 continue;
1594 else
1595 f = expTable[(int) ((f + MAX_EXP)
1596 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1597 // 'g' is the gradient multiplied by the learning rate
1598 g = (1 - vocab[word].code[d] - f) * alpha;
1599 // Propagate errors output -> hidden
1600 for (c = 0; c < window_hidden_size; c++)
1601 neu2e[c] += dHardTanh(neu2[c], g) * g
1602 * syn_hidden_word[c + l2];
1603 // Learn weights hidden -> output
1604 for (c = 0; c < window_hidden_size; c++)
1605 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1606 * neu2[c];
1607 }
1608 // NEGATIVE SAMPLING
1609 if (negative > 0)
1610 for (d = 0; d < negative + 1; d++) {
1611 if (d == 0) {
1612 target = word;
1613 label = 1;
1614 } else {
1615 next_random = next_random
1616 * (unsigned long long) 25214903917 + 11;
1617 if (word_to_group != NULL
1618 && word_to_group[word] != -1) {
1619 target = word;
1620 while (target == word) {
1621 target = group_to_table[word_to_group[word]
1622 * table_size
1623 + (next_random >> 16) % table_size];
1624 next_random = next_random
1625 * (unsigned long long) 25214903917
1626 + 11;
1627 }
1628 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1629 } else {
1630 target =
1631 table[(next_random >> 16) % table_size];
1632 }
1633 if (target == 0)
1634 target = next_random % (vocab_size - 1) + 1;
1635 if (target == word)
1636 continue;
1637 label = 0;
1638 }
1639 l2 = target * window_hidden_size;
1640 f = 0;
1641 for (c = 0; c < window_hidden_size; c++)
1642 f += hardTanh(neu2[c])
1643 * syn_hidden_word_neg[c + l2];
1644 if (f > MAX_EXP)
1645 g = (label - 1) * alpha / negative;
1646 else if (f < -MAX_EXP)
1647 g = (label - 0) * alpha / negative;
1648 else
1649 g = (label
1650 - expTable[(int) ((f + MAX_EXP)
1651 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1652 * alpha / negative;
1653 for (c = 0; c < window_hidden_size; c++)
1654 neu2e[c] += dHardTanh(neu2[c], g) * g
1655 * syn_hidden_word_neg[c + l2];
1656 for (c = 0; c < window_hidden_size; c++)
1657 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1658 * g * neu2[c];
1659 }
1660 for (a = 0; a < window_hidden_size; a++)
1661 for (b = 0; b < window_layer_size; b++)
1662 neu1e[b] += neu2e[a]
1663 * syn_window_hidden[a * window_layer_size + b];
1664 for (a = 0; a < window_hidden_size; a++)
1665 for (b = 0; b < window_layer_size; b++)
1666 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1667 * neu1[b];
1668 // hidden -> in
1669 for (a = 0; a < window * 2 + 1; a++)
1670 if (a != window) {
1671 c = sentence_position - window + a;
1672 if (c < 0)
1673 continue;
1674 if (c >= sentence_length)
1675 continue;
1676 last_word = sen[c];
1677 if (last_word == -1)
1678 continue;
1679 window_offset = a * layer1_size;
1680 if (a > window)
1681 window_offset -= layer1_size;
1682 for (c = 0; c < layer1_size; c++)
1683 syn0[c + last_word * layer1_size] += neu1e[c
1684 + window_offset];
1685 }
1686 }
Marc Kupietz613edbf2018-01-11 21:38:03 +01001687 } else if(type == 5) {
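			// Note: type 5 does not train any vectors. For every co-occurrence it only
			// records (centre word, context word, relative position) in the external
			// collocator database (collocatordb, via inc_collocator), producing classic
			// position-wise collocation counts instead of embeddings.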
1688 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1689 c = sentence_position - window + a;
1690 if (c < 0) continue;
1691 if (c >= sentence_length) continue;
1692 last_word = sen[c];
1693 if (last_word == -1) continue;
1694 inc_collocator(cdb, word, last_word, a - window);
1695 // printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
1696 // cw++;
1697 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001698 } else {
1699 printf("unknown type %i", type);
1700 exit(0);
1701 }
1702 sentence_position++;
1703 if (sentence_position >= sentence_length) {
1704 sentence_length = 0;
1705 continue;
1706 }
1707 }
1708 fclose(fi);
1709 free(neu1);
1710 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001711 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001712 pthread_exit(NULL);
1713}
1714
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001715void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001716 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001717 real f, max_f, maxmax_f;
Marc Kupietzf00e7b02023-12-22 11:11:56 +01001718 real *target_sums=0L, bestf[MAX_CC], worstbest;
Marc Kupietz71996e72016-03-18 13:40:24 +01001719 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001720 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001721 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1722
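	// Note: for every word d (starting at index cc) this prints, per window position,
	// the context word with the highest predicted probability under the type-3
	// negative-sampling output (syn1neg_window), accumulates per-target scores across
	// positions with the noisy-OR style update s_t += (1 - s_t) * f, and keeps the
	// N = 10 best (word, probability, position) triples overall. It is therefore only
	// meaningful for nets trained or loaded with type 3 and negative sampling.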
1723 for (d = cc; d < vocab_size; d++) {
1724 for (b = 0; b < vocab_size; b++)
1725 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001726 for (b = 0; b < N; b++)
1727 bestf[b]=-1;
1728 worstbest = -1;
1729
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001730 maxmax_f = -1;
1731 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001732 for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001733 if (a != window) {
1734 max_f = -1;
1735 window_offset = a * layer1_size;
1736 if (a > window)
1737 window_offset -= layer1_size;
1738 for(target = 0; target < vocab_size; target ++) {
1739 if(target == d)
1740 continue;
1741 f = 0;
1742 for (c = 0; c < layer1_size; c++)
1743 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1744 if (f < -MAX_EXP)
1745 continue;
1746 else if (f > MAX_EXP)
1747 continue;
1748 else
1749 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1750 if(f > max_f) {
1751 max_f = f;
1752 max_target = target;
1753 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001754 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001755 if(f > worstbest) {
1756 for (b = 0; b < N; b++) {
1757 if (f > bestf[b]) {
1758 for (e = N - 1; e > b; e--) {
1759 bestf[e] = bestf[e - 1];
1760 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001761 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001762 }
1763 bestf[b] = f;
1764 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001765 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001766 break;
1767 }
1768 }
1769 worstbest = bestf[N-1];
1770 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001771 }
1772 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1773 if(max_f > maxmax_f) {
1774 maxmax_f = max_f;
1775 maxmax_target = max_target;
1776 }
1777 } else {
1778 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1779 }
1780 }
1781 max_f = -1;
1782 for (b = 0; b < vocab_size; b++) {
1783 if(target_sums[b] > max_f) {
1784 max_f = target_sums[b];
1785 max_target = b;
1786 }
1787 }
1788 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001789 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001790 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001791 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001792 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001793 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001794 }
1795}
1796
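// TrainModel: build or load the vocabulary, initialize the net, optionally dump
// collocations (-show-cc), run the training threads, and finally write word
// vectors or k-means classes to the output file.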
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001797void TrainModel() {
1798 long a, b, c, d;
1799 FILE *fo;
1800    pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // one extra slot for the monitor thread
Marc Kupietz202723e2016-07-14 09:12:00 +02001801 threadPos = malloc(num_threads * sizeof(long long));
1802 threadIters = malloc(num_threads * sizeof(int));
1803 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001804 printf("Starting training using file %s\n", train_file);
1805 starting_alpha = alpha;
1806 if (read_vocab_file[0] != 0)
1807 ReadVocab();
1808 else
1809 LearnVocabFromTrainFile();
1810 if (save_vocab_file[0] != 0)
1811 SaveVocab();
1812 if (output_file[0] == 0)
1813 return;
1814 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001815 if(cc > 0)
1816 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001817 if (negative > 0 || nce > 0)
1818 InitUnigramTable();
1819 if (negative_classes_file[0] != 0)
1820 InitClassUnigramTable();
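    // record wall-clock and CPU start times so both real and user time can be reported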
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001821 start = time(NULL);
1822 start_clock = clock();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001823 for (a = 0; a < num_threads; a++)
1824 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
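    // in debug mode an extra monitor thread reports progress; it occupies slot pt[num_threads]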
Marc Kupietz202723e2016-07-14 09:12:00 +02001825 if(debug_mode > 1)
1826 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001827 for (a = 0; a < num_threads; a++)
1828 pthread_join(pt[a], NULL);
Marc Kupietz202723e2016-07-14 09:12:00 +02001829 if(debug_mode > 1) {
1830 pthread_join(pt[num_threads], NULL);
Marc Kupietzb366bcd2018-01-11 21:29:41 +01001831 clock_t now = time(NULL);
1832 clock_t now_clock = clock();
1833 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
Marc Kupietz613edbf2018-01-11 21:38:03 +01001834    if(type == 5) // don't save vectors for classic collocators
1835 return;
Marc Kupietz202723e2016-07-14 09:12:00 +02001836 printf("Saving vectors to %s ...", output_file);
1837 fflush(stdout);
1838 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001839 fo = fopen(output_file, "wb");
1840 if (classes == 0) {
1841 // Save the word vectors
1842 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1843 for (a = 0; a < vocab_size; a++) {
1844 fprintf(fo, "%s ", vocab[a].word);
1845 if (binary)
1846 for (b = 0; b < layer1_size; b++)
1847 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1848 else
1849 for (b = 0; b < layer1_size; b++)
1850 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1851 fprintf(fo, "\n");
1852 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001853 if(debug_mode > 1)
1854 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001855 } else {
1856 // Run K-means on the word vectors
1857 int clcn = classes, iter = 10, closeid;
1858 int *centcn = (int *) malloc(classes * sizeof(int));
1859 int *cl = (int *) calloc(vocab_size, sizeof(int));
1860 real closev, x;
1861 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1862 for (a = 0; a < vocab_size; a++)
1863 cl[a] = a % clcn;
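        // 10 rounds of spherical k-means: accumulate centroids, L2-normalize them,
        // then reassign each word to the centroid with the largest dot product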
1864 for (a = 0; a < iter; a++) {
1865 for (b = 0; b < clcn * layer1_size; b++)
1866 cent[b] = 0;
1867 for (b = 0; b < clcn; b++)
1868 centcn[b] = 1;
1869 for (c = 0; c < vocab_size; c++) {
1870 for (d = 0; d < layer1_size; d++)
1871 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1872 centcn[cl[c]]++;
1873 }
1874 for (b = 0; b < clcn; b++) {
1875 closev = 0;
1876 for (c = 0; c < layer1_size; c++) {
1877 cent[layer1_size * b + c] /= centcn[b];
1878 closev += cent[layer1_size * b + c]
1879 * cent[layer1_size * b + c];
1880 }
1881 closev = sqrt(closev);
1882 for (c = 0; c < layer1_size; c++)
1883 cent[layer1_size * b + c] /= closev;
1884 }
1885 for (c = 0; c < vocab_size; c++) {
1886 closev = -10;
1887 closeid = 0;
1888 for (d = 0; d < clcn; d++) {
1889 x = 0;
1890 for (b = 0; b < layer1_size; b++)
1891 x += cent[layer1_size * d + b]
1892 * syn0[c * layer1_size + b];
1893 if (x > closev) {
1894 closev = x;
1895 closeid = d;
1896 }
1897 }
1898 cl[c] = closeid;
1899 }
1900 }
1901 // Save the K-means classes
1902 for (a = 0; a < vocab_size; a++)
1903 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1904 free(centcn);
1905 free(cent);
1906 free(cl);
1907 }
1908 fclose(fo);
1909 if (save_net_file[0] != 0)
1910 SaveNet();
1911}
1912
1913int ArgPos(char *str, int argc, char **argv) {
1914 int a;
1915 for (a = 1; a < argc; a++)
1916 if (!strcmp(str, argv[a])) {
1917 if (a == argc - 1) {
1918 printf("Argument missing for %s\n", str);
1919 exit(1);
1920 }
1921 return a;
1922 }
1923 return -1;
1924}
1925
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001926void print_help() {
Marc Kupietz83a67d42021-03-22 17:29:36 +01001927 printf("WORD VECTOR estimation toolkit v 0.9.0\n\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001928 printf("Options:\n");
1929 printf("Parameters for training:\n");
1930 printf("\t-train <file>\n");
1931 printf("\t\tUse text data from <file> to train the model\n");
1932 printf("\t-output <file>\n");
1933 printf(
1934 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1935 printf("\t-size <int>\n");
1936 printf("\t\tSet size of word vectors; default is 100\n");
1937 printf("\t-window <int>\n");
1938 printf("\t\tSet max skip length between words; default is 5\n");
1939 printf("\t-sample <float>\n");
1940 printf(
1941 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1942 printf(
1943 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1944 printf("\t-hs <int>\n");
1945 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1946 printf("\t-negative <int>\n");
1947 printf(
1948 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1949 printf("\t-negative-classes <file>\n");
1950 printf("\t\tNegative classes to sample from\n");
1951 printf("\t-nce <int>\n");
1952 printf(
1953 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1954 printf("\t-threads <int>\n");
1955 printf("\t\tUse <int> threads (default 12)\n");
1956 printf("\t-iter <int>\n");
1957 printf("\t\tRun more training iterations (default 5)\n");
1958 printf("\t-min-count <int>\n");
1959 printf(
1960 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1961 printf("\t-alpha <float>\n");
1962 printf(
1963 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1964 printf("\t-classes <int>\n");
1965 printf(
1966 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1967 printf("\t-debug <int>\n");
1968 printf(
1969 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1970 printf("\t-binary <int>\n");
1971 printf(
1972 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1973 printf("\t-save-vocab <file>\n");
1974 printf("\t\tThe vocabulary will be saved to <file>\n");
1975 printf("\t-read-vocab <file>\n");
1976 printf(
1977 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1978 printf("\t-read-net <file>\n");
1979 printf(
1980 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1981 printf("\t-save-net <file>\n");
1982 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietze423f732017-12-22 17:57:03 +01001983 printf("\t-magic-stop-file <file>\n");
1984 printf("\t\tIf the magic file <file> exists training will stop after the current cycle.\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001985 printf("\t-show-cc <int>\n");
1986 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001987 printf("\t-type <int>\n");
1988 printf(
Marc Kupietz613edbf2018-01-11 21:38:03 +01001989          "\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type, 5 for storing positional bigrams)\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001990 printf("\t-cap <int>\n");
1991 printf(
1992 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1993 printf("\nExamples:\n");
1994 printf(
Marc Kupietz83a67d42021-03-22 17:29:36 +01001995 "./dereko2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001996}
1997
1998int main(int argc, char **argv) {
1999 int i;
2000 setlocale(LC_ALL, "");
2001 if (argc == 1) {
2002 print_help();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002003 return 0;
2004 }
2005 output_file[0] = 0;
2006 save_vocab_file[0] = 0;
2007 read_vocab_file[0] = 0;
2008 save_net_file[0] = 0;
2009 read_net_file[0] = 0;
2010 negative_classes_file[0] = 0;
Marc Kupietzc7f773b2017-12-02 12:04:03 +01002011 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
2012 print_help();
2013 return(0);
2014 }
2015 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
2016 print_help();
2017 return(0);
2018 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002019 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2020 layer1_size = atoi(argv[i + 1]);
2021 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2022 strcpy(train_file, argv[i + 1]);
2023 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2024 strcpy(save_vocab_file, argv[i + 1]);
2025 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2026 strcpy(read_vocab_file, argv[i + 1]);
2027 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2028 strcpy(save_net_file, argv[i + 1]);
2029 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2030 strcpy(read_net_file, argv[i + 1]);
Marc Kupietze423f732017-12-22 17:57:03 +01002031 if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
2032 strcpy(magic_stop_file, argv[i + 1]);
2033 if (access(magic_stop_file, F_OK ) != -1) {
2034 printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
2035 exit(1);
2036 }
2037 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002038 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2039 debug_mode = atoi(argv[i + 1]);
2040 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2041 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002042 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2043 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002044 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2045 type = atoi(argv[i + 1]);
2046 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2047 strcpy(output_file, argv[i + 1]);
2048 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2049 window = atoi(argv[i + 1]);
2050 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2051 sample = atof(argv[i + 1]);
2052 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2053 hs = atoi(argv[i + 1]);
2054 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2055 negative = atoi(argv[i + 1]);
2056 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2057 strcpy(negative_classes_file, argv[i + 1]);
2058 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2059 nce = atoi(argv[i + 1]);
2060 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2061 num_threads = atoi(argv[i + 1]);
2062 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2063 iter = atoi(argv[i + 1]);
2064 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2065 min_count = atoi(argv[i + 1]);
2066 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2067 classes = atoi(argv[i + 1]);
2068 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2069 cap = atoi(argv[i + 1]);
2070 if (type == 0 || type == 2 || type == 4)
2071 alpha = 0.05;
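    // type 5 streams positional co-occurrences into a collocator database instead of training vectors; subsampling is disabled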
Marc Kupietz613edbf2018-01-11 21:38:03 +01002072 if (type==5) {
2073 sample = 0;
2074 cdb = open_collocatordb_for_write(output_file);
2075 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002076 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2077 alpha = atof(argv[i + 1]);
2078 vocab = (struct vocab_word *) calloc(vocab_max_size,
2079 sizeof(struct vocab_word));
2080 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2081 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2082 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2083 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2084 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
2085 }
Marc Kupietz210b9d52016-04-02 21:48:13 +02002086 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002087 TrainModel();
2088 return 0;
2089}
2090