Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
21#include <pthread.h>
22
23#define MAX_STRING 100
24#define EXP_TABLE_SIZE 1000
25#define MAX_EXP 6
26#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010027#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010028#define MAX_CODE_LENGTH 40
29
30const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
31
32typedef float real; // Precision of float numbers
33
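// For each vocabulary entry: cn is the corpus frequency, point and code hold
// the Huffman-tree path and binary code used by hierarchical softmax, and
// codelen is the length of that code.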
34struct vocab_word {
35 long long cn;
36 int *point;
37 char *word, *code, codelen;
38};
39
40char train_file[MAX_STRING], output_file[MAX_STRING];
41char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
42char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
43struct vocab_word *vocab;
44int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietzc2731b22016-07-14 08:56:14 +020045 num_threads = 12, min_reduce = 1;
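// 'type' selects the architecture trained in TrainModelThread below:
// 0 = CBOW, 1 = skip-gram, 2 = CWINDOW (position-dependent context vectors),
// 3 = structured skip-gram, 4 = SENNA-style window network with a hidden layer.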
Marc Kupietzd6f9c712016-03-16 11:50:56 +010046int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020047long long *threadPos;
48int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010049long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
50long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
51 classes = 0;
52real alpha = 0.025, starting_alpha, sample = 1e-3;
53real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020054real avgWordLength=0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010055clock_t start;
56
57real *syn1_window, *syn1neg_window, *syn1nce_window;
58int w_offset, window_layer_size;
59
60int window_hidden_size = 500;
61real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
62 *syn_hidden_word_nce;
63
64int hs = 0, negative = 5;
65const int table_size = 1e8;
66int *table;
67
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010068long cc = 0;
69
70//contrastive negative sampling
71char negative_classes_file[MAX_STRING];
72int *word_to_group;
73int *group_to_table; //group_size*table_size
74int class_number;
75
76//nce
77real* noise_distribution;
78int nce = 0;
79
80//param caps
81real CAP_VALUE = 50;
82int cap = 0;
83
84void capParam(real* array, int index) {
85 if (array[index] > CAP_VALUE)
86 array[index] = CAP_VALUE;
87 else if (array[index] < -CAP_VALUE)
88 array[index] = -CAP_VALUE;
89}
90
91real hardTanh(real x) {
92 if (x >= 1) {
93 return 1;
94 } else if (x <= -1) {
95 return -1;
96 } else {
97 return x;
98 }
99}
100
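// dHardTanh gates the gradient for the hard-tanh units of the type-4 network:
// it returns 0 when the unit is saturated and the gradient would push it
// further outside [-1, 1], and 1 otherwise.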
101real dHardTanh(real x, real g) {
102 if (x > 1 && g > 0) {
103 return 0;
104 }
105 if (x < -1 && g < 0) {
106 return 0;
107 }
108 return 1;
109}
110
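// The unigram table realizes the negative-sampling noise distribution: each
// word fills a share of the table proportional to cn^0.75, so drawing a
// uniform random slot, e.g. table[(next_random >> 16) % table_size], samples
// word w with probability cn(w)^0.75 / sum_v cn(v)^0.75.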
111void InitUnigramTable() {
112 int a, i;
113 long long train_words_pow = 0;
114 real d1, power = 0.75;
115 table = (int *) malloc(table_size * sizeof(int));
116 for (a = 0; a < vocab_size; a++)
117 train_words_pow += pow(vocab[a].cn, power);
118 i = 0;
119 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
120 for (a = 0; a < table_size; a++) {
121 table[a] = i;
122 if (a / (real) table_size > d1) {
123 i++;
124 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
125 }
126 if (i >= vocab_size)
127 i = vocab_size - 1;
128 }
129
130 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
131 for (a = 0; a < vocab_size; a++)
132 noise_distribution[a] = pow(vocab[a].cn, power)
133 / (real) train_words_pow;
134}
135
136// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
137void ReadWord(char *word, FILE *fin) {
138 int a = 0, ch;
139 while (!feof(fin)) {
140 ch = fgetc(fin);
141 if (ch == 13)
142 continue;
143 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
144 if (a > 0) {
145 if (ch == '\n')
146 ungetc(ch, fin);
147 break;
148 }
149 if (ch == '\n') {
150 strcpy(word, (char *) "</s>");
151 return;
152 } else
153 continue;
154 }
155 word[a] = ch;
156 a++;
157 if (a >= MAX_STRING - 1)
158 a--; // Truncate too long words
159 }
160 word[a] = 0;
161}
162
163// Returns hash value of a word
164int GetWordHash(char *word) {
165 unsigned long long a, hash = 0;
166 for (a = 0; a < strlen(word); a++)
167 hash = hash * 257 + word[a];
168 hash = hash % vocab_hash_size;
169 return hash;
170}
171
172// Returns position of a word in the vocabulary; if the word is not found, returns -1
173int SearchVocab(char *word) {
174 unsigned int hash = GetWordHash(word);
175 while (1) {
176 if (vocab_hash[hash] == -1)
177 return -1;
178 if (!strcmp(word, vocab[vocab_hash[hash]].word))
179 return vocab_hash[hash];
180 hash = (hash + 1) % vocab_hash_size;
181 }
182 return -1;
183}
184
185// Reads a word and returns its index in the vocabulary
186int ReadWordIndex(FILE *fin) {
187 char word[MAX_STRING];
188 ReadWord(word, fin);
189 if (feof(fin))
190 return -1;
191 return SearchVocab(word);
192}
193
194// Adds a word to the vocabulary
195int AddWordToVocab(char *word) {
196 unsigned int hash, length = strlen(word) + 1;
197 if (length > MAX_STRING)
198 length = MAX_STRING;
199 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
200 strcpy(vocab[vocab_size].word, word);
201 vocab[vocab_size].cn = 0;
202 vocab_size++;
203 // Reallocate memory if needed
204 if (vocab_size + 2 >= vocab_max_size) {
205 vocab_max_size += 1000;
206 vocab = (struct vocab_word *) realloc(vocab,
207 vocab_max_size * sizeof(struct vocab_word));
208 }
209 hash = GetWordHash(word);
210 while (vocab_hash[hash] != -1)
211 hash = (hash + 1) % vocab_hash_size;
212 vocab_hash[hash] = vocab_size - 1;
213 return vocab_size - 1;
214}
215
216// Used later for sorting by word counts
217int VocabCompare(const void *a, const void *b) {
218 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
219}
220
221// Sorts the vocabulary by frequency using word counts
222void SortVocab() {
223 int a, size;
224 unsigned int hash;
225 // Sort the vocabulary and keep </s> at the first position
226 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
227 for (a = 0; a < vocab_hash_size; a++)
228 vocab_hash[a] = -1;
229 size = vocab_size;
230 train_words = 0;
231 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200232 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
233		// Words occurring less than min_count times will be discarded from the vocab
234 if ((vocab[a].cn < min_count) && (a != 0)) {
235 vocab_size--;
236 free(vocab[a].word);
237 } else {
238			// Hash will be re-computed, as it is no longer valid after sorting
239 hash = GetWordHash(vocab[a].word);
240 while (vocab_hash[hash] != -1)
241 hash = (hash + 1) % vocab_hash_size;
242 vocab_hash[hash] = a;
243 train_words += vocab[a].cn;
244 }
245 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200246 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100247 vocab = (struct vocab_word *) realloc(vocab,
248 (vocab_size + 1) * sizeof(struct vocab_word));
249 // Allocate memory for the binary tree construction
250 for (a = 0; a < vocab_size; a++) {
251 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
252 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
253 }
254}
255
256// Reduces the vocabulary by removing infrequent tokens
257void ReduceVocab() {
258 int a, b = 0;
259 unsigned int hash;
260 for (a = 0; a < vocab_size; a++)
261 if (vocab[a].cn > min_reduce) {
262 vocab[b].cn = vocab[a].cn;
263 vocab[b].word = vocab[a].word;
264 b++;
265 } else
266 free(vocab[a].word);
267 vocab_size = b;
268 for (a = 0; a < vocab_hash_size; a++)
269 vocab_hash[a] = -1;
270 for (a = 0; a < vocab_size; a++) {
271		// Hash will be re-computed, as it is no longer valid
272 hash = GetWordHash(vocab[a].word);
273 while (vocab_hash[hash] != -1)
274 hash = (hash + 1) % vocab_hash_size;
275 vocab_hash[hash] = a;
276 }
277 fflush(stdout);
278 min_reduce++;
279}
280
281// Create binary Huffman tree using the word counts
282// Frequent words will have short unique binary codes
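// The construction keeps two frontiers: pos1 walks down the count-sorted
// vocabulary while pos2 walks up the newly created internal nodes, so the two
// smallest remaining nodes can be found without a priority queue.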
283void CreateBinaryTree() {
284 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
285 char code[MAX_CODE_LENGTH];
286 long long *count = (long long *) calloc(vocab_size * 2 + 1,
287 sizeof(long long));
288 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
289 sizeof(long long));
290 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
291 sizeof(long long));
292 for (a = 0; a < vocab_size; a++)
293 count[a] = vocab[a].cn;
294 for (a = vocab_size; a < vocab_size * 2; a++)
295 count[a] = 1e15;
296 pos1 = vocab_size - 1;
297 pos2 = vocab_size;
298 // Following algorithm constructs the Huffman tree by adding one node at a time
299 for (a = 0; a < vocab_size - 1; a++) {
300 // First, find two smallest nodes 'min1, min2'
301 if (pos1 >= 0) {
302 if (count[pos1] < count[pos2]) {
303 min1i = pos1;
304 pos1--;
305 } else {
306 min1i = pos2;
307 pos2++;
308 }
309 } else {
310 min1i = pos2;
311 pos2++;
312 }
313 if (pos1 >= 0) {
314 if (count[pos1] < count[pos2]) {
315 min2i = pos1;
316 pos1--;
317 } else {
318 min2i = pos2;
319 pos2++;
320 }
321 } else {
322 min2i = pos2;
323 pos2++;
324 }
325 count[vocab_size + a] = count[min1i] + count[min2i];
326 parent_node[min1i] = vocab_size + a;
327 parent_node[min2i] = vocab_size + a;
328 binary[min2i] = 1;
329 }
330 // Now assign binary code to each vocabulary word
331 for (a = 0; a < vocab_size; a++) {
332 b = a;
333 i = 0;
334 while (1) {
335 code[i] = binary[b];
336 point[i] = b;
337 i++;
338 b = parent_node[b];
339 if (b == vocab_size * 2 - 2)
340 break;
341 }
342 vocab[a].codelen = i;
343 vocab[a].point[0] = vocab_size - 2;
344 for (b = 0; b < i; b++) {
345 vocab[a].code[i - b - 1] = code[b];
346 vocab[a].point[i - b] = point[b] - vocab_size;
347 }
348 }
349 free(count);
350 free(binary);
351 free(parent_node);
352}
353
354void LearnVocabFromTrainFile() {
355 char word[MAX_STRING];
356 FILE *fin;
357 long long a, i;
358 for (a = 0; a < vocab_hash_size; a++)
359 vocab_hash[a] = -1;
360 fin = fopen(train_file, "rb");
361 if (fin == NULL) {
362 printf("ERROR: training data file not found!\n");
363 exit(1);
364 }
365 vocab_size = 0;
366 AddWordToVocab((char *) "</s>");
367 while (1) {
368 ReadWord(word, fin);
369 if (feof(fin))
370 break;
371 train_words++;
372 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
373 printf("%lldK%c", train_words / 1000, 13);
374 fflush(stdout);
375 }
376 i = SearchVocab(word);
377 if (i == -1) {
378 a = AddWordToVocab(word);
379 vocab[a].cn = 1;
380 } else
381 vocab[i].cn++;
382 if (vocab_size > vocab_hash_size * 0.7)
383 ReduceVocab();
384 }
385 SortVocab();
386 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200387 printf("Vocab size: %'lld\n", vocab_size);
388 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100389 }
390 file_size = ftell(fin);
391 fclose(fin);
392}
393
394void SaveVocab() {
395 long long i;
396 FILE *fo = fopen(save_vocab_file, "wb");
397 for (i = 0; i < vocab_size; i++)
398 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
399 fclose(fo);
400}
401
402void ReadVocab() {
403 long long a, i = 0;
404 char c;
405 char word[MAX_STRING];
406 FILE *fin = fopen(read_vocab_file, "rb");
407 if (fin == NULL) {
408 printf("Vocabulary file not found\n");
409 exit(1);
410 }
411 for (a = 0; a < vocab_hash_size; a++)
412 vocab_hash[a] = -1;
413 vocab_size = 0;
414 while (1) {
415 ReadWord(word, fin);
416 if (feof(fin))
417 break;
418 a = AddWordToVocab(word);
419 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
420 i++;
421 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200422 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100423 fin = fopen(train_file, "rb");
424 if (fin == NULL) {
425 printf("ERROR: training data file not found!\n");
426 exit(1);
427 }
428 fseek(fin, 0, SEEK_END);
429 file_size = ftell(fin);
430 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200431 SortVocab();
432 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200433 printf("Vocab size: %'lld\n", vocab_size);
434 printf("Words in vocab's train file: %'lld\n", train_words);
435 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200436 }
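	// The training file is not scanned when the vocabulary is read from disk, so
	// the token count is estimated from the file size and the count-weighted
	// average word length computed in SortVocab.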
Marc Kupietze23c5402016-07-14 11:10:09 +0200437 train_words = file_size / avgWordLength;
438 if(debug_mode > 0)
439 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100440}
441
442void InitClassUnigramTable() {
443 long long a, c;
444 printf("loading class unigrams \n");
445 FILE *fin = fopen(negative_classes_file, "rb");
446 if (fin == NULL) {
447 printf("ERROR: class file not found!\n");
448 exit(1);
449 }
450 word_to_group = (int *) malloc(vocab_size * sizeof(int));
451 for (a = 0; a < vocab_size; a++)
452 word_to_group[a] = -1;
453 char class[MAX_STRING];
454 char prev_class[MAX_STRING];
455 prev_class[0] = 0;
456 char word[MAX_STRING];
457 class_number = -1;
458 while (1) {
459 if (feof(fin))
460 break;
461 ReadWord(class, fin);
462 ReadWord(word, fin);
463 int word_index = SearchVocab(word);
464 if (word_index != -1) {
465 if (strcmp(class, prev_class) != 0) {
466 class_number++;
467 strcpy(prev_class, class);
468 }
469 word_to_group[word_index] = class_number;
470 }
471 ReadWord(word, fin);
472 }
473 class_number++;
474 fclose(fin);
475
476 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
477 long long train_words_pow = 0;
478 real d1, power = 0.75;
479
480 for (c = 0; c < class_number; c++) {
481 long long offset = c * table_size;
482 train_words_pow = 0;
483 for (a = 0; a < vocab_size; a++)
484 if (word_to_group[a] == c)
485 train_words_pow += pow(vocab[a].cn, power);
486 int i = 0;
487		while (i < vocab_size && word_to_group[i] != c) // check bounds before indexing
488 i++;
489 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
490 for (a = 0; a < table_size; a++) {
491 //printf("index %lld , word %d\n", a, i);
492 group_to_table[offset + a] = i;
493 if (a / (real) table_size > d1) {
494 i++;
495				while (i < vocab_size && word_to_group[i] != c)
496 i++;
497 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
498 }
499 if (i >= vocab_size)
500 while (word_to_group[i] != c && i >= 0)
501 i--;
502 }
503 }
504}
505
Marc Kupietz210b9d52016-04-02 21:48:13 +0200506void SaveArgs(int argc, char **argv) {
507 unsigned int i;
Marc Kupietz44136742017-12-22 17:52:56 +0100508 char args_file[MAX_STRING];
509 strcpy(args_file, output_file);
Marc Kupietz210b9d52016-04-02 21:48:13 +0200510 strcat(args_file, ".args");
511 FILE *fargs = fopen(args_file, "w");
512 if (fargs == NULL) {
513 printf("Cannot save args to %s.\n", args_file);
514 return;
515 }
516
Marc Kupietz44136742017-12-22 17:52:56 +0100517 for(i=1; i<argc; i++)
518 fprintf(fargs, "%s ", argv[i]);
519
520 fprintf(fargs, "\n");
Marc Kupietz210b9d52016-04-02 21:48:13 +0200521 fclose(fargs);
Marc Kupietz44136742017-12-22 17:52:56 +0100522
Marc Kupietz210b9d52016-04-02 21:48:13 +0200523 return;
524}
525
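// SaveNet dumps the raw weight matrices (syn0 and syn1neg_window) so that a
// later run can restore them via read_net_file in InitNet; only the structured
// skip-gram (type 3) with negative sampling is supported.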
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100526void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100527 if(type != 3 || negative <= 0) {
528 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
529 return;
530 }
531
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100532 FILE *fnet = fopen(save_net_file, "wb");
533 if (fnet == NULL) {
534 printf("Net parameter file not found\n");
535 exit(1);
536 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100537 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100538 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100539 fclose(fnet);
540}
541
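// InitNet allocates the input embeddings (syn0) and, depending on hs,
// negative, nce and the architecture, the matching output weight matrices.
// syn0 is filled with small random values unless a previously saved net is
// read back (type 3 with negative sampling only).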
542void InitNet() {
543 long long a, b;
544 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100545 long long read;
546
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100547 window_layer_size = layer1_size * window * 2;
548 a = posix_memalign((void **) &syn0, 128,
549 (long long) vocab_size * layer1_size * sizeof(real));
550 if (syn0 == NULL) {
551 printf("Memory allocation failed\n");
552 exit(1);
553 }
554
555 if (hs) {
556 a = posix_memalign((void **) &syn1, 128,
557 (long long) vocab_size * layer1_size * sizeof(real));
558 if (syn1 == NULL) {
559 printf("Memory allocation failed\n");
560 exit(1);
561 }
562 a = posix_memalign((void **) &syn1_window, 128,
563 (long long) vocab_size * window_layer_size * sizeof(real));
564 if (syn1_window == NULL) {
565 printf("Memory allocation failed\n");
566 exit(1);
567 }
568 a = posix_memalign((void **) &syn_hidden_word, 128,
569 (long long) vocab_size * window_hidden_size * sizeof(real));
570 if (syn_hidden_word == NULL) {
571 printf("Memory allocation failed\n");
572 exit(1);
573 }
574
575 for (a = 0; a < vocab_size; a++)
576 for (b = 0; b < layer1_size; b++)
577 syn1[a * layer1_size + b] = 0;
578 for (a = 0; a < vocab_size; a++)
579 for (b = 0; b < window_layer_size; b++)
580 syn1_window[a * window_layer_size + b] = 0;
581 for (a = 0; a < vocab_size; a++)
582 for (b = 0; b < window_hidden_size; b++)
583 syn_hidden_word[a * window_hidden_size + b] = 0;
584 }
585 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100586 if(type == 0) {
587 a = posix_memalign((void **) &syn1neg, 128,
588 (long long) vocab_size * layer1_size * sizeof(real));
589 if (syn1neg == NULL) {
590 printf("Memory allocation failed\n");
591 exit(1);
592 }
593 for (a = 0; a < vocab_size; a++)
594 for (b = 0; b < layer1_size; b++)
595 syn1neg[a * layer1_size + b] = 0;
596 } else if (type == 3) {
597 a = posix_memalign((void **) &syn1neg_window, 128,
598 (long long) vocab_size * window_layer_size * sizeof(real));
599 if (syn1neg_window == NULL) {
600 printf("Memory allocation failed\n");
601 exit(1);
602 }
603 for (a = 0; a < vocab_size; a++)
604 for (b = 0; b < window_layer_size; b++)
605 syn1neg_window[a * window_layer_size + b] = 0;
606 } else if (type == 4) {
607 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
608 (long long) vocab_size * window_hidden_size * sizeof(real));
609 if (syn_hidden_word_neg == NULL) {
610 printf("Memory allocation failed\n");
611 exit(1);
612 }
613 for (a = 0; a < vocab_size; a++)
614 for (b = 0; b < window_hidden_size; b++)
615 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100616 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100617 }
618 if (nce > 0) {
619 a = posix_memalign((void **) &syn1nce, 128,
620 (long long) vocab_size * layer1_size * sizeof(real));
621 if (syn1nce == NULL) {
622 printf("Memory allocation failed\n");
623 exit(1);
624 }
625 a = posix_memalign((void **) &syn1nce_window, 128,
626 (long long) vocab_size * window_layer_size * sizeof(real));
627 if (syn1nce_window == NULL) {
628 printf("Memory allocation failed\n");
629 exit(1);
630 }
631 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
632 (long long) vocab_size * window_hidden_size * sizeof(real));
633 if (syn_hidden_word_nce == NULL) {
634 printf("Memory allocation failed\n");
635 exit(1);
636 }
637
638 for (a = 0; a < vocab_size; a++)
639 for (b = 0; b < layer1_size; b++)
640 syn1nce[a * layer1_size + b] = 0;
641 for (a = 0; a < vocab_size; a++)
642 for (b = 0; b < window_layer_size; b++)
643 syn1nce_window[a * window_layer_size + b] = 0;
644 for (a = 0; a < vocab_size; a++)
645 for (b = 0; b < window_hidden_size; b++)
646 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
647 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100648
Marc Kupietz1006a272016-03-16 15:50:20 +0100649 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100650 a = posix_memalign((void **) &syn_window_hidden, 128,
651 window_hidden_size * window_layer_size * sizeof(real));
652 if (syn_window_hidden == NULL) {
653 printf("Memory allocation failed\n");
654 exit(1);
655 }
656 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
657 next_random = next_random * (unsigned long long) 25214903917 + 11;
658 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
659 - 0.5) / (window_hidden_size * window_layer_size);
660 }
661 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100662
663 if (read_net_file[0] == 0) {
664 for (a = 0; a < vocab_size; a++)
665 for (b = 0; b < layer1_size; b++) {
666 next_random = next_random * (unsigned long long) 25214903917
667 + 11;
668 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
669 / (real) 65536) - 0.5) / layer1_size;
670 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100671 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100672 FILE *fnet = fopen(read_net_file, "rb");
673 if (fnet == NULL) {
674 printf("Net parameter file not found\n");
675 exit(1);
676 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100677 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
678 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
679 if(read != vocab_size * layer1_size) {
680 fprintf(stderr, "read-net failed %lld\n", read);
681 exit(-1);
682 }
683 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
684 if(read != (long long) vocab_size * window_layer_size) {
685	  fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read,
686		  (long long) vocab_size * window_layer_size);
687 exit(-1);
688 }
689 fgetc(fnet);
690 if(!feof(fnet)) {
691 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
692 exit(-1);
693 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100694 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100695 } else {
696 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
697 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100698 }
699
700 CreateBinaryTree();
701}
702
Marc Kupietz202723e2016-07-14 09:12:00 +0200703char *currentDateTime(char *buf, real offset) {
704 time_t t;
705 time(&t);
706 t += (long) offset;
707 struct tm tstruct;
708 tstruct = *localtime(&t);
709 strftime(buf, 80, "%c", &tstruct);
710 return buf;
711}
712
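// MonitorThread derives overall progress once per second from each training
// thread's file position (threadPos) and remaining iterations (threadIters)
// and prints learning rate, throughput, elapsed time, time to go and an ETA.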
713void *MonitorThread(void *id) {
714	char *timebuf = malloc(80);
715 int i, n=num_threads;
716 long long sum;
717 sleep(1);
718 while(n > 0) {
719 sleep(1);
720 sum = n = 0;
721 for(i=0; i < num_threads; i++) {
722 if(threadPos[i] >= 0) {
723 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
724 n++;
725 } else {
726 sum += iter * file_size / num_threads;
727 }
728 }
729 if(n == 0)
730 break;
731 real finished_portion = (real) sum / (float) (file_size * iter);
732 long long now = clock();
733 long long elapsed = (now - start) / CLOCKS_PER_SEC / num_threads;
734 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed) * ((real) num_threads / n) ;
735
736 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/t/s TE: %llds TTG: %llds ETA: %s\033[K",
737 alpha,
738 finished_portion * 100,
739 (float) sum / elapsed / num_threads / 1000,
740 elapsed,
741 ttg,
742 currentDateTime(timebuf, ttg)
743 );
744 fflush(stdout);
745 }
746 pthread_exit(NULL);
747}
748
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100749void *TrainModelThread(void *id) {
750 long long a, b, d, cw, word, last_word, sentence_length = 0,
751 sentence_position = 0;
752 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
753 long long l1, l2, c, target, label, local_iter = iter;
754 unsigned long long next_random = (long long) id;
755 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100756 int input_len_1 = layer1_size;
757 int window_offset = -1;
758 if (type == 2 || type == 4) {
759 input_len_1 = window_layer_size;
760 }
761 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
762 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200763 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100764
765 int input_len_2 = 0;
766 if (type == 4) {
767 input_len_2 = window_hidden_size;
768 }
769 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
770 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
771
772 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200773 long long start_pos = file_size / (long long) num_threads * (long long) id;
774 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
775 long long current_pos = start_pos;
776	long long last_pos = start_pos;
777 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100778 while (1) {
Marc Kupietz202723e2016-07-14 09:12:00 +0200779 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100780 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200781 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100782 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100783 alpha = starting_alpha
784 * (1 - word_count_actual / (real) (iter * train_words + 1));
785 if (alpha < starting_alpha * 0.0001)
786 alpha = starting_alpha * 0.0001;
787 }
788 if (sentence_length == 0) {
789 while (1) {
790 word = ReadWordIndex(fi);
791 if (feof(fi))
792 break;
793 if (word == -1)
794 continue;
795 word_count++;
796 if (word == 0)
797 break;
798 // The subsampling randomly discards frequent words while keeping the ranking same
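				// A word with relative frequency f = cn / train_words is kept with
				// probability (sqrt(f / sample) + 1) * sample / f, so very frequent
				// words are aggressively thinned out while rare words are always kept.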
799 if (sample > 0) {
800 real ran = (sqrt(vocab[word].cn / (sample * train_words))
801 + 1) * (sample * train_words) / vocab[word].cn;
802 next_random = next_random * (unsigned long long) 25214903917
803 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100804 if (ran < (next_random & 0xFFFF) / (real) 65536) {
805 if(type == 3) // in structured skipgrams
806 word = -2; // keep the window position correct
807 else
808 continue;
809 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100810 }
811 sen[sentence_length] = word;
812 sentence_length++;
813 if (sentence_length >= MAX_SENTENCE_LENGTH)
814 break;
815 }
816 sentence_position = 0;
817 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200818 current_pos = threadPos[(long) id] = ftell(fi);
819 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100820 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200821 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100822 local_iter--;
823 if (local_iter == 0)
824 break;
825 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200826 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100827 last_word_count = 0;
828 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200829 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100830 continue;
831 }
832 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200833 while (word == -2 && sentence_position<sentence_length)
834 word = sen[++sentence_position];
835 if (sentence_position>=sentence_length) {
836 sentence_length=0;
837 continue;
838 }
839 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100840 continue;
841 for (c = 0; c < input_len_1; c++)
842 neu1[c] = 0;
843 for (c = 0; c < input_len_1; c++)
844 neu1e[c] = 0;
845 for (c = 0; c < input_len_2; c++)
846 neu2[c] = 0;
847 for (c = 0; c < input_len_2; c++)
848 neu2e[c] = 0;
849 next_random = next_random * (unsigned long long) 25214903917 + 11;
850 b = next_random % window;
851 if (type == 0) { //train the cbow architecture
852 // in -> hidden
853 cw = 0;
854 for (a = b; a < window * 2 + 1 - b; a++)
855 if (a != window) {
856 c = sentence_position - window + a;
857 if (c < 0)
858 continue;
859 if (c >= sentence_length)
860 continue;
861 last_word = sen[c];
862 if (last_word == -1)
863 continue;
864 for (c = 0; c < layer1_size; c++)
865 neu1[c] += syn0[c + last_word * layer1_size];
866 cw++;
867 }
868 if (cw) {
869 for (c = 0; c < layer1_size; c++)
870 neu1[c] /= cw;
871 if (hs)
872 for (d = 0; d < vocab[word].codelen; d++) {
873 f = 0;
874 l2 = vocab[word].point[d] * layer1_size;
875 // Propagate hidden -> output
876 for (c = 0; c < layer1_size; c++)
877 f += neu1[c] * syn1[c + l2];
878 if (f <= -MAX_EXP)
879 continue;
880 else if (f >= MAX_EXP)
881 continue;
882 else
883 f = expTable[(int) ((f + MAX_EXP)
884 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
885 // 'g' is the gradient multiplied by the learning rate
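					// f already holds sigmoid(f) from the expTable lookup, and
					// (1 - code - f) is the gradient of this Huffman node's binary
					// log-likelihood with respect to f.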
886 g = (1 - vocab[word].code[d] - f) * alpha;
887 // Propagate errors output -> hidden
888 for (c = 0; c < layer1_size; c++)
889 neu1e[c] += g * syn1[c + l2];
890 // Learn weights hidden -> output
891 for (c = 0; c < layer1_size; c++)
892 syn1[c + l2] += g * neu1[c];
893 if (cap == 1)
894 for (c = 0; c < layer1_size; c++)
895 capParam(syn1, c + l2);
896 }
897 // NEGATIVE SAMPLING
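				// One positive example (d == 0, the current word) plus 'negative' words
				// drawn from the unigram table; each update uses the logistic-regression
				// gradient g = (label - sigmoid(f)) * alpha, with sigmoid(f) taken from expTable.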
898 if (negative > 0)
899 for (d = 0; d < negative + 1; d++) {
900 if (d == 0) {
901 target = word;
902 label = 1;
903 } else {
904 next_random = next_random
905 * (unsigned long long) 25214903917 + 11;
906 if (word_to_group != NULL
907 && word_to_group[word] != -1) {
908 target = word;
909 while (target == word) {
910 target = group_to_table[word_to_group[word]
911 * table_size
912 + (next_random >> 16) % table_size];
913 next_random = next_random
914 * (unsigned long long) 25214903917
915 + 11;
916 }
917 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
918 } else {
919 target =
920 table[(next_random >> 16) % table_size];
921 }
922 if (target == 0)
923 target = next_random % (vocab_size - 1) + 1;
924 if (target == word)
925 continue;
926 label = 0;
927 }
928 l2 = target * layer1_size;
929 f = 0;
930 for (c = 0; c < layer1_size; c++)
931 f += neu1[c] * syn1neg[c + l2];
932 if (f > MAX_EXP)
933 g = (label - 1) * alpha;
934 else if (f < -MAX_EXP)
935 g = (label - 0) * alpha;
936 else
937 g = (label
938 - expTable[(int) ((f + MAX_EXP)
939 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
940 * alpha;
941 for (c = 0; c < layer1_size; c++)
942 neu1e[c] += g * syn1neg[c + l2];
943 for (c = 0; c < layer1_size; c++)
944 syn1neg[c + l2] += g * neu1[c];
945 if (cap == 1)
946 for (c = 0; c < layer1_size; c++)
947 capParam(syn1neg, c + l2);
948 }
949 // Noise Contrastive Estimation
950 if (nce > 0)
951 for (d = 0; d < nce + 1; d++) {
952 if (d == 0) {
953 target = word;
954 label = 1;
955 } else {
956 next_random = next_random
957 * (unsigned long long) 25214903917 + 11;
958 if (word_to_group != NULL
959 && word_to_group[word] != -1) {
960 target = word;
961 while (target == word) {
962 target = group_to_table[word_to_group[word]
963 * table_size
964 + (next_random >> 16) % table_size];
965 next_random = next_random
966 * (unsigned long long) 25214903917
967 + 11;
968 }
969 } else {
970 target =
971 table[(next_random >> 16) % table_size];
972 }
973 if (target == 0)
974 target = next_random % (vocab_size - 1) + 1;
975 if (target == word)
976 continue;
977 label = 0;
978 }
979 l2 = target * layer1_size;
980 f = 0;
981
982 for (c = 0; c < layer1_size; c++)
983 f += neu1[c] * syn1nce[c + l2];
984 if (f > MAX_EXP)
985 g = (label - 1) * alpha;
986 else if (f < -MAX_EXP)
987 g = (label - 0) * alpha;
988 else {
989 f = exp(f);
990 g =
991 (label
992 - f
993 / (noise_distribution[target]
994 * nce + f)) * alpha;
995 }
996 for (c = 0; c < layer1_size; c++)
997 neu1e[c] += g * syn1nce[c + l2];
998 for (c = 0; c < layer1_size; c++)
999 syn1nce[c + l2] += g * neu1[c];
1000 if (cap == 1)
1001 for (c = 0; c < layer1_size; c++)
1002 capParam(syn1nce, c + l2);
1003 }
1004 // hidden -> in
1005 for (a = b; a < window * 2 + 1 - b; a++)
1006 if (a != window) {
1007 c = sentence_position - window + a;
1008 if (c < 0)
1009 continue;
1010 if (c >= sentence_length)
1011 continue;
1012 last_word = sen[c];
1013 if (last_word == -1)
1014 continue;
1015 for (c = 0; c < layer1_size; c++)
1016 syn0[c + last_word * layer1_size] += neu1e[c];
1017 }
1018 }
1019 } else if (type == 1) { //train skip-gram
1020 for (a = b; a < window * 2 + 1 - b; a++)
1021 if (a != window) {
1022 c = sentence_position - window + a;
1023 if (c < 0)
1024 continue;
1025 if (c >= sentence_length)
1026 continue;
1027 last_word = sen[c];
1028 if (last_word == -1)
1029 continue;
1030 l1 = last_word * layer1_size;
1031 for (c = 0; c < layer1_size; c++)
1032 neu1e[c] = 0;
1033 // HIERARCHICAL SOFTMAX
1034 if (hs)
1035 for (d = 0; d < vocab[word].codelen; d++) {
1036 f = 0;
1037 l2 = vocab[word].point[d] * layer1_size;
1038 // Propagate hidden -> output
1039 for (c = 0; c < layer1_size; c++)
1040 f += syn0[c + l1] * syn1[c + l2];
1041 if (f <= -MAX_EXP)
1042 continue;
1043 else if (f >= MAX_EXP)
1044 continue;
1045 else
1046 f = expTable[(int) ((f + MAX_EXP)
1047 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1048 // 'g' is the gradient multiplied by the learning rate
1049 g = (1 - vocab[word].code[d] - f) * alpha;
1050 // Propagate errors output -> hidden
1051 for (c = 0; c < layer1_size; c++)
1052 neu1e[c] += g * syn1[c + l2];
1053 // Learn weights hidden -> output
1054 for (c = 0; c < layer1_size; c++)
1055 syn1[c + l2] += g * syn0[c + l1];
1056 if (cap == 1)
1057 for (c = 0; c < layer1_size; c++)
1058 capParam(syn1, c + l2);
1059 }
1060 // NEGATIVE SAMPLING
1061 if (negative > 0)
1062 for (d = 0; d < negative + 1; d++) {
1063 if (d == 0) {
1064 target = word;
1065 label = 1;
1066 } else {
1067 next_random = next_random
1068 * (unsigned long long) 25214903917 + 11;
1069 if (word_to_group != NULL
1070 && word_to_group[word] != -1) {
1071 target = word;
1072 while (target == word) {
1073 target =
1074 group_to_table[word_to_group[word]
1075 * table_size
1076 + (next_random >> 16)
1077 % table_size];
1078 next_random =
1079 next_random
1080 * (unsigned long long) 25214903917
1081 + 11;
1082 }
1083 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1084 } else {
1085 target = table[(next_random >> 16)
1086 % table_size];
1087 }
1088 if (target == 0)
1089 target = next_random % (vocab_size - 1) + 1;
1090 if (target == word)
1091 continue;
1092 label = 0;
1093 }
1094 l2 = target * layer1_size;
1095 f = 0;
1096 for (c = 0; c < layer1_size; c++)
1097 f += syn0[c + l1] * syn1neg[c + l2];
1098 if (f > MAX_EXP)
1099 g = (label - 1) * alpha;
1100 else if (f < -MAX_EXP)
1101 g = (label - 0) * alpha;
1102 else
1103 g =
1104 (label
1105 - expTable[(int) ((f + MAX_EXP)
1106 * (EXP_TABLE_SIZE
1107 / MAX_EXP / 2))])
1108 * alpha;
1109 for (c = 0; c < layer1_size; c++)
1110 neu1e[c] += g * syn1neg[c + l2];
1111 for (c = 0; c < layer1_size; c++)
1112 syn1neg[c + l2] += g * syn0[c + l1];
1113 if (cap == 1)
1114 for (c = 0; c < layer1_size; c++)
1115 capParam(syn1neg, c + l2);
1116 }
1117 //Noise Contrastive Estimation
1118 if (nce > 0)
1119 for (d = 0; d < nce + 1; d++) {
1120 if (d == 0) {
1121 target = word;
1122 label = 1;
1123 } else {
1124 next_random = next_random
1125 * (unsigned long long) 25214903917 + 11;
1126 if (word_to_group != NULL
1127 && word_to_group[word] != -1) {
1128 target = word;
1129 while (target == word) {
1130 target =
1131 group_to_table[word_to_group[word]
1132 * table_size
1133 + (next_random >> 16)
1134 % table_size];
1135 next_random =
1136 next_random
1137 * (unsigned long long) 25214903917
1138 + 11;
1139 }
1140 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1141 } else {
1142 target = table[(next_random >> 16)
1143 % table_size];
1144 }
1145 if (target == 0)
1146 target = next_random % (vocab_size - 1) + 1;
1147 if (target == word)
1148 continue;
1149 label = 0;
1150 }
1151 l2 = target * layer1_size;
1152 f = 0;
1153 for (c = 0; c < layer1_size; c++)
1154 f += syn0[c + l1] * syn1nce[c + l2];
1155 if (f > MAX_EXP)
1156 g = (label - 1) * alpha;
1157 else if (f < -MAX_EXP)
1158 g = (label - 0) * alpha;
1159 else {
1160 f = exp(f);
1161 g = (label
1162 - f
1163 / (noise_distribution[target]
1164 * nce + f)) * alpha;
1165 }
1166 for (c = 0; c < layer1_size; c++)
1167 neu1e[c] += g * syn1nce[c + l2];
1168 for (c = 0; c < layer1_size; c++)
1169 syn1nce[c + l2] += g * syn0[c + l1];
1170 if (cap == 1)
1171 for (c = 0; c < layer1_size; c++)
1172 capParam(syn1nce, c + l2);
1173 }
1174 // Learn weights input -> hidden
1175 for (c = 0; c < layer1_size; c++)
1176 syn0[c + l1] += neu1e[c];
1177 }
1178 } else if (type == 2) { //train the cwindow architecture
1179 // in -> hidden
1180 cw = 0;
1181 for (a = 0; a < window * 2 + 1; a++)
1182 if (a != window) {
1183 c = sentence_position - window + a;
1184 if (c < 0)
1185 continue;
1186 if (c >= sentence_length)
1187 continue;
1188 last_word = sen[c];
1189 if (last_word == -1)
1190 continue;
1191 window_offset = a * layer1_size;
1192 if (a > window)
1193 window_offset -= layer1_size;
1194 for (c = 0; c < layer1_size; c++)
1195 neu1[c + window_offset] += syn0[c
1196 + last_word * layer1_size];
1197 cw++;
1198 }
1199 if (cw) {
1200 if (hs)
1201 for (d = 0; d < vocab[word].codelen; d++) {
1202 f = 0;
1203 l2 = vocab[word].point[d] * window_layer_size;
1204 // Propagate hidden -> output
1205 for (c = 0; c < window_layer_size; c++)
1206 f += neu1[c] * syn1_window[c + l2];
1207 if (f <= -MAX_EXP)
1208 continue;
1209 else if (f >= MAX_EXP)
1210 continue;
1211 else
1212 f = expTable[(int) ((f + MAX_EXP)
1213 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1214 // 'g' is the gradient multiplied by the learning rate
1215 g = (1 - vocab[word].code[d] - f) * alpha;
1216 // Propagate errors output -> hidden
1217 for (c = 0; c < window_layer_size; c++)
1218 neu1e[c] += g * syn1_window[c + l2];
1219 // Learn weights hidden -> output
1220 for (c = 0; c < window_layer_size; c++)
1221 syn1_window[c + l2] += g * neu1[c];
1222 if (cap == 1)
1223 for (c = 0; c < window_layer_size; c++)
1224 capParam(syn1_window, c + l2);
1225 }
1226 // NEGATIVE SAMPLING
1227 if (negative > 0)
1228 for (d = 0; d < negative + 1; d++) {
1229 if (d == 0) {
1230 target = word;
1231 label = 1;
1232 } else {
1233 next_random = next_random
1234 * (unsigned long long) 25214903917 + 11;
1235 if (word_to_group != NULL
1236 && word_to_group[word] != -1) {
1237 target = word;
1238 while (target == word) {
1239 target = group_to_table[word_to_group[word]
1240 * table_size
1241 + (next_random >> 16) % table_size];
1242 next_random = next_random
1243 * (unsigned long long) 25214903917
1244 + 11;
1245 }
1246 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1247 } else {
1248 target =
1249 table[(next_random >> 16) % table_size];
1250 }
1251 if (target == 0)
1252 target = next_random % (vocab_size - 1) + 1;
1253 if (target == word)
1254 continue;
1255 label = 0;
1256 }
1257 l2 = target * window_layer_size;
1258 f = 0;
1259 for (c = 0; c < window_layer_size; c++)
1260 f += neu1[c] * syn1neg_window[c + l2];
1261 if (f > MAX_EXP)
1262 g = (label - 1) * alpha;
1263 else if (f < -MAX_EXP)
1264 g = (label - 0) * alpha;
1265 else
1266 g = (label
1267 - expTable[(int) ((f + MAX_EXP)
1268 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1269 * alpha;
1270 for (c = 0; c < window_layer_size; c++)
1271 neu1e[c] += g * syn1neg_window[c + l2];
1272 for (c = 0; c < window_layer_size; c++)
1273 syn1neg_window[c + l2] += g * neu1[c];
1274 if (cap == 1)
1275 for (c = 0; c < window_layer_size; c++)
1276 capParam(syn1neg_window, c + l2);
1277 }
1278 // Noise Contrastive Estimation
1279 if (nce > 0)
1280 for (d = 0; d < nce + 1; d++) {
1281 if (d == 0) {
1282 target = word;
1283 label = 1;
1284 } else {
1285 next_random = next_random
1286 * (unsigned long long) 25214903917 + 11;
1287 if (word_to_group != NULL
1288 && word_to_group[word] != -1) {
1289 target = word;
1290 while (target == word) {
1291 target = group_to_table[word_to_group[word]
1292 * table_size
1293 + (next_random >> 16) % table_size];
1294 next_random = next_random
1295 * (unsigned long long) 25214903917
1296 + 11;
1297 }
1298 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1299 } else {
1300 target =
1301 table[(next_random >> 16) % table_size];
1302 }
1303 if (target == 0)
1304 target = next_random % (vocab_size - 1) + 1;
1305 if (target == word)
1306 continue;
1307 label = 0;
1308 }
1309 l2 = target * window_layer_size;
1310 f = 0;
1311 for (c = 0; c < window_layer_size; c++)
1312 f += neu1[c] * syn1nce_window[c + l2];
1313 if (f > MAX_EXP)
1314 g = (label - 1) * alpha;
1315 else if (f < -MAX_EXP)
1316 g = (label - 0) * alpha;
1317 else {
1318 f = exp(f);
1319 g =
1320 (label
1321 - f
1322 / (noise_distribution[target]
1323 * nce + f)) * alpha;
1324 }
1325 for (c = 0; c < window_layer_size; c++)
1326 neu1e[c] += g * syn1nce_window[c + l2];
1327 for (c = 0; c < window_layer_size; c++)
1328 syn1nce_window[c + l2] += g * neu1[c];
1329 if (cap == 1)
1330 for (c = 0; c < window_layer_size; c++)
1331 capParam(syn1nce_window, c + l2);
1332 }
1333 // hidden -> in
1334 for (a = 0; a < window * 2 + 1; a++)
1335 if (a != window) {
1336 c = sentence_position - window + a;
1337 if (c < 0)
1338 continue;
1339 if (c >= sentence_length)
1340 continue;
1341 last_word = sen[c];
1342 if (last_word == -1)
1343 continue;
1344 window_offset = a * layer1_size;
1345 if (a > window)
1346 window_offset -= layer1_size;
1347 for (c = 0; c < layer1_size; c++)
1348 syn0[c + last_word * layer1_size] += neu1e[c
1349 + window_offset];
1350 }
1351 }
1352 } else if (type == 3) { //train structured skip-gram
1353 for (a = 0; a < window * 2 + 1; a++)
1354 if (a != window) {
1355 c = sentence_position - window + a;
1356 if (c < 0)
1357 continue;
1358 if (c >= sentence_length)
1359 continue;
1360 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001361 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001362 continue;
1363 l1 = last_word * layer1_size;
1364 window_offset = a * layer1_size;
1365 if (a > window)
1366 window_offset -= layer1_size;
1367 for (c = 0; c < layer1_size; c++)
1368 neu1e[c] = 0;
1369 // HIERARCHICAL SOFTMAX
1370 if (hs)
1371 for (d = 0; d < vocab[word].codelen; d++) {
1372 f = 0;
1373 l2 = vocab[word].point[d] * window_layer_size;
1374 // Propagate hidden -> output
1375 for (c = 0; c < layer1_size; c++)
1376 f += syn0[c + l1]
1377 * syn1_window[c + l2 + window_offset];
1378 if (f <= -MAX_EXP)
1379 continue;
1380 else if (f >= MAX_EXP)
1381 continue;
1382 else
1383 f = expTable[(int) ((f + MAX_EXP)
1384 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1385 // 'g' is the gradient multiplied by the learning rate
1386 g = (1 - vocab[word].code[d] - f) * alpha;
1387 // Propagate errors output -> hidden
1388 for (c = 0; c < layer1_size; c++)
1389 neu1e[c] += g
1390 * syn1_window[c + l2 + window_offset];
1391 // Learn weights hidden -> output
1392 for (c = 0; c < layer1_size; c++)
1393 syn1[c + l2 + window_offset] += g
1394 * syn0[c + l1];
1395 if (cap == 1)
1396 for (c = 0; c < layer1_size; c++)
1397 capParam(syn1, c + l2 + window_offset);
1398 }
1399 // NEGATIVE SAMPLING
1400 if (negative > 0)
1401 for (d = 0; d < negative + 1; d++) {
1402 if (d == 0) {
1403 target = word;
1404 label = 1;
1405 } else {
1406 next_random = next_random
1407 * (unsigned long long) 25214903917 + 11;
1408 if (word_to_group != NULL
1409 && word_to_group[word] != -1) {
1410 target = word;
1411 while (target == word) {
1412 target =
1413 group_to_table[word_to_group[word]
1414 * table_size
1415 + (next_random >> 16)
1416 % table_size];
1417 next_random =
1418 next_random
1419 * (unsigned long long) 25214903917
1420 + 11;
1421 }
1422 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1423 } else {
1424 target = table[(next_random >> 16)
1425 % table_size];
1426 }
1427 if (target == 0)
1428 target = next_random % (vocab_size - 1) + 1;
1429 if (target == word)
1430 continue;
1431 label = 0;
1432 }
1433 l2 = target * window_layer_size;
1434 f = 0;
1435 for (c = 0; c < layer1_size; c++)
1436 f +=
1437 syn0[c + l1]
1438 * syn1neg_window[c + l2
1439 + window_offset];
1440 if (f > MAX_EXP)
1441 g = (label - 1) * alpha;
1442 else if (f < -MAX_EXP)
1443 g = (label - 0) * alpha;
1444 else
1445 g =
1446 (label
1447 - expTable[(int) ((f + MAX_EXP)
1448 * (EXP_TABLE_SIZE
1449 / MAX_EXP / 2))])
1450 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001451 if(debug_mode > 2 && ((long long) id) == 0) {
1452 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1453 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1454 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001455 for (c = 0; c < layer1_size; c++)
1456 neu1e[c] +=
1457 g
1458 * syn1neg_window[c + l2
1459 + window_offset];
1460 for (c = 0; c < layer1_size; c++)
1461 syn1neg_window[c + l2 + window_offset] += g
1462 * syn0[c + l1];
1463 if (cap == 1)
1464 for (c = 0; c < layer1_size; c++)
1465 capParam(syn1neg_window,
1466 c + l2 + window_offset);
1467 }
1468				// Noise Contrastive Estimation
1469 if (nce > 0)
1470 for (d = 0; d < nce + 1; d++) {
1471 if (d == 0) {
1472 target = word;
1473 label = 1;
1474 } else {
1475 next_random = next_random
1476 * (unsigned long long) 25214903917 + 11;
1477 if (word_to_group != NULL
1478 && word_to_group[word] != -1) {
1479 target = word;
1480 while (target == word) {
1481 target =
1482 group_to_table[word_to_group[word]
1483 * table_size
1484 + (next_random >> 16)
1485 % table_size];
1486 next_random =
1487 next_random
1488 * (unsigned long long) 25214903917
1489 + 11;
1490 }
1491 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1492 } else {
1493 target = table[(next_random >> 16)
1494 % table_size];
1495 }
1496 if (target == 0)
1497 target = next_random % (vocab_size - 1) + 1;
1498 if (target == word)
1499 continue;
1500 label = 0;
1501 }
1502 l2 = target * window_layer_size;
1503 f = 0;
1504 for (c = 0; c < layer1_size; c++)
1505 f +=
1506 syn0[c + l1]
1507 * syn1nce_window[c + l2
1508 + window_offset];
1509 if (f > MAX_EXP)
1510 g = (label - 1) * alpha;
1511 else if (f < -MAX_EXP)
1512 g = (label - 0) * alpha;
1513 else {
1514 f = exp(f);
1515 g = (label
1516 - f
1517 / (noise_distribution[target]
1518 * nce + f)) * alpha;
1519 }
1520 for (c = 0; c < layer1_size; c++)
1521 neu1e[c] +=
1522 g
1523 * syn1nce_window[c + l2
1524 + window_offset];
1525 for (c = 0; c < layer1_size; c++)
1526 syn1nce_window[c + l2 + window_offset] += g
1527 * syn0[c + l1];
1528 if (cap == 1)
1529 for (c = 0; c < layer1_size; c++)
1530 capParam(syn1nce_window,
1531 c + l2 + window_offset);
1532 }
1533 // Learn weights input -> hidden
1534 for (c = 0; c < layer1_size; c++) {
1535 syn0[c + l1] += neu1e[c];
1536 if (syn0[c + l1] > 50)
1537 syn0[c + l1] = 50;
1538 if (syn0[c + l1] < -50)
1539 syn0[c + l1] = -50;
1540 }
1541 }
1542 } else if (type == 4) { //training senna
1543 // in -> hidden
1544 cw = 0;
1545 for (a = 0; a < window * 2 + 1; a++)
1546 if (a != window) {
1547 c = sentence_position - window + a;
1548 if (c < 0)
1549 continue;
1550 if (c >= sentence_length)
1551 continue;
1552 last_word = sen[c];
1553 if (last_word == -1)
1554 continue;
1555 window_offset = a * layer1_size;
1556 if (a > window)
1557 window_offset -= layer1_size;
1558 for (c = 0; c < layer1_size; c++)
1559 neu1[c + window_offset] += syn0[c
1560 + last_word * layer1_size];
1561 cw++;
1562 }
1563 if (cw) {
1564 for (a = 0; a < window_hidden_size; a++) {
1565 c = a * window_layer_size;
1566 for (b = 0; b < window_layer_size; b++) {
1567 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1568 }
1569 }
1570 if (hs)
1571 for (d = 0; d < vocab[word].codelen; d++) {
1572 f = 0;
1573 l2 = vocab[word].point[d] * window_hidden_size;
1574 // Propagate hidden -> output
1575 for (c = 0; c < window_hidden_size; c++)
1576 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1577 if (f <= -MAX_EXP)
1578 continue;
1579 else if (f >= MAX_EXP)
1580 continue;
1581 else
1582 f = expTable[(int) ((f + MAX_EXP)
1583 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1584 // 'g' is the gradient multiplied by the learning rate
1585 g = (1 - vocab[word].code[d] - f) * alpha;
1586 // Propagate errors output -> hidden
1587 for (c = 0; c < window_hidden_size; c++)
1588 neu2e[c] += dHardTanh(neu2[c], g) * g
1589 * syn_hidden_word[c + l2];
1590 // Learn weights hidden -> output
1591 for (c = 0; c < window_hidden_size; c++)
1592 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1593 * neu2[c];
1594 }
1595 // NEGATIVE SAMPLING
1596 if (negative > 0)
1597 for (d = 0; d < negative + 1; d++) {
1598 if (d == 0) {
1599 target = word;
1600 label = 1;
1601 } else {
1602 next_random = next_random
1603 * (unsigned long long) 25214903917 + 11;
1604 if (word_to_group != NULL
1605 && word_to_group[word] != -1) {
1606 target = word;
1607 while (target == word) {
1608 target = group_to_table[word_to_group[word]
1609 * table_size
1610 + (next_random >> 16) % table_size];
1611 next_random = next_random
1612 * (unsigned long long) 25214903917
1613 + 11;
1614 }
1615 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1616 } else {
1617 target =
1618 table[(next_random >> 16) % table_size];
1619 }
1620 if (target == 0)
1621 target = next_random % (vocab_size - 1) + 1;
1622 if (target == word)
1623 continue;
1624 label = 0;
1625 }
1626 l2 = target * window_hidden_size;
1627 f = 0;
1628 for (c = 0; c < window_hidden_size; c++)
1629 f += hardTanh(neu2[c])
1630 * syn_hidden_word_neg[c + l2];
1631 if (f > MAX_EXP)
1632 g = (label - 1) * alpha / negative;
1633 else if (f < -MAX_EXP)
1634 g = (label - 0) * alpha / negative;
1635 else
1636 g = (label
1637 - expTable[(int) ((f + MAX_EXP)
1638 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1639 * alpha / negative;
1640 for (c = 0; c < window_hidden_size; c++)
1641 neu2e[c] += dHardTanh(neu2[c], g) * g
1642 * syn_hidden_word_neg[c + l2];
1643 for (c = 0; c < window_hidden_size; c++)
1644 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1645 * g * neu2[c];
1646 }
1647 for (a = 0; a < window_hidden_size; a++)
1648 for (b = 0; b < window_layer_size; b++)
1649 neu1e[b] += neu2e[a]
1650 * syn_window_hidden[a * window_layer_size + b];
1651 for (a = 0; a < window_hidden_size; a++)
1652 for (b = 0; b < window_layer_size; b++)
1653 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1654 * neu1[b];
1655 // hidden -> in
1656 for (a = 0; a < window * 2 + 1; a++)
1657 if (a != window) {
1658 c = sentence_position - window + a;
1659 if (c < 0)
1660 continue;
1661 if (c >= sentence_length)
1662 continue;
1663 last_word = sen[c];
1664 if (last_word == -1)
1665 continue;
1666 window_offset = a * layer1_size;
1667 if (a > window)
1668 window_offset -= layer1_size;
1669 for (c = 0; c < layer1_size; c++)
1670 syn0[c + last_word * layer1_size] += neu1e[c
1671 + window_offset];
1672 }
1673 }
1674 } else {
1675 printf("unknown type %i", type);
1676 exit(0);
1677 }
1678 sentence_position++;
1679 if (sentence_position >= sentence_length) {
1680 sentence_length = 0;
1681 continue;
1682 }
1683 }
1684 fclose(fi);
1685 free(neu1);
1686 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001687 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001688 pthread_exit(NULL);
1689}
1690
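// ShowCollocations (run when cc > 0, presumably set from the command line)
// uses the trained structured skip-gram weights: for every vocabulary word
// from index cc on it prints the most strongly predicted context word at each
// window position, plus the top-N collocates overall with their preferred
// positions.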
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001691void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001692 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001693 real f, max_f, maxmax_f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001694 real *target_sums, bestf[MAX_CC], worstbest;
1695 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001696 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001697 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1698
1699 for (d = cc; d < vocab_size; d++) {
1700 for (b = 0; b < vocab_size; b++)
1701 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001702 for (b = 0; b < N; b++)
1703 bestf[b]=-1;
1704 worstbest = -1;
1705
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001706 maxmax_f = -1;
1707 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001708 for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001709 if (a != window) {
1710 max_f = -1;
1711 window_offset = a * layer1_size;
1712 if (a > window)
1713 window_offset -= layer1_size;
1714 for(target = 0; target < vocab_size; target ++) {
1715 if(target == d)
1716 continue;
1717 f = 0;
1718 for (c = 0; c < layer1_size; c++)
1719 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1720 if (f < -MAX_EXP)
1721 continue;
1722 else if (f > MAX_EXP)
1723 continue;
1724 else
1725 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1726 if(f > max_f) {
1727 max_f = f;
1728 max_target = target;
1729 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001730 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001731 if(f > worstbest) {
1732 for (b = 0; b < N; b++) {
1733 if (f > bestf[b]) {
1734 for (e = N - 1; e > b; e--) {
1735 bestf[e] = bestf[e - 1];
1736 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001737 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001738 }
1739 bestf[b] = f;
1740 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001741 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001742 break;
1743 }
1744 }
1745 worstbest = bestf[N-1];
1746 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001747 }
1748 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1749 if(max_f > maxmax_f) {
1750 maxmax_f = max_f;
1751 maxmax_target = max_target;
1752 }
1753 } else {
1754 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1755 }
1756 }
1757 max_f = -1;
1758 for (b = 0; b < vocab_size; b++) {
1759 if(target_sums[b] > max_f) {
1760 max_f = target_sums[b];
1761 max_target = b;
1762 }
1763 }
1764 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001765 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001766 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001767 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001768 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001769 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001770 }
1771}
1772
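// TrainModel drives the whole run: build or read the vocabulary, initialize
// the network, optionally print collocations, spawn the training threads and
// the monitor thread, then write word vectors (or K-means word classes) to
// output_file and optionally save the net.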
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001773void TrainModel() {
1774 long a, b, c, d;
1775 FILE *fo;
1776	pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // +1 slot for the monitor thread
Marc Kupietz202723e2016-07-14 09:12:00 +02001777 threadPos = malloc(num_threads * sizeof(long long));
1778 threadIters = malloc(num_threads * sizeof(int));
1779 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001780 printf("Starting training using file %s\n", train_file);
1781 starting_alpha = alpha;
1782 if (read_vocab_file[0] != 0)
1783 ReadVocab();
1784 else
1785 LearnVocabFromTrainFile();
1786 if (save_vocab_file[0] != 0)
1787 SaveVocab();
1788 if (output_file[0] == 0)
1789 return;
1790 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001791 if(cc > 0)
1792 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001793 if (negative > 0 || nce > 0)
1794 InitUnigramTable();
1795 if (negative_classes_file[0] != 0)
1796 InitClassUnigramTable();
1797 start = clock();
	for (a = 0; a < num_threads; a++)
		pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
	if (debug_mode > 1)
		pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
	for (a = 0; a < num_threads; a++)
		pthread_join(pt[a], NULL);
	if (debug_mode > 1) {
		pthread_join(pt[num_threads], NULL);
		clock_t now = clock();
		// clock() counts CPU time summed over all threads; dividing by
		// num_threads gives a rough estimate of the elapsed (real) time.
		printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now - start) / CLOCKS_PER_SEC, (now - start) / CLOCKS_PER_SEC / num_threads);
		printf("Saving vectors to %s ...", output_file);
		fflush(stdout);
	}
	fo = fopen(output_file, "wb");
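	// With -classes 0 (the default) the word vectors themselves are written:
	// a header line "vocab_size layer1_size", then one line per word, either
	// as text or, with -binary 1, as raw floats. Otherwise the vectors are
	// clustered first and only word / class-id pairs are written.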
	if (classes == 0) {
		// Save the word vectors
		fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
		for (a = 0; a < vocab_size; a++) {
			fprintf(fo, "%s ", vocab[a].word);
			if (binary)
				for (b = 0; b < layer1_size; b++)
					fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
			else
				for (b = 0; b < layer1_size; b++)
					fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
			fprintf(fo, "\n");
		}
		if (debug_mode > 1)
			fprintf(stderr, "\n");
	} else {
		// Run K-means on the word vectors
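		// Spherical k-means: words start out assigned round-robin to the
		// <classes> clusters; each iteration recomputes the centroids,
		// normalizes them to unit length and reassigns every word to the
		// centroid with the highest cosine similarity.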
		int clcn = classes, iter = 10, closeid;
		int *centcn = (int *) malloc(classes * sizeof(int));
		int *cl = (int *) calloc(vocab_size, sizeof(int));
		real closev, x;
		real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
		for (a = 0; a < vocab_size; a++)
			cl[a] = a % clcn;
		for (a = 0; a < iter; a++) {
			for (b = 0; b < clcn * layer1_size; b++)
				cent[b] = 0;
			for (b = 0; b < clcn; b++)
				centcn[b] = 1;
			for (c = 0; c < vocab_size; c++) {
				for (d = 0; d < layer1_size; d++)
					cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
				centcn[cl[c]]++;
			}
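			// Average the accumulated vectors and scale each centroid to unit
			// length, so the dot product below is a cosine similarity.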
			for (b = 0; b < clcn; b++) {
				closev = 0;
				for (c = 0; c < layer1_size; c++) {
					cent[layer1_size * b + c] /= centcn[b];
					closev += cent[layer1_size * b + c]
							* cent[layer1_size * b + c];
				}
				closev = sqrt(closev);
				for (c = 0; c < layer1_size; c++)
					cent[layer1_size * b + c] /= closev;
			}
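			// Reassign every word to its most similar centroid.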
			for (c = 0; c < vocab_size; c++) {
				closev = -10;
				closeid = 0;
				for (d = 0; d < clcn; d++) {
					x = 0;
					for (b = 0; b < layer1_size; b++)
						x += cent[layer1_size * d + b]
								* syn0[c * layer1_size + b];
					if (x > closev) {
						closev = x;
						closeid = d;
					}
				}
				cl[c] = closeid;
			}
		}
		// Save the K-means classes
		for (a = 0; a < vocab_size; a++)
			fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
		free(centcn);
		free(cent);
		free(cl);
	}
	fclose(fo);
	if (save_net_file[0] != 0)
		SaveNet();
}

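// Return the index of option str in argv, or -1 if it is not present;
// exits if the option is the last argument and thus has no value.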
int ArgPos(char *str, int argc, char **argv) {
	int a;
	for (a = 1; a < argc; a++)
		if (!strcmp(str, argv[a])) {
			if (a == argc - 1) {
				printf("Argument missing for %s\n", str);
				exit(1);
			}
			return a;
		}
	return -1;
}

void print_help() {
	printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
	printf("Options:\n");
	printf("Parameters for training:\n");
	printf("\t-train <file>\n");
	printf("\t\tUse text data from <file> to train the model\n");
	printf("\t-output <file>\n");
	printf(
			"\t\tUse <file> to save the resulting word vectors / word clusters\n");
	printf("\t-size <int>\n");
	printf("\t\tSet size of word vectors; default is 100\n");
	printf("\t-window <int>\n");
	printf("\t\tSet max skip length between words; default is 5\n");
	printf("\t-sample <float>\n");
	printf(
			"\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
	printf(
			"\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
	printf("\t-hs <int>\n");
	printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
	printf("\t-negative <int>\n");
	printf(
			"\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
	printf("\t-negative-classes <file>\n");
	printf("\t\tNegative classes to sample from\n");
	printf("\t-nce <int>\n");
	printf(
			"\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
	printf("\t-threads <int>\n");
	printf("\t\tUse <int> threads (default 12)\n");
	printf("\t-iter <int>\n");
	printf("\t\tRun more training iterations (default 5)\n");
	printf("\t-min-count <int>\n");
	printf(
			"\t\tThis will discard words that appear less than <int> times; default is 5\n");
	printf("\t-alpha <float>\n");
	printf(
			"\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
	printf("\t-classes <int>\n");
	printf(
			"\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
	printf("\t-debug <int>\n");
	printf(
			"\t\tSet the debug mode (default = 2 = more info during training)\n");
	printf("\t-binary <int>\n");
	printf(
1944 "\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
	printf("\t-save-vocab <file>\n");
	printf("\t\tThe vocabulary will be saved to <file>\n");
	printf("\t-read-vocab <file>\n");
	printf(
			"\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
	printf("\t-read-net <file>\n");
	printf(
			"\t\tThe net parameters will be read from <file>, not initialized randomly\n");
	printf("\t-save-net <file>\n");
	printf("\t\tThe net parameters will be saved to <file>\n");
	printf("\t-show-cc <int>\n");
	printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
	printf("\t-type <int>\n");
	printf(
1959 "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n");
	printf("\t-cap <int>\n");
	printf(
			"\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
	printf("\nExamples:\n");
	printf(
			"./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
}

int main(int argc, char **argv) {
	int i;
	setlocale(LC_ALL, "");
	if (argc == 1) {
		print_help();
		return 0;
	}
	output_file[0] = 0;
	save_vocab_file[0] = 0;
	read_vocab_file[0] = 0;
	save_net_file[0] = 0;
	read_net_file[0] = 0;
	negative_classes_file[0] = 0;
	if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
		print_help();
		return 0;
	}
	if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
		print_help();
		return 0;
	}
	if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
		layer1_size = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
		strcpy(train_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
		strcpy(save_vocab_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
		strcpy(read_vocab_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
		strcpy(save_net_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
		strcpy(read_net_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
		debug_mode = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
		binary = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
		cc = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
		type = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
		strcpy(output_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
		window = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
		sample = atof(argv[i + 1]);
	if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
		hs = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
		negative = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
		strcpy(negative_classes_file, argv[i + 1]);
	if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
		nce = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
		num_threads = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
		iter = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
		min_count = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
		classes = atoi(argv[i + 1]);
	if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
		cap = atoi(argv[i + 1]);
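	// CBOW-style models (cbow, cwindow, senna) default to a higher starting
	// learning rate; an explicit -alpha still overrides this.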
	if (type == 0 || type == 2 || type == 4)
		alpha = 0.05;
	if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
		alpha = atof(argv[i + 1]);
	vocab = (struct vocab_word *) calloc(vocab_max_size,
			sizeof(struct vocab_word));
	vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
	expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
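	// The table holds EXP_TABLE_SIZE precomputed sigmoid values for inputs in
	// [-MAX_EXP, MAX_EXP]; lookups elsewhere use the index
	// (int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)).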
	for (i = 0; i < EXP_TABLE_SIZE; i++) {
		expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
		expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute the sigmoid f(x) = e^x / (e^x + 1)
	}
	SaveArgs(argc, argv);
	TrainModel();
	return 0;
}