Marc Kupietzd6f9c712016-03-16 11:50:56 +01001// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
Marc Kupietze23c5402016-07-14 11:10:09 +020015#include <locale.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010016#include <stdio.h>
17#include <stdlib.h>
18#include <string.h>
Marc Kupietz202723e2016-07-14 09:12:00 +020019#include <unistd.h>
Marc Kupietzd6f9c712016-03-16 11:50:56 +010020#include <math.h>
 21#include <pthread.h>
#include <time.h> // clock(), time_t, localtime(), strftime() are used below
22
23#define MAX_STRING 100
24#define EXP_TABLE_SIZE 1000
25#define MAX_EXP 6
26#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz71996e72016-03-18 13:40:24 +010027#define MAX_CC 100
Marc Kupietzd6f9c712016-03-16 11:50:56 +010028#define MAX_CODE_LENGTH 40
29
30const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
31
32typedef float real; // Precision of float numbers
33
34struct vocab_word {
35 long long cn;
36 int *point;
37 char *word, *code, codelen;
38};
39
40char train_file[MAX_STRING], output_file[MAX_STRING];
41char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
42char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
43struct vocab_word *vocab;
44int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietzc2731b22016-07-14 08:56:14 +020045 num_threads = 12, min_reduce = 1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010046int *vocab_hash;
Marc Kupietzc2731b22016-07-14 08:56:14 +020047long long *threadPos;
48int *threadIters;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010049long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
50long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
51 classes = 0;
52real alpha = 0.025, starting_alpha, sample = 1e-3;
53real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
Marc Kupietzc2731b22016-07-14 08:56:14 +020054real avgWordLength=0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +010055clock_t start;
56
57real *syn1_window, *syn1neg_window, *syn1nce_window;
58int w_offset, window_layer_size;
59
60int window_hidden_size = 500;
61real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
62 *syn_hidden_word_nce;
63
64int hs = 0, negative = 5;
65const int table_size = 1e8;
66int *table;
67
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +010068long cc = 0;
69
Marc Kupietzd6f9c712016-03-16 11:50:56 +010070//contrastive negative sampling
71char negative_classes_file[MAX_STRING];
72int *word_to_group;
73int *group_to_table; //group_size*table_size
74int class_number;
75
76//nce
77real* noise_distribution;
78int nce = 0;
79
80//param caps
81real CAP_VALUE = 50;
82int cap = 0;
83
84void capParam(real* array, int index) {
85 if (array[index] > CAP_VALUE)
86 array[index] = CAP_VALUE;
87 else if (array[index] < -CAP_VALUE)
88 array[index] = -CAP_VALUE;
89}
90
91real hardTanh(real x) {
92 if (x >= 1) {
93 return 1;
94 } else if (x <= -1) {
95 return -1;
96 } else {
97 return x;
98 }
99}
100
101real dHardTanh(real x, real g) {
102 if (x > 1 && g > 0) {
103 return 0;
104 }
105 if (x < -1 && g < 0) {
106 return 0;
107 }
108 return 1;
109}
110
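// The unigram table below implements negative sampling from P(w) proportional
// to count(w)^0.75: each word owns a share of the table_size slots proportional
// to its smoothed count, so drawing a uniformly random slot draws a word from
// that distribution. Illustrative example with hypothetical counts {100, 10, 1}:
// the weights are 100^0.75 : 10^0.75 : 1 ≈ 31.6 : 5.6 : 1, i.e. frequent words
// are sampled noticeably less often than their raw frequency would suggest.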
111void InitUnigramTable() {
112 int a, i;
113 long long train_words_pow = 0;
114 real d1, power = 0.75;
115 table = (int *) malloc(table_size * sizeof(int));
116 for (a = 0; a < vocab_size; a++)
117 train_words_pow += pow(vocab[a].cn, power);
118 i = 0;
119 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
120 for (a = 0; a < table_size; a++) {
121 table[a] = i;
122 if (a / (real) table_size > d1) {
123 i++;
124 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
125 }
126 if (i >= vocab_size)
127 i = vocab_size - 1;
128 }
129
130 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
131 for (a = 0; a < vocab_size; a++)
132 noise_distribution[a] = pow(vocab[a].cn, power)
133 / (real) train_words_pow;
134}
135
136// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
137void ReadWord(char *word, FILE *fin) {
138 int a = 0, ch;
139 while (!feof(fin)) {
140 ch = fgetc(fin);
141 if (ch == 13)
142 continue;
143 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
144 if (a > 0) {
145 if (ch == '\n')
146 ungetc(ch, fin);
147 break;
148 }
149 if (ch == '\n') {
150 strcpy(word, (char *) "</s>");
151 return;
152 } else
153 continue;
154 }
155 word[a] = ch;
156 a++;
157 if (a >= MAX_STRING - 1)
158 a--; // Truncate too long words
159 }
160 word[a] = 0;
161}
162
163// Returns hash value of a word
164int GetWordHash(char *word) {
165 unsigned long long a, hash = 0;
166 for (a = 0; a < strlen(word); a++)
167 hash = hash * 257 + word[a];
168 hash = hash % vocab_hash_size;
169 return hash;
170}
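// GetWordHash() is a simple polynomial rolling hash (hash = hash * 257 + byte,
// modulo vocab_hash_size); collisions are resolved by open addressing with
// linear probing: SearchVocab() and AddWordToVocab() below step through
// vocab_hash until they hit the word or an empty (-1) slot.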
171
172// Returns position of a word in the vocabulary; if the word is not found, returns -1
173int SearchVocab(char *word) {
174 unsigned int hash = GetWordHash(word);
175 while (1) {
176 if (vocab_hash[hash] == -1)
177 return -1;
178 if (!strcmp(word, vocab[vocab_hash[hash]].word))
179 return vocab_hash[hash];
180 hash = (hash + 1) % vocab_hash_size;
181 }
182 return -1;
183}
184
185// Reads a word and returns its index in the vocabulary
186int ReadWordIndex(FILE *fin) {
187 char word[MAX_STRING];
188 ReadWord(word, fin);
189 if (feof(fin))
190 return -1;
191 return SearchVocab(word);
192}
193
194// Adds a word to the vocabulary
195int AddWordToVocab(char *word) {
196 unsigned int hash, length = strlen(word) + 1;
197 if (length > MAX_STRING)
198 length = MAX_STRING;
199 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
200 strcpy(vocab[vocab_size].word, word);
201 vocab[vocab_size].cn = 0;
202 vocab_size++;
203 // Reallocate memory if needed
204 if (vocab_size + 2 >= vocab_max_size) {
205 vocab_max_size += 1000;
206 vocab = (struct vocab_word *) realloc(vocab,
207 vocab_max_size * sizeof(struct vocab_word));
208 }
209 hash = GetWordHash(word);
210 while (vocab_hash[hash] != -1)
211 hash = (hash + 1) % vocab_hash_size;
212 vocab_hash[hash] = vocab_size - 1;
213 return vocab_size - 1;
214}
215
216// Used later for sorting by word counts
217int VocabCompare(const void *a, const void *b) {
218 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
219}
220
221// Sorts the vocabulary by frequency using word counts
222void SortVocab() {
223 int a, size;
224 unsigned int hash;
225 // Sort the vocabulary and keep </s> at the first position
226 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
227 for (a = 0; a < vocab_hash_size; a++)
228 vocab_hash[a] = -1;
229 size = vocab_size;
230 train_words = 0;
231 for (a = 0; a < size; a++) {
Marc Kupietzc2731b22016-07-14 08:56:14 +0200232 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100233 // Words occurring less than min_count times will be discarded from the vocab
234 if ((vocab[a].cn < min_count) && (a != 0)) {
235 vocab_size--;
236 free(vocab[a].word);
237 } else {
 238 // Hash will be re-computed, as it is no longer valid after sorting
239 hash = GetWordHash(vocab[a].word);
240 while (vocab_hash[hash] != -1)
241 hash = (hash + 1) % vocab_hash_size;
242 vocab_hash[hash] = a;
243 train_words += vocab[a].cn;
244 }
245 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200246 avgWordLength /= train_words;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100247 vocab = (struct vocab_word *) realloc(vocab,
248 (vocab_size + 1) * sizeof(struct vocab_word));
249 // Allocate memory for the binary tree construction
250 for (a = 0; a < vocab_size; a++) {
251 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
252 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
253 }
254}
255
256// Reduces the vocabulary by removing infrequent tokens
257void ReduceVocab() {
258 int a, b = 0;
259 unsigned int hash;
260 for (a = 0; a < vocab_size; a++)
261 if (vocab[a].cn > min_reduce) {
262 vocab[b].cn = vocab[a].cn;
263 vocab[b].word = vocab[a].word;
264 b++;
265 } else
266 free(vocab[a].word);
267 vocab_size = b;
268 for (a = 0; a < vocab_hash_size; a++)
269 vocab_hash[a] = -1;
270 for (a = 0; a < vocab_size; a++) {
 271 // Hash will be re-computed, as it is no longer valid
272 hash = GetWordHash(vocab[a].word);
273 while (vocab_hash[hash] != -1)
274 hash = (hash + 1) % vocab_hash_size;
275 vocab_hash[hash] = a;
276 }
277 fflush(stdout);
278 min_reduce++;
279}
280
281// Create binary Huffman tree using the word counts
 282// Frequent words will have short unique binary codes
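// Sketch of the construction: count[] holds the leaf counts in descending order
// followed by the internal-node counts created so far; pos1 walks left over the
// leaves and pos2 walks right over the internal nodes, so the two smallest
// remaining nodes can be found without a priority queue. For example, with
// counts {5, 2, 1, 1} the two rare words are merged first (1+1=2), then {2, 2},
// then {5, 4}: the most frequent word ends up with a 1-bit code and the two
// rarest words with 3-bit codes.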
283void CreateBinaryTree() {
284 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
285 char code[MAX_CODE_LENGTH];
286 long long *count = (long long *) calloc(vocab_size * 2 + 1,
287 sizeof(long long));
288 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
289 sizeof(long long));
290 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
291 sizeof(long long));
292 for (a = 0; a < vocab_size; a++)
293 count[a] = vocab[a].cn;
294 for (a = vocab_size; a < vocab_size * 2; a++)
295 count[a] = 1e15;
296 pos1 = vocab_size - 1;
297 pos2 = vocab_size;
 298 // The following algorithm constructs the Huffman tree by adding one node at a time
299 for (a = 0; a < vocab_size - 1; a++) {
300 // First, find two smallest nodes 'min1, min2'
301 if (pos1 >= 0) {
302 if (count[pos1] < count[pos2]) {
303 min1i = pos1;
304 pos1--;
305 } else {
306 min1i = pos2;
307 pos2++;
308 }
309 } else {
310 min1i = pos2;
311 pos2++;
312 }
313 if (pos1 >= 0) {
314 if (count[pos1] < count[pos2]) {
315 min2i = pos1;
316 pos1--;
317 } else {
318 min2i = pos2;
319 pos2++;
320 }
321 } else {
322 min2i = pos2;
323 pos2++;
324 }
325 count[vocab_size + a] = count[min1i] + count[min2i];
326 parent_node[min1i] = vocab_size + a;
327 parent_node[min2i] = vocab_size + a;
328 binary[min2i] = 1;
329 }
330 // Now assign binary code to each vocabulary word
331 for (a = 0; a < vocab_size; a++) {
332 b = a;
333 i = 0;
334 while (1) {
335 code[i] = binary[b];
336 point[i] = b;
337 i++;
338 b = parent_node[b];
339 if (b == vocab_size * 2 - 2)
340 break;
341 }
342 vocab[a].codelen = i;
343 vocab[a].point[0] = vocab_size - 2;
344 for (b = 0; b < i; b++) {
345 vocab[a].code[i - b - 1] = code[b];
346 vocab[a].point[i - b] = point[b] - vocab_size;
347 }
348 }
349 free(count);
350 free(binary);
351 free(parent_node);
352}
353
354void LearnVocabFromTrainFile() {
355 char word[MAX_STRING];
356 FILE *fin;
357 long long a, i;
358 for (a = 0; a < vocab_hash_size; a++)
359 vocab_hash[a] = -1;
360 fin = fopen(train_file, "rb");
361 if (fin == NULL) {
362 printf("ERROR: training data file not found!\n");
363 exit(1);
364 }
365 vocab_size = 0;
366 AddWordToVocab((char *) "</s>");
367 while (1) {
368 ReadWord(word, fin);
369 if (feof(fin))
370 break;
371 train_words++;
372 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
373 printf("%lldK%c", train_words / 1000, 13);
374 fflush(stdout);
375 }
376 i = SearchVocab(word);
377 if (i == -1) {
378 a = AddWordToVocab(word);
379 vocab[a].cn = 1;
380 } else
381 vocab[i].cn++;
382 if (vocab_size > vocab_hash_size * 0.7)
383 ReduceVocab();
384 }
385 SortVocab();
386 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200387 printf("Vocab size: %'lld\n", vocab_size);
388 printf("Words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100389 }
390 file_size = ftell(fin);
391 fclose(fin);
392}
393
394void SaveVocab() {
395 long long i;
396 FILE *fo = fopen(save_vocab_file, "wb");
397 for (i = 0; i < vocab_size; i++)
398 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
399 fclose(fo);
400}
401
402void ReadVocab() {
403 long long a, i = 0;
404 char c;
405 char word[MAX_STRING];
406 FILE *fin = fopen(read_vocab_file, "rb");
407 if (fin == NULL) {
408 printf("Vocabulary file not found\n");
409 exit(1);
410 }
411 for (a = 0; a < vocab_hash_size; a++)
412 vocab_hash[a] = -1;
413 vocab_size = 0;
414 while (1) {
415 ReadWord(word, fin);
416 if (feof(fin))
417 break;
418 a = AddWordToVocab(word);
419 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
420 i++;
421 }
Marc Kupietzc2731b22016-07-14 08:56:14 +0200422 fclose(fin);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100423 fin = fopen(train_file, "rb");
424 if (fin == NULL) {
425 printf("ERROR: training data file not found!\n");
426 exit(1);
427 }
428 fseek(fin, 0, SEEK_END);
429 file_size = ftell(fin);
430 fclose(fin);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200431 SortVocab();
432 if (debug_mode > 0) {
Marc Kupietze23c5402016-07-14 11:10:09 +0200433 printf("Vocab size: %'lld\n", vocab_size);
434 printf("Words in vocab's train file: %'lld\n", train_words);
435 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietzc2731b22016-07-14 08:56:14 +0200436 }
Marc Kupietze23c5402016-07-14 11:10:09 +0200437 train_words = file_size / avgWordLength;
438 if(debug_mode > 0)
439 printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100440}
441
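// Builds one unigram table per word class from negative_classes_file so that
// negative samples can be drawn from the same class as the current word.
// Judging from the read loop below, each record consists of three
// whitespace-separated tokens: the class label, the word, and a third token
// that is read but ignored.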
442void InitClassUnigramTable() {
443 long long a, c;
444 printf("loading class unigrams \n");
445 FILE *fin = fopen(negative_classes_file, "rb");
446 if (fin == NULL) {
447 printf("ERROR: class file not found!\n");
448 exit(1);
449 }
450 word_to_group = (int *) malloc(vocab_size * sizeof(int));
451 for (a = 0; a < vocab_size; a++)
452 word_to_group[a] = -1;
453 char class[MAX_STRING];
454 char prev_class[MAX_STRING];
455 prev_class[0] = 0;
456 char word[MAX_STRING];
457 class_number = -1;
458 while (1) {
459 if (feof(fin))
460 break;
461 ReadWord(class, fin);
462 ReadWord(word, fin);
463 int word_index = SearchVocab(word);
464 if (word_index != -1) {
465 if (strcmp(class, prev_class) != 0) {
466 class_number++;
467 strcpy(prev_class, class);
468 }
469 word_to_group[word_index] = class_number;
470 }
471 ReadWord(word, fin);
472 }
473 class_number++;
474 fclose(fin);
475
476 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
477 long long train_words_pow = 0;
478 real d1, power = 0.75;
479
480 for (c = 0; c < class_number; c++) {
481 long long offset = c * table_size;
482 train_words_pow = 0;
483 for (a = 0; a < vocab_size; a++)
484 if (word_to_group[a] == c)
485 train_words_pow += pow(vocab[a].cn, power);
486 int i = 0;
 487 while (i < vocab_size && word_to_group[i] != c)
488 i++;
489 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
490 for (a = 0; a < table_size; a++) {
491 //printf("index %lld , word %d\n", a, i);
492 group_to_table[offset + a] = i;
493 if (a / (real) table_size > d1) {
494 i++;
 495 while (i < vocab_size && word_to_group[i] != c)
496 i++;
497 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
498 }
 499 if (i >= vocab_size) {
 i = vocab_size - 1; // step back inside the array before searching
 500 while (i >= 0 && word_to_group[i] != c)
 501 i--;
 }
502 }
503 }
504}
505
Marc Kupietz210b9d52016-04-02 21:48:13 +0200506void SaveArgs(int argc, char **argv) {
507 unsigned int i;
508 size_t len = 0;
509 char *_all_args, *all_args;
 510 char *args_file = (char *) malloc(strlen(output_file) + strlen(".args") + 1);
 strcpy(args_file, output_file);
 511 strcat(args_file, ".args");
512 FILE *fargs = fopen(args_file, "w");
513 if (fargs == NULL) {
514 printf("Cannot save args to %s.\n", args_file);
515 return;
516 }
517
518 for(i=1; i<argc; i++) {
519 len += strlen(argv[i]);
520 }
521
522 _all_args = all_args = (char *)malloc(len+argc-1);
523
524 for(i=1; i<argc; i++) {
525 memcpy(_all_args, argv[i], strlen(argv[i]));
526 _all_args += strlen(argv[i])+1;
527 *(_all_args-1) = ' ';
528 }
529 *(_all_args-1) = 0;
530
531 fprintf(fargs, "%s\n", all_args);
532 fclose(fargs);
533
534 free(all_args);
535
536 return;
537}
538
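// Net file layout shared by SaveNet() and the read path in InitNet(), only
// supported for type 3 with negative sampling: the raw syn0 matrix
// (vocab_size * layer1_size reals) followed directly by syn1neg_window
// (vocab_size * window_layer_size reals), with no header.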
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100539void SaveNet() {
Marc Kupietz313fcc52016-03-16 16:43:37 +0100540 if(type != 3 || negative <= 0) {
541 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
542 return;
543 }
544
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100545 FILE *fnet = fopen(save_net_file, "wb");
546 if (fnet == NULL) {
547 printf("Net parameter file not found\n");
548 exit(1);
549 }
Marc Kupietzc6979332016-03-16 15:29:07 +0100550 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100551 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100552 fclose(fnet);
553}
554
555void InitNet() {
556 long long a, b;
557 unsigned long long next_random = 1;
Marc Kupietz57c0df12016-03-18 12:48:00 +0100558 long long read;
559
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100560 window_layer_size = layer1_size * window * 2;
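	// For the position-aware models every word gets 2 * window sub-vectors of
	// layer1_size each; window_offset (computed in TrainModelThread) selects the
	// sub-vector belonging to relative context position a, skipping a == window,
	// which is the centre word itself.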
561 a = posix_memalign((void **) &syn0, 128,
562 (long long) vocab_size * layer1_size * sizeof(real));
563 if (syn0 == NULL) {
564 printf("Memory allocation failed\n");
565 exit(1);
566 }
567
568 if (hs) {
569 a = posix_memalign((void **) &syn1, 128,
570 (long long) vocab_size * layer1_size * sizeof(real));
571 if (syn1 == NULL) {
572 printf("Memory allocation failed\n");
573 exit(1);
574 }
575 a = posix_memalign((void **) &syn1_window, 128,
576 (long long) vocab_size * window_layer_size * sizeof(real));
577 if (syn1_window == NULL) {
578 printf("Memory allocation failed\n");
579 exit(1);
580 }
581 a = posix_memalign((void **) &syn_hidden_word, 128,
582 (long long) vocab_size * window_hidden_size * sizeof(real));
583 if (syn_hidden_word == NULL) {
584 printf("Memory allocation failed\n");
585 exit(1);
586 }
587
588 for (a = 0; a < vocab_size; a++)
589 for (b = 0; b < layer1_size; b++)
590 syn1[a * layer1_size + b] = 0;
591 for (a = 0; a < vocab_size; a++)
592 for (b = 0; b < window_layer_size; b++)
593 syn1_window[a * window_layer_size + b] = 0;
594 for (a = 0; a < vocab_size; a++)
595 for (b = 0; b < window_hidden_size; b++)
596 syn_hidden_word[a * window_hidden_size + b] = 0;
597 }
598 if (negative > 0) {
Marc Kupietz1006a272016-03-16 15:50:20 +0100599 if(type == 0) {
600 a = posix_memalign((void **) &syn1neg, 128,
601 (long long) vocab_size * layer1_size * sizeof(real));
602 if (syn1neg == NULL) {
603 printf("Memory allocation failed\n");
604 exit(1);
605 }
606 for (a = 0; a < vocab_size; a++)
607 for (b = 0; b < layer1_size; b++)
608 syn1neg[a * layer1_size + b] = 0;
609 } else if (type == 3) {
610 a = posix_memalign((void **) &syn1neg_window, 128,
611 (long long) vocab_size * window_layer_size * sizeof(real));
612 if (syn1neg_window == NULL) {
613 printf("Memory allocation failed\n");
614 exit(1);
615 }
616 for (a = 0; a < vocab_size; a++)
617 for (b = 0; b < window_layer_size; b++)
618 syn1neg_window[a * window_layer_size + b] = 0;
619 } else if (type == 4) {
620 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
621 (long long) vocab_size * window_hidden_size * sizeof(real));
622 if (syn_hidden_word_neg == NULL) {
623 printf("Memory allocation failed\n");
624 exit(1);
625 }
626 for (a = 0; a < vocab_size; a++)
627 for (b = 0; b < window_hidden_size; b++)
628 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100629 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100630 }
631 if (nce > 0) {
632 a = posix_memalign((void **) &syn1nce, 128,
633 (long long) vocab_size * layer1_size * sizeof(real));
634 if (syn1nce == NULL) {
635 printf("Memory allocation failed\n");
636 exit(1);
637 }
638 a = posix_memalign((void **) &syn1nce_window, 128,
639 (long long) vocab_size * window_layer_size * sizeof(real));
640 if (syn1nce_window == NULL) {
641 printf("Memory allocation failed\n");
642 exit(1);
643 }
644 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
645 (long long) vocab_size * window_hidden_size * sizeof(real));
646 if (syn_hidden_word_nce == NULL) {
647 printf("Memory allocation failed\n");
648 exit(1);
649 }
650
651 for (a = 0; a < vocab_size; a++)
652 for (b = 0; b < layer1_size; b++)
653 syn1nce[a * layer1_size + b] = 0;
654 for (a = 0; a < vocab_size; a++)
655 for (b = 0; b < window_layer_size; b++)
656 syn1nce_window[a * window_layer_size + b] = 0;
657 for (a = 0; a < vocab_size; a++)
658 for (b = 0; b < window_hidden_size; b++)
659 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
660 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100661
Marc Kupietz1006a272016-03-16 15:50:20 +0100662 if(type == 4) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100663 a = posix_memalign((void **) &syn_window_hidden, 128,
664 window_hidden_size * window_layer_size * sizeof(real));
665 if (syn_window_hidden == NULL) {
666 printf("Memory allocation failed\n");
667 exit(1);
668 }
669 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
670 next_random = next_random * (unsigned long long) 25214903917 + 11;
671 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
672 - 0.5) / (window_hidden_size * window_layer_size);
673 }
674 }
Marc Kupietz1006a272016-03-16 15:50:20 +0100675
676 if (read_net_file[0] == 0) {
677 for (a = 0; a < vocab_size; a++)
678 for (b = 0; b < layer1_size; b++) {
679 next_random = next_random * (unsigned long long) 25214903917
680 + 11;
681 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
682 / (real) 65536) - 0.5) / layer1_size;
683 }
Marc Kupietz313fcc52016-03-16 16:43:37 +0100684 } else if(type == 3 && negative > 0) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100685 FILE *fnet = fopen(read_net_file, "rb");
686 if (fnet == NULL) {
687 printf("Net parameter file not found\n");
688 exit(1);
689 }
Marc Kupietz57c0df12016-03-18 12:48:00 +0100690 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
691 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
692 if(read != vocab_size * layer1_size) {
693 fprintf(stderr, "read-net failed %lld\n", read);
694 exit(-1);
695 }
696 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
697 if(read != (long long) vocab_size * window_layer_size) {
 698 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read,
 699 (long long) vocab_size * window_layer_size);
700 exit(-1);
701 }
702 fgetc(fnet);
703 if(!feof(fnet)) {
704 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
705 exit(-1);
706 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100707 fclose(fnet);
Marc Kupietz313fcc52016-03-16 16:43:37 +0100708 } else {
709 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
710 exit(-1);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100711 }
712
713 CreateBinaryTree();
714}
715
Marc Kupietz202723e2016-07-14 09:12:00 +0200716char *currentDateTime(char *buf, real offset) {
717 time_t t;
718 time(&t);
719 t += (long) offset;
720 struct tm tstruct;
721 tstruct = *localtime(&t);
722 strftime(buf, 80, "%c", &tstruct);
723 return buf;
724}
725
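// Progress monitor, started as an extra thread when debug_mode > 1: once per
// second it sums, over all worker threads, the bytes of the training file
// processed so far (threadPos) plus the passes already completed
// (iter - threadIters), and derives from that the finished portion, throughput,
// elapsed time, estimated time to go and the ETA printed below.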
726void *MonitorThread(void *id) {
727 char *timebuf = malloc(80);;
728 int i, n=num_threads;
729 long long sum;
730 sleep(1);
731 while(n > 0) {
732 sleep(1);
733 sum = n = 0;
734 for(i=0; i < num_threads; i++) {
735 if(threadPos[i] >= 0) {
736 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
737 n++;
738 } else {
739 sum += iter * file_size / num_threads;
740 }
741 }
742 if(n == 0)
743 break;
744 real finished_portion = (real) sum / (float) (file_size * iter);
745 long long now = clock();
746 long long elapsed = (now - start) / CLOCKS_PER_SEC / num_threads;
747 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed) * ((real) num_threads / n) ;
748
749 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/t/s TE: %llds TTG: %llds ETA: %s\033[K",
750 alpha,
751 finished_portion * 100,
752 (float) sum / elapsed / num_threads / 1000,
753 elapsed,
754 ttg,
755 currentDateTime(timebuf, ttg)
756 );
757 fflush(stdout);
758 }
759 pthread_exit(NULL);
760}
761
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100762void *TrainModelThread(void *id) {
763 long long a, b, d, cw, word, last_word, sentence_length = 0,
764 sentence_position = 0;
765 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
766 long long l1, l2, c, target, label, local_iter = iter;
767 unsigned long long next_random = (long long) id;
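	// Per-thread pseudo-random stream: a linear congruential generator seeded
	// with the thread id; the constants 25214903917 (0x5DEECE66D) and 11 are the
	// same ones used by drand48/java.util.Random.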
768 real f, g;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100769 int input_len_1 = layer1_size;
770 int window_offset = -1;
771 if (type == 2 || type == 4) {
772 input_len_1 = window_layer_size;
773 }
774 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
775 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
Marc Kupietz202723e2016-07-14 09:12:00 +0200776 threadIters[(long) id] = iter;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100777
778 int input_len_2 = 0;
779 if (type == 4) {
780 input_len_2 = window_hidden_size;
781 }
782 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
783 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
784
785 FILE *fi = fopen(train_file, "rb");
Marc Kupietz202723e2016-07-14 09:12:00 +0200786 long long start_pos = file_size / (long long) num_threads * (long long) id;
787 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
788 long long current_pos = start_pos;
 789 long long last_pos = start_pos;
790 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100791 while (1) {
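		// Every ~100k bytes of progress, fold the local word count into the global
		// word_count_actual and decay the learning rate linearly with the number of
		// words processed, from starting_alpha down to a floor of
		// starting_alpha * 0.0001 over iter * train_words words.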
Marc Kupietz202723e2016-07-14 09:12:00 +0200792 if ((current_pos - last_pos > 100000)) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100793 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200794 last_pos = current_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100795 last_word_count = word_count;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100796 alpha = starting_alpha
797 * (1 - word_count_actual / (real) (iter * train_words + 1));
798 if (alpha < starting_alpha * 0.0001)
799 alpha = starting_alpha * 0.0001;
800 }
801 if (sentence_length == 0) {
802 while (1) {
803 word = ReadWordIndex(fi);
804 if (feof(fi))
805 break;
806 if (word == -1)
807 continue;
808 word_count++;
809 if (word == 0)
810 break;
811 // The subsampling randomly discards frequent words while keeping the ranking same
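				// With relative frequency z = cn / train_words and threshold t = sample,
				// the keep probability below is p = (sqrt(z/t) + 1) * t/z. E.g. with the
				// default t = 1e-3 a word making up 1% of the corpus is kept with
				// p = (sqrt(10) + 1) * 0.1, roughly 0.42, while words rarer than t are
				// always kept.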
812 if (sample > 0) {
813 real ran = (sqrt(vocab[word].cn / (sample * train_words))
814 + 1) * (sample * train_words) / vocab[word].cn;
815 next_random = next_random * (unsigned long long) 25214903917
816 + 11;
Marc Kupietzab4e5af2016-03-22 14:24:03 +0100817 if (ran < (next_random & 0xFFFF) / (real) 65536) {
818 if(type == 3) // in structured skipgrams
819 word = -2; // keep the window position correct
820 else
821 continue;
822 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100823 }
824 sen[sentence_length] = word;
825 sentence_length++;
826 if (sentence_length >= MAX_SENTENCE_LENGTH)
827 break;
828 }
829 sentence_position = 0;
830 }
Marc Kupietz202723e2016-07-14 09:12:00 +0200831 current_pos = threadPos[(long) id] = ftell(fi);
832 if (feof(fi) || current_pos >= end_pos ) {
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100833 word_count_actual += word_count - last_word_count;
Marc Kupietz202723e2016-07-14 09:12:00 +0200834 threadIters[(long) id]--;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100835 local_iter--;
836 if (local_iter == 0)
837 break;
838 word_count = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200839 current_pos = last_pos = start_pos;
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100840 last_word_count = 0;
841 sentence_length = 0;
Marc Kupietz202723e2016-07-14 09:12:00 +0200842 fseek(fi, start_pos, SEEK_SET);
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100843 continue;
844 }
845 word = sen[sentence_position];
Peter Fankhauser66035a42016-04-20 13:29:33 +0200846 while (word == -2 && sentence_position<sentence_length)
847 word = sen[++sentence_position];
848 if (sentence_position>=sentence_length) {
849 sentence_length=0;
850 continue;
851 }
852 if (word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +0100853 continue;
854 for (c = 0; c < input_len_1; c++)
855 neu1[c] = 0;
856 for (c = 0; c < input_len_1; c++)
857 neu1e[c] = 0;
858 for (c = 0; c < input_len_2; c++)
859 neu2[c] = 0;
860 for (c = 0; c < input_len_2; c++)
861 neu2e[c] = 0;
862 next_random = next_random * (unsigned long long) 25214903917 + 11;
863 b = next_random % window;
864 if (type == 0) { //train the cbow architecture
865 // in -> hidden
866 cw = 0;
867 for (a = b; a < window * 2 + 1 - b; a++)
868 if (a != window) {
869 c = sentence_position - window + a;
870 if (c < 0)
871 continue;
872 if (c >= sentence_length)
873 continue;
874 last_word = sen[c];
875 if (last_word == -1)
876 continue;
877 for (c = 0; c < layer1_size; c++)
878 neu1[c] += syn0[c + last_word * layer1_size];
879 cw++;
880 }
881 if (cw) {
882 for (c = 0; c < layer1_size; c++)
883 neu1[c] /= cw;
884 if (hs)
885 for (d = 0; d < vocab[word].codelen; d++) {
886 f = 0;
887 l2 = vocab[word].point[d] * layer1_size;
888 // Propagate hidden -> output
889 for (c = 0; c < layer1_size; c++)
890 f += neu1[c] * syn1[c + l2];
891 if (f <= -MAX_EXP)
892 continue;
893 else if (f >= MAX_EXP)
894 continue;
895 else
896 f = expTable[(int) ((f + MAX_EXP)
897 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
898 // 'g' is the gradient multiplied by the learning rate
899 g = (1 - vocab[word].code[d] - f) * alpha;
900 // Propagate errors output -> hidden
901 for (c = 0; c < layer1_size; c++)
902 neu1e[c] += g * syn1[c + l2];
903 // Learn weights hidden -> output
904 for (c = 0; c < layer1_size; c++)
905 syn1[c + l2] += g * neu1[c];
906 if (cap == 1)
907 for (c = 0; c < layer1_size; c++)
908 capParam(syn1, c + l2);
909 }
910 // NEGATIVE SAMPLING
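				// Each of the `negative` noise words plus the true target is trained as a
				// logistic regression on the score f: g = (label - sigmoid(f)) * alpha,
				// with sigmoid looked up in expTable over [-MAX_EXP, MAX_EXP]; outside
				// that range the sigmoid saturates, giving the (label - 1) and (label - 0)
				// branches. The same scheme recurs in the other architectures below.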
911 if (negative > 0)
912 for (d = 0; d < negative + 1; d++) {
913 if (d == 0) {
914 target = word;
915 label = 1;
916 } else {
917 next_random = next_random
918 * (unsigned long long) 25214903917 + 11;
919 if (word_to_group != NULL
920 && word_to_group[word] != -1) {
921 target = word;
922 while (target == word) {
923 target = group_to_table[word_to_group[word]
924 * table_size
925 + (next_random >> 16) % table_size];
926 next_random = next_random
927 * (unsigned long long) 25214903917
928 + 11;
929 }
930 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
931 } else {
932 target =
933 table[(next_random >> 16) % table_size];
934 }
935 if (target == 0)
936 target = next_random % (vocab_size - 1) + 1;
937 if (target == word)
938 continue;
939 label = 0;
940 }
941 l2 = target * layer1_size;
942 f = 0;
943 for (c = 0; c < layer1_size; c++)
944 f += neu1[c] * syn1neg[c + l2];
945 if (f > MAX_EXP)
946 g = (label - 1) * alpha;
947 else if (f < -MAX_EXP)
948 g = (label - 0) * alpha;
949 else
950 g = (label
951 - expTable[(int) ((f + MAX_EXP)
952 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
953 * alpha;
954 for (c = 0; c < layer1_size; c++)
955 neu1e[c] += g * syn1neg[c + l2];
956 for (c = 0; c < layer1_size; c++)
957 syn1neg[c + l2] += g * neu1[c];
958 if (cap == 1)
959 for (c = 0; c < layer1_size; c++)
960 capParam(syn1neg, c + l2);
961 }
962 // Noise Contrastive Estimation
963 if (nce > 0)
964 for (d = 0; d < nce + 1; d++) {
965 if (d == 0) {
966 target = word;
967 label = 1;
968 } else {
969 next_random = next_random
970 * (unsigned long long) 25214903917 + 11;
971 if (word_to_group != NULL
972 && word_to_group[word] != -1) {
973 target = word;
974 while (target == word) {
975 target = group_to_table[word_to_group[word]
976 * table_size
977 + (next_random >> 16) % table_size];
978 next_random = next_random
979 * (unsigned long long) 25214903917
980 + 11;
981 }
982 } else {
983 target =
984 table[(next_random >> 16) % table_size];
985 }
986 if (target == 0)
987 target = next_random % (vocab_size - 1) + 1;
988 if (target == word)
989 continue;
990 label = 0;
991 }
992 l2 = target * layer1_size;
993 f = 0;
994
995 for (c = 0; c < layer1_size; c++)
996 f += neu1[c] * syn1nce[c + l2];
997 if (f > MAX_EXP)
998 g = (label - 1) * alpha;
999 else if (f < -MAX_EXP)
1000 g = (label - 0) * alpha;
1001 else {
1002 f = exp(f);
1003 g =
1004 (label
1005 - f
1006 / (noise_distribution[target]
1007 * nce + f)) * alpha;
1008 }
1009 for (c = 0; c < layer1_size; c++)
1010 neu1e[c] += g * syn1nce[c + l2];
1011 for (c = 0; c < layer1_size; c++)
1012 syn1nce[c + l2] += g * neu1[c];
1013 if (cap == 1)
1014 for (c = 0; c < layer1_size; c++)
1015 capParam(syn1nce, c + l2);
1016 }
1017 // hidden -> in
1018 for (a = b; a < window * 2 + 1 - b; a++)
1019 if (a != window) {
1020 c = sentence_position - window + a;
1021 if (c < 0)
1022 continue;
1023 if (c >= sentence_length)
1024 continue;
1025 last_word = sen[c];
1026 if (last_word == -1)
1027 continue;
1028 for (c = 0; c < layer1_size; c++)
1029 syn0[c + last_word * layer1_size] += neu1e[c];
1030 }
1031 }
1032 } else if (type == 1) { //train skip-gram
1033 for (a = b; a < window * 2 + 1 - b; a++)
1034 if (a != window) {
1035 c = sentence_position - window + a;
1036 if (c < 0)
1037 continue;
1038 if (c >= sentence_length)
1039 continue;
1040 last_word = sen[c];
1041 if (last_word == -1)
1042 continue;
1043 l1 = last_word * layer1_size;
1044 for (c = 0; c < layer1_size; c++)
1045 neu1e[c] = 0;
1046 // HIERARCHICAL SOFTMAX
1047 if (hs)
1048 for (d = 0; d < vocab[word].codelen; d++) {
1049 f = 0;
1050 l2 = vocab[word].point[d] * layer1_size;
1051 // Propagate hidden -> output
1052 for (c = 0; c < layer1_size; c++)
1053 f += syn0[c + l1] * syn1[c + l2];
1054 if (f <= -MAX_EXP)
1055 continue;
1056 else if (f >= MAX_EXP)
1057 continue;
1058 else
1059 f = expTable[(int) ((f + MAX_EXP)
1060 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1061 // 'g' is the gradient multiplied by the learning rate
1062 g = (1 - vocab[word].code[d] - f) * alpha;
1063 // Propagate errors output -> hidden
1064 for (c = 0; c < layer1_size; c++)
1065 neu1e[c] += g * syn1[c + l2];
1066 // Learn weights hidden -> output
1067 for (c = 0; c < layer1_size; c++)
1068 syn1[c + l2] += g * syn0[c + l1];
1069 if (cap == 1)
1070 for (c = 0; c < layer1_size; c++)
1071 capParam(syn1, c + l2);
1072 }
1073 // NEGATIVE SAMPLING
1074 if (negative > 0)
1075 for (d = 0; d < negative + 1; d++) {
1076 if (d == 0) {
1077 target = word;
1078 label = 1;
1079 } else {
1080 next_random = next_random
1081 * (unsigned long long) 25214903917 + 11;
1082 if (word_to_group != NULL
1083 && word_to_group[word] != -1) {
1084 target = word;
1085 while (target == word) {
1086 target =
1087 group_to_table[word_to_group[word]
1088 * table_size
1089 + (next_random >> 16)
1090 % table_size];
1091 next_random =
1092 next_random
1093 * (unsigned long long) 25214903917
1094 + 11;
1095 }
1096 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1097 } else {
1098 target = table[(next_random >> 16)
1099 % table_size];
1100 }
1101 if (target == 0)
1102 target = next_random % (vocab_size - 1) + 1;
1103 if (target == word)
1104 continue;
1105 label = 0;
1106 }
1107 l2 = target * layer1_size;
1108 f = 0;
1109 for (c = 0; c < layer1_size; c++)
1110 f += syn0[c + l1] * syn1neg[c + l2];
1111 if (f > MAX_EXP)
1112 g = (label - 1) * alpha;
1113 else if (f < -MAX_EXP)
1114 g = (label - 0) * alpha;
1115 else
1116 g =
1117 (label
1118 - expTable[(int) ((f + MAX_EXP)
1119 * (EXP_TABLE_SIZE
1120 / MAX_EXP / 2))])
1121 * alpha;
1122 for (c = 0; c < layer1_size; c++)
1123 neu1e[c] += g * syn1neg[c + l2];
1124 for (c = 0; c < layer1_size; c++)
1125 syn1neg[c + l2] += g * syn0[c + l1];
1126 if (cap == 1)
1127 for (c = 0; c < layer1_size; c++)
1128 capParam(syn1neg, c + l2);
1129 }
1130 //Noise Contrastive Estimation
1131 if (nce > 0)
1132 for (d = 0; d < nce + 1; d++) {
1133 if (d == 0) {
1134 target = word;
1135 label = 1;
1136 } else {
1137 next_random = next_random
1138 * (unsigned long long) 25214903917 + 11;
1139 if (word_to_group != NULL
1140 && word_to_group[word] != -1) {
1141 target = word;
1142 while (target == word) {
1143 target =
1144 group_to_table[word_to_group[word]
1145 * table_size
1146 + (next_random >> 16)
1147 % table_size];
1148 next_random =
1149 next_random
1150 * (unsigned long long) 25214903917
1151 + 11;
1152 }
1153 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1154 } else {
1155 target = table[(next_random >> 16)
1156 % table_size];
1157 }
1158 if (target == 0)
1159 target = next_random % (vocab_size - 1) + 1;
1160 if (target == word)
1161 continue;
1162 label = 0;
1163 }
1164 l2 = target * layer1_size;
1165 f = 0;
1166 for (c = 0; c < layer1_size; c++)
1167 f += syn0[c + l1] * syn1nce[c + l2];
1168 if (f > MAX_EXP)
1169 g = (label - 1) * alpha;
1170 else if (f < -MAX_EXP)
1171 g = (label - 0) * alpha;
1172 else {
1173 f = exp(f);
1174 g = (label
1175 - f
1176 / (noise_distribution[target]
1177 * nce + f)) * alpha;
1178 }
1179 for (c = 0; c < layer1_size; c++)
1180 neu1e[c] += g * syn1nce[c + l2];
1181 for (c = 0; c < layer1_size; c++)
1182 syn1nce[c + l2] += g * syn0[c + l1];
1183 if (cap == 1)
1184 for (c = 0; c < layer1_size; c++)
1185 capParam(syn1nce, c + l2);
1186 }
1187 // Learn weights input -> hidden
1188 for (c = 0; c < layer1_size; c++)
1189 syn0[c + l1] += neu1e[c];
1190 }
1191 } else if (type == 2) { //train the cwindow architecture
1192 // in -> hidden
1193 cw = 0;
1194 for (a = 0; a < window * 2 + 1; a++)
1195 if (a != window) {
1196 c = sentence_position - window + a;
1197 if (c < 0)
1198 continue;
1199 if (c >= sentence_length)
1200 continue;
1201 last_word = sen[c];
1202 if (last_word == -1)
1203 continue;
1204 window_offset = a * layer1_size;
1205 if (a > window)
1206 window_offset -= layer1_size;
1207 for (c = 0; c < layer1_size; c++)
1208 neu1[c + window_offset] += syn0[c
1209 + last_word * layer1_size];
1210 cw++;
1211 }
1212 if (cw) {
1213 if (hs)
1214 for (d = 0; d < vocab[word].codelen; d++) {
1215 f = 0;
1216 l2 = vocab[word].point[d] * window_layer_size;
1217 // Propagate hidden -> output
1218 for (c = 0; c < window_layer_size; c++)
1219 f += neu1[c] * syn1_window[c + l2];
1220 if (f <= -MAX_EXP)
1221 continue;
1222 else if (f >= MAX_EXP)
1223 continue;
1224 else
1225 f = expTable[(int) ((f + MAX_EXP)
1226 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1227 // 'g' is the gradient multiplied by the learning rate
1228 g = (1 - vocab[word].code[d] - f) * alpha;
1229 // Propagate errors output -> hidden
1230 for (c = 0; c < window_layer_size; c++)
1231 neu1e[c] += g * syn1_window[c + l2];
1232 // Learn weights hidden -> output
1233 for (c = 0; c < window_layer_size; c++)
1234 syn1_window[c + l2] += g * neu1[c];
1235 if (cap == 1)
1236 for (c = 0; c < window_layer_size; c++)
1237 capParam(syn1_window, c + l2);
1238 }
1239 // NEGATIVE SAMPLING
1240 if (negative > 0)
1241 for (d = 0; d < negative + 1; d++) {
1242 if (d == 0) {
1243 target = word;
1244 label = 1;
1245 } else {
1246 next_random = next_random
1247 * (unsigned long long) 25214903917 + 11;
1248 if (word_to_group != NULL
1249 && word_to_group[word] != -1) {
1250 target = word;
1251 while (target == word) {
1252 target = group_to_table[word_to_group[word]
1253 * table_size
1254 + (next_random >> 16) % table_size];
1255 next_random = next_random
1256 * (unsigned long long) 25214903917
1257 + 11;
1258 }
1259 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1260 } else {
1261 target =
1262 table[(next_random >> 16) % table_size];
1263 }
1264 if (target == 0)
1265 target = next_random % (vocab_size - 1) + 1;
1266 if (target == word)
1267 continue;
1268 label = 0;
1269 }
1270 l2 = target * window_layer_size;
1271 f = 0;
1272 for (c = 0; c < window_layer_size; c++)
1273 f += neu1[c] * syn1neg_window[c + l2];
1274 if (f > MAX_EXP)
1275 g = (label - 1) * alpha;
1276 else if (f < -MAX_EXP)
1277 g = (label - 0) * alpha;
1278 else
1279 g = (label
1280 - expTable[(int) ((f + MAX_EXP)
1281 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1282 * alpha;
1283 for (c = 0; c < window_layer_size; c++)
1284 neu1e[c] += g * syn1neg_window[c + l2];
1285 for (c = 0; c < window_layer_size; c++)
1286 syn1neg_window[c + l2] += g * neu1[c];
1287 if (cap == 1)
1288 for (c = 0; c < window_layer_size; c++)
1289 capParam(syn1neg_window, c + l2);
1290 }
1291 // Noise Contrastive Estimation
1292 if (nce > 0)
1293 for (d = 0; d < nce + 1; d++) {
1294 if (d == 0) {
1295 target = word;
1296 label = 1;
1297 } else {
1298 next_random = next_random
1299 * (unsigned long long) 25214903917 + 11;
1300 if (word_to_group != NULL
1301 && word_to_group[word] != -1) {
1302 target = word;
1303 while (target == word) {
1304 target = group_to_table[word_to_group[word]
1305 * table_size
1306 + (next_random >> 16) % table_size];
1307 next_random = next_random
1308 * (unsigned long long) 25214903917
1309 + 11;
1310 }
1311 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1312 } else {
1313 target =
1314 table[(next_random >> 16) % table_size];
1315 }
1316 if (target == 0)
1317 target = next_random % (vocab_size - 1) + 1;
1318 if (target == word)
1319 continue;
1320 label = 0;
1321 }
1322 l2 = target * window_layer_size;
1323 f = 0;
1324 for (c = 0; c < window_layer_size; c++)
1325 f += neu1[c] * syn1nce_window[c + l2];
1326 if (f > MAX_EXP)
1327 g = (label - 1) * alpha;
1328 else if (f < -MAX_EXP)
1329 g = (label - 0) * alpha;
1330 else {
1331 f = exp(f);
1332 g =
1333 (label
1334 - f
1335 / (noise_distribution[target]
1336 * nce + f)) * alpha;
1337 }
1338 for (c = 0; c < window_layer_size; c++)
1339 neu1e[c] += g * syn1nce_window[c + l2];
1340 for (c = 0; c < window_layer_size; c++)
1341 syn1nce_window[c + l2] += g * neu1[c];
1342 if (cap == 1)
1343 for (c = 0; c < window_layer_size; c++)
1344 capParam(syn1nce_window, c + l2);
1345 }
1346 // hidden -> in
1347 for (a = 0; a < window * 2 + 1; a++)
1348 if (a != window) {
1349 c = sentence_position - window + a;
1350 if (c < 0)
1351 continue;
1352 if (c >= sentence_length)
1353 continue;
1354 last_word = sen[c];
1355 if (last_word == -1)
1356 continue;
1357 window_offset = a * layer1_size;
1358 if (a > window)
1359 window_offset -= layer1_size;
1360 for (c = 0; c < layer1_size; c++)
1361 syn0[c + last_word * layer1_size] += neu1e[c
1362 + window_offset];
1363 }
1364 }
1365 } else if (type == 3) { //train structured skip-gram
1366 for (a = 0; a < window * 2 + 1; a++)
1367 if (a != window) {
1368 c = sentence_position - window + a;
1369 if (c < 0)
1370 continue;
1371 if (c >= sentence_length)
1372 continue;
1373 last_word = sen[c];
Peter Fankhauser66035a42016-04-20 13:29:33 +02001374 if (last_word < 0)
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001375 continue;
1376 l1 = last_word * layer1_size;
1377 window_offset = a * layer1_size;
1378 if (a > window)
1379 window_offset -= layer1_size;
1380 for (c = 0; c < layer1_size; c++)
1381 neu1e[c] = 0;
1382 // HIERARCHICAL SOFTMAX
1383 if (hs)
1384 for (d = 0; d < vocab[word].codelen; d++) {
1385 f = 0;
1386 l2 = vocab[word].point[d] * window_layer_size;
1387 // Propagate hidden -> output
1388 for (c = 0; c < layer1_size; c++)
1389 f += syn0[c + l1]
1390 * syn1_window[c + l2 + window_offset];
1391 if (f <= -MAX_EXP)
1392 continue;
1393 else if (f >= MAX_EXP)
1394 continue;
1395 else
1396 f = expTable[(int) ((f + MAX_EXP)
1397 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1398 // 'g' is the gradient multiplied by the learning rate
1399 g = (1 - vocab[word].code[d] - f) * alpha;
1400 // Propagate errors output -> hidden
1401 for (c = 0; c < layer1_size; c++)
1402 neu1e[c] += g
1403 * syn1_window[c + l2 + window_offset];
1404 // Learn weights hidden -> output
1405 for (c = 0; c < layer1_size; c++)
 1406 syn1_window[c + l2 + window_offset] += g
 1407 * syn0[c + l1];
 1408 if (cap == 1)
 1409 for (c = 0; c < layer1_size; c++)
 1410 capParam(syn1_window, c + l2 + window_offset);
1411 }
1412 // NEGATIVE SAMPLING
1413 if (negative > 0)
1414 for (d = 0; d < negative + 1; d++) {
1415 if (d == 0) {
1416 target = word;
1417 label = 1;
1418 } else {
1419 next_random = next_random
1420 * (unsigned long long) 25214903917 + 11;
1421 if (word_to_group != NULL
1422 && word_to_group[word] != -1) {
1423 target = word;
1424 while (target == word) {
1425 target =
1426 group_to_table[word_to_group[word]
1427 * table_size
1428 + (next_random >> 16)
1429 % table_size];
1430 next_random =
1431 next_random
1432 * (unsigned long long) 25214903917
1433 + 11;
1434 }
1435 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1436 } else {
1437 target = table[(next_random >> 16)
1438 % table_size];
1439 }
1440 if (target == 0)
1441 target = next_random % (vocab_size - 1) + 1;
1442 if (target == word)
1443 continue;
1444 label = 0;
1445 }
1446 l2 = target * window_layer_size;
1447 f = 0;
1448 for (c = 0; c < layer1_size; c++)
1449 f +=
1450 syn0[c + l1]
1451 * syn1neg_window[c + l2
1452 + window_offset];
1453 if (f > MAX_EXP)
1454 g = (label - 1) * alpha;
1455 else if (f < -MAX_EXP)
1456 g = (label - 0) * alpha;
1457 else
1458 g =
1459 (label
1460 - expTable[(int) ((f + MAX_EXP)
1461 * (EXP_TABLE_SIZE
1462 / MAX_EXP / 2))])
1463 * alpha;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001464 if(debug_mode > 2 && ((long long) id) == 0) {
1465 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1466 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1467 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001468 for (c = 0; c < layer1_size; c++)
1469 neu1e[c] +=
1470 g
1471 * syn1neg_window[c + l2
1472 + window_offset];
1473 for (c = 0; c < layer1_size; c++)
1474 syn1neg_window[c + l2 + window_offset] += g
1475 * syn0[c + l1];
1476 if (cap == 1)
1477 for (c = 0; c < layer1_size; c++)
1478 capParam(syn1neg_window,
1479 c + l2 + window_offset);
1480 }
 1481 // Noise Contrastive Estimation
1482 if (nce > 0)
1483 for (d = 0; d < nce + 1; d++) {
1484 if (d == 0) {
1485 target = word;
1486 label = 1;
1487 } else {
1488 next_random = next_random
1489 * (unsigned long long) 25214903917 + 11;
1490 if (word_to_group != NULL
1491 && word_to_group[word] != -1) {
1492 target = word;
1493 while (target == word) {
1494 target =
1495 group_to_table[word_to_group[word]
1496 * table_size
1497 + (next_random >> 16)
1498 % table_size];
1499 next_random =
1500 next_random
1501 * (unsigned long long) 25214903917
1502 + 11;
1503 }
1504 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1505 } else {
1506 target = table[(next_random >> 16)
1507 % table_size];
1508 }
1509 if (target == 0)
1510 target = next_random % (vocab_size - 1) + 1;
1511 if (target == word)
1512 continue;
1513 label = 0;
1514 }
1515 l2 = target * window_layer_size;
1516 f = 0;
1517 for (c = 0; c < layer1_size; c++)
1518 f +=
1519 syn0[c + l1]
1520 * syn1nce_window[c + l2
1521 + window_offset];
1522 if (f > MAX_EXP)
1523 g = (label - 1) * alpha;
1524 else if (f < -MAX_EXP)
1525 g = (label - 0) * alpha;
1526 else {
1527 f = exp(f);
1528 g = (label
1529 - f
1530 / (noise_distribution[target]
1531 * nce + f)) * alpha;
1532 }
1533 for (c = 0; c < layer1_size; c++)
1534 neu1e[c] +=
1535 g
1536 * syn1nce_window[c + l2
1537 + window_offset];
1538 for (c = 0; c < layer1_size; c++)
1539 syn1nce_window[c + l2 + window_offset] += g
1540 * syn0[c + l1];
1541 if (cap == 1)
1542 for (c = 0; c < layer1_size; c++)
1543 capParam(syn1nce_window,
1544 c + l2 + window_offset);
1545 }
1546 // Learn weights input -> hidden
1547 for (c = 0; c < layer1_size; c++) {
1548 syn0[c + l1] += neu1e[c];
1549 if (syn0[c + l1] > 50)
1550 syn0[c + l1] = 50;
1551 if (syn0[c + l1] < -50)
1552 syn0[c + l1] = -50;
1553 }
1554 }
1555 } else if (type == 4) { //training senna
1556 // in -> hidden
1557 cw = 0;
1558 for (a = 0; a < window * 2 + 1; a++)
1559 if (a != window) {
1560 c = sentence_position - window + a;
1561 if (c < 0)
1562 continue;
1563 if (c >= sentence_length)
1564 continue;
1565 last_word = sen[c];
1566 if (last_word == -1)
1567 continue;
1568 window_offset = a * layer1_size;
1569 if (a > window)
1570 window_offset -= layer1_size;
1571 for (c = 0; c < layer1_size; c++)
1572 neu1[c + window_offset] += syn0[c
1573 + last_word * layer1_size];
1574 cw++;
1575 }
1576 if (cw) {
1577 for (a = 0; a < window_hidden_size; a++) {
1578 c = a * window_layer_size;
1579 for (b = 0; b < window_layer_size; b++) {
1580 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1581 }
1582 }
1583 if (hs)
1584 for (d = 0; d < vocab[word].codelen; d++) {
1585 f = 0;
1586 l2 = vocab[word].point[d] * window_hidden_size;
1587 // Propagate hidden -> output
1588 for (c = 0; c < window_hidden_size; c++)
1589 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1590 if (f <= -MAX_EXP)
1591 continue;
1592 else if (f >= MAX_EXP)
1593 continue;
1594 else
1595 f = expTable[(int) ((f + MAX_EXP)
1596 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1597 // 'g' is the gradient multiplied by the learning rate
1598 g = (1 - vocab[word].code[d] - f) * alpha;
1599 // Propagate errors output -> hidden
1600 for (c = 0; c < window_hidden_size; c++)
1601 neu2e[c] += dHardTanh(neu2[c], g) * g
1602 * syn_hidden_word[c + l2];
1603 // Learn weights hidden -> output
1604 for (c = 0; c < window_hidden_size; c++)
1605 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1606 * neu2[c];
1607 }
1608 // NEGATIVE SAMPLING
1609 if (negative > 0)
1610 for (d = 0; d < negative + 1; d++) {
1611 if (d == 0) {
1612 target = word;
1613 label = 1;
1614 } else {
1615 next_random = next_random
1616 * (unsigned long long) 25214903917 + 11;
1617 if (word_to_group != NULL
1618 && word_to_group[word] != -1) {
1619 target = word;
1620 while (target == word) {
1621 target = group_to_table[word_to_group[word]
1622 * table_size
1623 + (next_random >> 16) % table_size];
1624 next_random = next_random
1625 * (unsigned long long) 25214903917
1626 + 11;
1627 }
1628 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1629 } else {
1630 target =
1631 table[(next_random >> 16) % table_size];
1632 }
1633 if (target == 0)
1634 target = next_random % (vocab_size - 1) + 1;
1635 if (target == word)
1636 continue;
1637 label = 0;
1638 }
1639 l2 = target * window_hidden_size;
1640 f = 0;
1641 for (c = 0; c < window_hidden_size; c++)
1642 f += hardTanh(neu2[c])
1643 * syn_hidden_word_neg[c + l2];
1644 if (f > MAX_EXP)
1645 g = (label - 1) * alpha / negative;
1646 else if (f < -MAX_EXP)
1647 g = (label - 0) * alpha / negative;
1648 else
1649 g = (label
1650 - expTable[(int) ((f + MAX_EXP)
1651 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1652 * alpha / negative;
1653 for (c = 0; c < window_hidden_size; c++)
1654 neu2e[c] += dHardTanh(neu2[c], g) * g
1655 * syn_hidden_word_neg[c + l2];
1656 for (c = 0; c < window_hidden_size; c++)
1657 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1658 * g * neu2[c];
1659 }
1660 for (a = 0; a < window_hidden_size; a++)
1661 for (b = 0; b < window_layer_size; b++)
1662 neu1e[b] += neu2e[a]
1663 * syn_window_hidden[a * window_layer_size + b];
1664 for (a = 0; a < window_hidden_size; a++)
1665 for (b = 0; b < window_layer_size; b++)
1666 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1667 * neu1[b];
1668 // hidden -> in
1669 for (a = 0; a < window * 2 + 1; a++)
1670 if (a != window) {
1671 c = sentence_position - window + a;
1672 if (c < 0)
1673 continue;
1674 if (c >= sentence_length)
1675 continue;
1676 last_word = sen[c];
1677 if (last_word == -1)
1678 continue;
1679 window_offset = a * layer1_size;
1680 if (a > window)
1681 window_offset -= layer1_size;
1682 for (c = 0; c < layer1_size; c++)
1683 syn0[c + last_word * layer1_size] += neu1e[c
1684 + window_offset];
1685 }
1686 }
1687 } else {
1688 printf("unknown type %i", type);
1689 exit(0);
1690 }
1691 sentence_position++;
1692 if (sentence_position >= sentence_length) {
1693 sentence_length = 0;
1694 continue;
1695 }
1696 }
1697 fclose(fi);
1698 free(neu1);
1699 free(neu1e);
Marc Kupietz202723e2016-07-14 09:12:00 +02001700 threadPos[(long) id] = -1;
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001701 pthread_exit(NULL);
1702}
1703
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001704void ShowCollocations() {
Marc Kupietz71996e72016-03-18 13:40:24 +01001705 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001706 real f, max_f, maxmax_f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001707 real *target_sums, bestf[MAX_CC], worstbest;
1708 long besti[MAX_CC];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001709 int N = 10, bestp[MAX_CC];
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001710 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1711
1712 for (d = cc; d < vocab_size; d++) {
1713 for (b = 0; b < vocab_size; b++)
1714 target_sums[b]=0;
Marc Kupietz71996e72016-03-18 13:40:24 +01001715 for (b = 0; b < N; b++)
1716 bestf[b]=-1;
1717 worstbest = -1;
1718
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001719 maxmax_f = -1;
1720 maxmax_target = 0;
Marc Kupietz0a664c12016-03-18 13:18:22 +01001721 for (a = window * 2; a >= 0; a--) {
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001722 if (a != window) {
1723 max_f = -1;
1724 window_offset = a * layer1_size;
1725 if (a > window)
1726 window_offset -= layer1_size;
1727 for(target = 0; target < vocab_size; target ++) {
1728 if(target == d)
1729 continue;
1730 f = 0;
1731 for (c = 0; c < layer1_size; c++)
1732 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1733 if (f < -MAX_EXP)
1734 continue;
1735 else if (f > MAX_EXP)
1736 continue;
1737 else
1738 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1739 if(f > max_f) {
1740 max_f = f;
1741 max_target = target;
1742 }
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001743 target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz71996e72016-03-18 13:40:24 +01001744 if(f > worstbest) {
1745 for (b = 0; b < N; b++) {
1746 if (f > bestf[b]) {
1747 for (e = N - 1; e > b; e--) {
1748 bestf[e] = bestf[e - 1];
1749 besti[e] = besti[e - 1];
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001750 bestp[e] = bestp[e - 1];
Marc Kupietz71996e72016-03-18 13:40:24 +01001751 }
1752 bestf[b] = f;
1753 besti[b] = target;
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001754 bestp[b] = window-a;
Marc Kupietz71996e72016-03-18 13:40:24 +01001755 break;
1756 }
1757 }
1758 worstbest = bestf[N-1];
1759 }
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001760 }
1761 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1762 if(max_f > maxmax_f) {
1763 maxmax_f = max_f;
1764 maxmax_target = max_target;
1765 }
1766 } else {
1767 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1768 }
1769 }
1770 max_f = -1;
1771 for (b = 0; b < vocab_size; b++) {
1772 if(target_sums[b] > max_f) {
1773 max_f = target_sums[b];
1774 max_target = b;
1775 }
1776 }
1777 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz0fb5d612016-03-18 11:01:21 +01001778 vocab[max_target].word, max_f,
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001779 vocab[maxmax_target].word, maxmax_f);
Marc Kupietz71996e72016-03-18 13:40:24 +01001780 for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz79fd83d2016-03-18 14:09:07 +01001781 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz71996e72016-03-18 13:40:24 +01001782 printf("\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001783 }
1784}
1785
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001786void TrainModel() {
1787 long a, b, c, d;
1788 FILE *fo;
1789 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
Marc Kupietz202723e2016-07-14 09:12:00 +02001790 threadPos = malloc(num_threads * sizeof(long long));
1791 threadIters = malloc(num_threads * sizeof(int));
1792 char *timebuf = malloc(80);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001793 printf("Starting training using file %s\n", train_file);
1794 starting_alpha = alpha;
1795 if (read_vocab_file[0] != 0)
1796 ReadVocab();
1797 else
1798 LearnVocabFromTrainFile();
1799 if (save_vocab_file[0] != 0)
1800 SaveVocab();
1801 if (output_file[0] == 0)
1802 return;
1803 InitNet();
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001804 if(cc > 0)
1805 ShowCollocations();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001806 if (negative > 0 || nce > 0)
1807 InitUnigramTable();
1808 if (negative_classes_file[0] != 0)
1809 InitClassUnigramTable();
1810 start = clock();
1811 for (a = 0; a < num_threads; a++)
1812 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
Marc Kupietz202723e2016-07-14 09:12:00 +02001813 if(debug_mode > 1)
1814 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001815 for (a = 0; a < num_threads; a++)
1816 pthread_join(pt[a], NULL);
Marc Kupietz202723e2016-07-14 09:12:00 +02001817 if(debug_mode > 1) {
1818 pthread_join(pt[num_threads], NULL);
1819 clock_t now = clock();
1820		printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now-start) / CLOCKS_PER_SEC, (now-start) / CLOCKS_PER_SEC / num_threads); // clock() sums CPU time over all threads; dividing by num_threads approximates wall-clock time
1821 printf("Saving vectors to %s ...", output_file);
1822 fflush(stdout);
1823 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001824 fo = fopen(output_file, "wb");
1825 if (classes == 0) {
1826 // Save the word vectors
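		// Format: header line "<vocab_size> <layer1_size>", then one line per word;
		// with -binary 1 the vector components are written as raw floats, otherwise as text.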
1827 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1828 for (a = 0; a < vocab_size; a++) {
1829 fprintf(fo, "%s ", vocab[a].word);
1830 if (binary)
1831 for (b = 0; b < layer1_size; b++)
1832 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1833 else
1834 for (b = 0; b < layer1_size; b++)
1835 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1836 fprintf(fo, "\n");
1837 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001838 if(debug_mode > 1)
1839 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001840 } else {
1841 // Run K-means on the word vectors
1842 int clcn = classes, iter = 10, closeid;
1843 int *centcn = (int *) malloc(classes * sizeof(int));
1844 int *cl = (int *) calloc(vocab_size, sizeof(int));
1845 real closev, x;
1846 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1847 for (a = 0; a < vocab_size; a++)
1848 cl[a] = a % clcn;
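		// Each iteration: accumulate member vectors into their centroid, L2-normalise the
		// centroids, then reassign every word to the centroid with the largest dot product.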
1849 for (a = 0; a < iter; a++) {
1850 for (b = 0; b < clcn * layer1_size; b++)
1851 cent[b] = 0;
1852 for (b = 0; b < clcn; b++)
1853 centcn[b] = 1;
1854 for (c = 0; c < vocab_size; c++) {
1855 for (d = 0; d < layer1_size; d++)
1856 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1857 centcn[cl[c]]++;
1858 }
1859 for (b = 0; b < clcn; b++) {
1860 closev = 0;
1861 for (c = 0; c < layer1_size; c++) {
1862 cent[layer1_size * b + c] /= centcn[b];
1863 closev += cent[layer1_size * b + c]
1864 * cent[layer1_size * b + c];
1865 }
1866 closev = sqrt(closev);
1867 for (c = 0; c < layer1_size; c++)
1868 cent[layer1_size * b + c] /= closev;
1869 }
1870 for (c = 0; c < vocab_size; c++) {
1871 closev = -10;
1872 closeid = 0;
1873 for (d = 0; d < clcn; d++) {
1874 x = 0;
1875 for (b = 0; b < layer1_size; b++)
1876 x += cent[layer1_size * d + b]
1877 * syn0[c * layer1_size + b];
1878 if (x > closev) {
1879 closev = x;
1880 closeid = d;
1881 }
1882 }
1883 cl[c] = closeid;
1884 }
1885 }
1886 // Save the K-means classes
1887 for (a = 0; a < vocab_size; a++)
1888 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1889 free(centcn);
1890 free(cent);
1891 free(cl);
1892 }
1893 fclose(fo);
1894 if (save_net_file[0] != 0)
1895 SaveNet();
1896}
1897
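/* ArgPos returns the index of <str> in argv, or -1 if the flag is absent; it exits if the
 * flag is the last argument and therefore has no value. Typical use (see main() below):
 *
 *   if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
 *       layer1_size = atoi(argv[i + 1]);
 */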
1898int ArgPos(char *str, int argc, char **argv) {
1899 int a;
1900 for (a = 1; a < argc; a++)
1901 if (!strcmp(str, argv[a])) {
1902 if (a == argc - 1) {
1903 printf("Argument missing for %s\n", str);
1904 exit(1);
1905 }
1906 return a;
1907 }
1908 return -1;
1909}
1910
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001911void print_help() {
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001912 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1913 printf("Options:\n");
1914 printf("Parameters for training:\n");
1915 printf("\t-train <file>\n");
1916 printf("\t\tUse text data from <file> to train the model\n");
1917 printf("\t-output <file>\n");
1918 printf(
1919 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1920 printf("\t-size <int>\n");
1921 printf("\t\tSet size of word vectors; default is 100\n");
1922 printf("\t-window <int>\n");
1923 printf("\t\tSet max skip length between words; default is 5\n");
1924 printf("\t-sample <float>\n");
1925 printf(
1926 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1927 printf(
1928 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1929 printf("\t-hs <int>\n");
1930 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1931 printf("\t-negative <int>\n");
1932 printf(
1933 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1934 printf("\t-negative-classes <file>\n");
1935 printf("\t\tNegative classes to sample from\n");
1936 printf("\t-nce <int>\n");
1937 printf(
1938 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1939 printf("\t-threads <int>\n");
1940 printf("\t\tUse <int> threads (default 12)\n");
1941 printf("\t-iter <int>\n");
1942 printf("\t\tRun more training iterations (default 5)\n");
1943 printf("\t-min-count <int>\n");
1944 printf(
1945 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1946 printf("\t-alpha <float>\n");
1947 printf(
1948 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1949 printf("\t-classes <int>\n");
1950 printf(
1951 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1952 printf("\t-debug <int>\n");
1953 printf(
1954 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1955 printf("\t-binary <int>\n");
1956 printf(
1957			"\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1958 printf("\t-save-vocab <file>\n");
1959 printf("\t\tThe vocabulary will be saved to <file>\n");
1960 printf("\t-read-vocab <file>\n");
1961 printf(
1962 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1963 printf("\t-read-net <file>\n");
1964 printf(
1965 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1966 printf("\t-save-net <file>\n");
1967 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001968 printf("\t-show-cc <int>\n");
1969 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001970 printf("\t-type <int>\n");
1971 printf(
1972			"\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
1973 printf("\t-cap <int>\n");
1974 printf(
1975			"\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1976 printf("\nExamples:\n");
1977 printf(
1978 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001979}
1980
1981int main(int argc, char **argv) {
1982 int i;
1983 setlocale(LC_ALL, "");
1984 if (argc == 1) {
1985 print_help();
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001986 return 0;
1987 }
1988 output_file[0] = 0;
1989 save_vocab_file[0] = 0;
1990 read_vocab_file[0] = 0;
1991 save_net_file[0] = 0;
1992 read_net_file[0] = 0;
1993 negative_classes_file[0] = 0;
Marc Kupietzc7f773b2017-12-02 12:04:03 +01001994 if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
1995 print_help();
1996 return(0);
1997 }
1998 if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
1999 print_help();
2000 return(0);
2001 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002002 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
2003 layer1_size = atoi(argv[i + 1]);
2004 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
2005 strcpy(train_file, argv[i + 1]);
2006 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
2007 strcpy(save_vocab_file, argv[i + 1]);
2008 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2009 strcpy(read_vocab_file, argv[i + 1]);
2010 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2011 strcpy(save_net_file, argv[i + 1]);
2012 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2013 strcpy(read_net_file, argv[i + 1]);
2014 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2015 debug_mode = atoi(argv[i + 1]);
2016 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2017 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002018 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2019 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002020 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2021 type = atoi(argv[i + 1]);
2022 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2023 strcpy(output_file, argv[i + 1]);
2024 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2025 window = atoi(argv[i + 1]);
2026 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2027 sample = atof(argv[i + 1]);
2028 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2029 hs = atoi(argv[i + 1]);
2030 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2031 negative = atoi(argv[i + 1]);
2032 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2033 strcpy(negative_classes_file, argv[i + 1]);
2034 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2035 nce = atoi(argv[i + 1]);
2036 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2037 num_threads = atoi(argv[i + 1]);
2038 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2039 iter = atoi(argv[i + 1]);
2040 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2041 min_count = atoi(argv[i + 1]);
2042 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2043 classes = atoi(argv[i + 1]);
2044 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2045 cap = atoi(argv[i + 1]);
2046 if (type == 0 || type == 2 || type == 4)
2047 alpha = 0.05;
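	// CBOW-style types (0, 2, 4) default to the larger learning rate 0.05;
	// an explicit -alpha (parsed next) overrides this.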
2048 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2049 alpha = atof(argv[i + 1]);
2050 vocab = (struct vocab_word *) calloc(vocab_max_size,
2051 sizeof(struct vocab_word));
2052 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2053 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
2054 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2055 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2056		expTable[i] = expTable[i] / (expTable[i] + 1); // i.e. the sigmoid e^x / (e^x + 1)
2057 }
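	/* Lookup sketch (the pattern used throughout this file):
	 *   if (f > -MAX_EXP && f < MAX_EXP)
	 *       f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
	 * maps f from [-MAX_EXP, MAX_EXP] to a table index, yielding an approximation of sigmoid(f).
	 */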
Marc Kupietz210b9d52016-04-02 21:48:13 +02002058 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002059 TrainModel();
2060 return 0;
2061}
2062