Blame - word2vecExt.c - ids-kl/dereko2vec

blob: 6f2851d2826ff5d5254c2e7227e2ecef2a9d4cfd [file] [log] [blame]

Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1	// Copyright 2013 Google Inc. All Rights Reserved.
				2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
Marc Kupietz	e23c540	2016-07-14 11:10:09 +0200	[diff] [blame]	15	#include <locale.h>
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	16	#include <stdio.h>
				17	#include <stdlib.h>
				18	#include <string.h>
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	19	#include <unistd.h>
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	20	#include <math.h>
				21	#include <pthread.h>
				22
				23	#define MAX_STRING 100
				24	#define EXP_TABLE_SIZE 1000
				25	#define MAX_EXP 6
				26	#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	27	#define MAX_CC 100
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	28	#define MAX_CODE_LENGTH 40
				29
				30	const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
				31
				32	typedef float real; // Precision of float numbers
				33
				34	struct vocab_word {
				35	long long cn;
				36	int *point;
				37	char word, code, codelen;
				38	};
				39
				40	char train_file[MAX_STRING], output_file[MAX_STRING];
				41	char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
				42	char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
Marc Kupietz	e423f73	2017-12-22 17:57:03 +0100	[diff] [blame]	43	char magic_stop_file[MAX_STRING];
				44
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	45	struct vocab_word *vocab;
				46	int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	47	num_threads = 12, min_reduce = 1;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	48	int *vocab_hash;
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	49	long long *threadPos;
				50	int *threadIters;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	51	long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
				52	long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
				53	classes = 0;
				54	real alpha = 0.025, starting_alpha, sample = 1e-3;
				55	real syn0, syn1, syn1neg, syn1nce, *expTable;
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	56	real avgWordLength=0;
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame^]	57	clock_t start, start_clock;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	58
				59	real syn1_window, syn1neg_window, *syn1nce_window;
				60	int w_offset, window_layer_size;
				61
				62	int window_hidden_size = 500;
				63	real syn_window_hidden, syn_hidden_word, *syn_hidden_word_neg,
				64	*syn_hidden_word_nce;
				65
				66	int hs = 0, negative = 5;
				67	const int table_size = 1e8;
				68	int *table;
				69
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	70	long cc = 0;
				71
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	72	//constrastive negative sampling
				73	char negative_classes_file[MAX_STRING];
				74	int *word_to_group;
				75	int group_to_table; //group_sizetable_size
				76	int class_number;
				77
				78	//nce
				79	real* noise_distribution;
				80	int nce = 0;
				81
				82	//param caps
				83	real CAP_VALUE = 50;
				84	int cap = 0;
				85
				86	void capParam(real* array, int index) {
				87	if (array[index] > CAP_VALUE)
				88	array[index] = CAP_VALUE;
				89	else if (array[index] < -CAP_VALUE)
				90	array[index] = -CAP_VALUE;
				91	}
				92
				93	real hardTanh(real x) {
				94	if (x >= 1) {
				95	return 1;
				96	} else if (x <= -1) {
				97	return -1;
				98	} else {
				99	return x;
				100	}
				101	}
				102
				103	real dHardTanh(real x, real g) {
				104	if (x > 1 && g > 0) {
				105	return 0;
				106	}
				107	if (x < -1 && g < 0) {
				108	return 0;
				109	}
				110	return 1;
				111	}
				112
				113	void InitUnigramTable() {
				114	int a, i;
				115	long long train_words_pow = 0;
				116	real d1, power = 0.75;
				117	table = (int ) malloc(table_size sizeof(int));
				118	for (a = 0; a < vocab_size; a++)
				119	train_words_pow += pow(vocab[a].cn, power);
				120	i = 0;
				121	d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
				122	for (a = 0; a < table_size; a++) {
				123	table[a] = i;
				124	if (a / (real) table_size > d1) {
				125	i++;
				126	d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
				127	}
				128	if (i >= vocab_size)
				129	i = vocab_size - 1;
				130	}
				131
				132	noise_distribution = (real *) calloc(vocab_size, sizeof(real));
				133	for (a = 0; a < vocab_size; a++)
				134	noise_distribution[a] = pow(vocab[a].cn, power)
				135	/ (real) train_words_pow;
				136	}
				137
				138	// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
				139	void ReadWord(char word, FILE fin) {
				140	int a = 0, ch;
				141	while (!feof(fin)) {
				142	ch = fgetc(fin);
				143	if (ch == 13)
				144	continue;
				145	if ((ch == ' ') \|\| (ch == '\t') \|\| (ch == '\n')) {
				146	if (a > 0) {
				147	if (ch == '\n')
				148	ungetc(ch, fin);
				149	break;
				150	}
				151	if (ch == '\n') {
				152	strcpy(word, (char *) "</s>");
				153	return;
				154	} else
				155	continue;
				156	}
				157	word[a] = ch;
				158	a++;
				159	if (a >= MAX_STRING - 1)
				160	a--; // Truncate too long words
				161	}
				162	word[a] = 0;
				163	}
				164
				165	// Returns hash value of a word
				166	int GetWordHash(char *word) {
				167	unsigned long long a, hash = 0;
				168	for (a = 0; a < strlen(word); a++)
				169	hash = hash * 257 + word[a];
				170	hash = hash % vocab_hash_size;
				171	return hash;
				172	}
				173
				174	// Returns position of a word in the vocabulary; if the word is not found, returns -1
				175	int SearchVocab(char *word) {
				176	unsigned int hash = GetWordHash(word);
				177	while (1) {
				178	if (vocab_hash[hash] == -1)
				179	return -1;
				180	if (!strcmp(word, vocab[vocab_hash[hash]].word))
				181	return vocab_hash[hash];
				182	hash = (hash + 1) % vocab_hash_size;
				183	}
				184	return -1;
				185	}
				186
				187	// Reads a word and returns its index in the vocabulary
				188	int ReadWordIndex(FILE *fin) {
				189	char word[MAX_STRING];
				190	ReadWord(word, fin);
				191	if (feof(fin))
				192	return -1;
				193	return SearchVocab(word);
				194	}
				195
				196	// Adds a word to the vocabulary
				197	int AddWordToVocab(char *word) {
				198	unsigned int hash, length = strlen(word) + 1;
				199	if (length > MAX_STRING)
				200	length = MAX_STRING;
				201	vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
				202	strcpy(vocab[vocab_size].word, word);
				203	vocab[vocab_size].cn = 0;
				204	vocab_size++;
				205	// Reallocate memory if needed
				206	if (vocab_size + 2 >= vocab_max_size) {
				207	vocab_max_size += 1000;
				208	vocab = (struct vocab_word *) realloc(vocab,
				209	vocab_max_size * sizeof(struct vocab_word));
				210	}
				211	hash = GetWordHash(word);
				212	while (vocab_hash[hash] != -1)
				213	hash = (hash + 1) % vocab_hash_size;
				214	vocab_hash[hash] = vocab_size - 1;
				215	return vocab_size - 1;
				216	}
				217
				218	// Used later for sorting by word counts
				219	int VocabCompare(const void a, const void b) {
				220	return ((struct vocab_word ) b)->cn - ((struct vocab_word ) a)->cn;
				221	}
				222
				223	// Sorts the vocabulary by frequency using word counts
				224	void SortVocab() {
				225	int a, size;
				226	unsigned int hash;
				227	// Sort the vocabulary and keep </s> at the first position
				228	qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
				229	for (a = 0; a < vocab_hash_size; a++)
				230	vocab_hash[a] = -1;
				231	size = vocab_size;
				232	train_words = 0;
				233	for (a = 0; a < size; a++) {
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	234	avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	235	// Words occuring less than min_count times will be discarded from the vocab
				236	if ((vocab[a].cn < min_count) && (a != 0)) {
				237	vocab_size--;
				238	free(vocab[a].word);
				239	} else {
				240	// Hash will be re-computed, as after the sorting it is not actual
				241	hash = GetWordHash(vocab[a].word);
				242	while (vocab_hash[hash] != -1)
				243	hash = (hash + 1) % vocab_hash_size;
				244	vocab_hash[hash] = a;
				245	train_words += vocab[a].cn;
				246	}
				247	}
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	248	avgWordLength /= train_words;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	249	vocab = (struct vocab_word *) realloc(vocab,
				250	(vocab_size + 1) * sizeof(struct vocab_word));
				251	// Allocate memory for the binary tree construction
				252	for (a = 0; a < vocab_size; a++) {
				253	vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
				254	vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
				255	}
				256	}
				257
				258	// Reduces the vocabulary by removing infrequent tokens
				259	void ReduceVocab() {
				260	int a, b = 0;
				261	unsigned int hash;
				262	for (a = 0; a < vocab_size; a++)
				263	if (vocab[a].cn > min_reduce) {
				264	vocab[b].cn = vocab[a].cn;
				265	vocab[b].word = vocab[a].word;
				266	b++;
				267	} else
				268	free(vocab[a].word);
				269	vocab_size = b;
				270	for (a = 0; a < vocab_hash_size; a++)
				271	vocab_hash[a] = -1;
				272	for (a = 0; a < vocab_size; a++) {
				273	// Hash will be re-computed, as it is not actual
				274	hash = GetWordHash(vocab[a].word);
				275	while (vocab_hash[hash] != -1)
				276	hash = (hash + 1) % vocab_hash_size;
				277	vocab_hash[hash] = a;
				278	}
				279	fflush(stdout);
				280	min_reduce++;
				281	}
				282
				283	// Create binary Huffman tree using the word counts
				284	// Frequent words will have short uniqe binary codes
				285	void CreateBinaryTree() {
				286	long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
				287	char code[MAX_CODE_LENGTH];
				288	long long count = (long long ) calloc(vocab_size * 2 + 1,
				289	sizeof(long long));
				290	long long binary = (long long ) calloc(vocab_size * 2 + 1,
				291	sizeof(long long));
				292	long long parent_node = (long long ) calloc(vocab_size * 2 + 1,
				293	sizeof(long long));
				294	for (a = 0; a < vocab_size; a++)
				295	count[a] = vocab[a].cn;
				296	for (a = vocab_size; a < vocab_size * 2; a++)
				297	count[a] = 1e15;
				298	pos1 = vocab_size - 1;
				299	pos2 = vocab_size;
				300	// Following algorithm constructs the Huffman tree by adding one node at a time
				301	for (a = 0; a < vocab_size - 1; a++) {
				302	// First, find two smallest nodes 'min1, min2'
				303	if (pos1 >= 0) {
				304	if (count[pos1] < count[pos2]) {
				305	min1i = pos1;
				306	pos1--;
				307	} else {
				308	min1i = pos2;
				309	pos2++;
				310	}
				311	} else {
				312	min1i = pos2;
				313	pos2++;
				314	}
				315	if (pos1 >= 0) {
				316	if (count[pos1] < count[pos2]) {
				317	min2i = pos1;
				318	pos1--;
				319	} else {
				320	min2i = pos2;
				321	pos2++;
				322	}
				323	} else {
				324	min2i = pos2;
				325	pos2++;
				326	}
				327	count[vocab_size + a] = count[min1i] + count[min2i];
				328	parent_node[min1i] = vocab_size + a;
				329	parent_node[min2i] = vocab_size + a;
				330	binary[min2i] = 1;
				331	}
				332	// Now assign binary code to each vocabulary word
				333	for (a = 0; a < vocab_size; a++) {
				334	b = a;
				335	i = 0;
				336	while (1) {
				337	code[i] = binary[b];
				338	point[i] = b;
				339	i++;
				340	b = parent_node[b];
				341	if (b == vocab_size * 2 - 2)
				342	break;
				343	}
				344	vocab[a].codelen = i;
				345	vocab[a].point[0] = vocab_size - 2;
				346	for (b = 0; b < i; b++) {
				347	vocab[a].code[i - b - 1] = code[b];
				348	vocab[a].point[i - b] = point[b] - vocab_size;
				349	}
				350	}
				351	free(count);
				352	free(binary);
				353	free(parent_node);
				354	}
				355
				356	void LearnVocabFromTrainFile() {
				357	char word[MAX_STRING];
				358	FILE *fin;
				359	long long a, i;
				360	for (a = 0; a < vocab_hash_size; a++)
				361	vocab_hash[a] = -1;
				362	fin = fopen(train_file, "rb");
				363	if (fin == NULL) {
				364	printf("ERROR: training data file not found!\n");
				365	exit(1);
				366	}
				367	vocab_size = 0;
				368	AddWordToVocab((char *) "</s>");
				369	while (1) {
				370	ReadWord(word, fin);
				371	if (feof(fin))
				372	break;
				373	train_words++;
				374	if ((debug_mode > 1) && (train_words % 100000 == 0)) {
				375	printf("%lldK%c", train_words / 1000, 13);
				376	fflush(stdout);
				377	}
				378	i = SearchVocab(word);
				379	if (i == -1) {
				380	a = AddWordToVocab(word);
				381	vocab[a].cn = 1;
				382	} else
				383	vocab[i].cn++;
				384	if (vocab_size > vocab_hash_size * 0.7)
				385	ReduceVocab();
				386	}
				387	SortVocab();
				388	if (debug_mode > 0) {
Marc Kupietz	e23c540	2016-07-14 11:10:09 +0200	[diff] [blame]	389	printf("Vocab size: %'lld\n", vocab_size);
				390	printf("Words in train file: %'lld\n", train_words);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	391	}
				392	file_size = ftell(fin);
				393	fclose(fin);
				394	}
				395
				396	void SaveVocab() {
				397	long long i;
				398	FILE *fo = fopen(save_vocab_file, "wb");
				399	for (i = 0; i < vocab_size; i++)
				400	fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
				401	fclose(fo);
				402	}
				403
				404	void ReadVocab() {
				405	long long a, i = 0;
				406	char c;
				407	char word[MAX_STRING];
				408	FILE *fin = fopen(read_vocab_file, "rb");
				409	if (fin == NULL) {
				410	printf("Vocabulary file not found\n");
				411	exit(1);
				412	}
				413	for (a = 0; a < vocab_hash_size; a++)
				414	vocab_hash[a] = -1;
				415	vocab_size = 0;
				416	while (1) {
				417	ReadWord(word, fin);
				418	if (feof(fin))
				419	break;
				420	a = AddWordToVocab(word);
				421	fscanf(fin, "%lld%c", &vocab[a].cn, &c);
				422	i++;
				423	}
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	424	fclose(fin);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	425	fin = fopen(train_file, "rb");
				426	if (fin == NULL) {
				427	printf("ERROR: training data file not found!\n");
				428	exit(1);
				429	}
				430	fseek(fin, 0, SEEK_END);
				431	file_size = ftell(fin);
				432	fclose(fin);
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	433	SortVocab();
				434	if (debug_mode > 0) {
Marc Kupietz	e23c540	2016-07-14 11:10:09 +0200	[diff] [blame]	435	printf("Vocab size: %'lld\n", vocab_size);
				436	printf("Words in vocab's train file: %'lld\n", train_words);
				437	printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	438	}
Marc Kupietz	e23c540	2016-07-14 11:10:09 +0200	[diff] [blame]	439	train_words = file_size / avgWordLength;
				440	if(debug_mode > 0)
				441	printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	442	}
				443
				444	void InitClassUnigramTable() {
				445	long long a, c;
				446	printf("loading class unigrams \n");
				447	FILE *fin = fopen(negative_classes_file, "rb");
				448	if (fin == NULL) {
				449	printf("ERROR: class file not found!\n");
				450	exit(1);
				451	}
				452	word_to_group = (int ) malloc(vocab_size sizeof(int));
				453	for (a = 0; a < vocab_size; a++)
				454	word_to_group[a] = -1;
				455	char class[MAX_STRING];
				456	char prev_class[MAX_STRING];
				457	prev_class[0] = 0;
				458	char word[MAX_STRING];
				459	class_number = -1;
				460	while (1) {
				461	if (feof(fin))
				462	break;
				463	ReadWord(class, fin);
				464	ReadWord(word, fin);
				465	int word_index = SearchVocab(word);
				466	if (word_index != -1) {
				467	if (strcmp(class, prev_class) != 0) {
				468	class_number++;
				469	strcpy(prev_class, class);
				470	}
				471	word_to_group[word_index] = class_number;
				472	}
				473	ReadWord(word, fin);
				474	}
				475	class_number++;
				476	fclose(fin);
				477
				478	group_to_table = (int ) malloc(table_size class_number * sizeof(int));
				479	long long train_words_pow = 0;
				480	real d1, power = 0.75;
				481
				482	for (c = 0; c < class_number; c++) {
				483	long long offset = c * table_size;
				484	train_words_pow = 0;
				485	for (a = 0; a < vocab_size; a++)
				486	if (word_to_group[a] == c)
				487	train_words_pow += pow(vocab[a].cn, power);
				488	int i = 0;
				489	while (word_to_group[i] != c && i < vocab_size)
				490	i++;
				491	d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
				492	for (a = 0; a < table_size; a++) {
				493	//printf("index %lld , word %d\n", a, i);
				494	group_to_table[offset + a] = i;
				495	if (a / (real) table_size > d1) {
				496	i++;
				497	while (word_to_group[i] != c && i < vocab_size)
				498	i++;
				499	d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
				500	}
				501	if (i >= vocab_size)
				502	while (word_to_group[i] != c && i >= 0)
				503	i--;
				504	}
				505	}
				506	}
				507
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	508	void SaveArgs(int argc, char **argv) {
				509	unsigned int i;
Marc Kupietz	4413674	2017-12-22 17:52:56 +0100	[diff] [blame]	510	char args_file[MAX_STRING];
				511	strcpy(args_file, output_file);
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	512	strcat(args_file, ".args");
				513	FILE *fargs = fopen(args_file, "w");
				514	if (fargs == NULL) {
				515	printf("Cannot save args to %s.\n", args_file);
				516	return;
				517	}
				518
Marc Kupietz	4413674	2017-12-22 17:52:56 +0100	[diff] [blame]	519	for(i=1; i<argc; i++)
				520	fprintf(fargs, "%s ", argv[i]);
				521
				522	fprintf(fargs, "\n");
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	523	fclose(fargs);
Marc Kupietz	4413674	2017-12-22 17:52:56 +0100	[diff] [blame]	524
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	525	return;
				526	}
				527
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	528	void SaveNet() {
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	529	if(type != 3 \|\| negative <= 0) {
				530	fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
				531	return;
				532	}
				533
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	534	FILE *fnet = fopen(save_net_file, "wb");
				535	if (fnet == NULL) {
				536	printf("Net parameter file not found\n");
				537	exit(1);
				538	}
Marc Kupietz	c697933	2016-03-16 15:29:07 +0100	[diff] [blame]	539	fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	540	fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	541	fclose(fnet);
				542	}
				543
				544	void InitNet() {
				545	long long a, b;
				546	unsigned long long next_random = 1;
Marc Kupietz	57c0df1	2016-03-18 12:48:00 +0100	[diff] [blame]	547	long long read;
				548
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	549	window_layer_size = layer1_size * window * 2;
				550	a = posix_memalign((void **) &syn0, 128,
				551	(long long) vocab_size * layer1_size * sizeof(real));
				552	if (syn0 == NULL) {
				553	printf("Memory allocation failed\n");
				554	exit(1);
				555	}
				556
				557	if (hs) {
				558	a = posix_memalign((void **) &syn1, 128,
				559	(long long) vocab_size * layer1_size * sizeof(real));
				560	if (syn1 == NULL) {
				561	printf("Memory allocation failed\n");
				562	exit(1);
				563	}
				564	a = posix_memalign((void **) &syn1_window, 128,
				565	(long long) vocab_size * window_layer_size * sizeof(real));
				566	if (syn1_window == NULL) {
				567	printf("Memory allocation failed\n");
				568	exit(1);
				569	}
				570	a = posix_memalign((void **) &syn_hidden_word, 128,
				571	(long long) vocab_size * window_hidden_size * sizeof(real));
				572	if (syn_hidden_word == NULL) {
				573	printf("Memory allocation failed\n");
				574	exit(1);
				575	}
				576
				577	for (a = 0; a < vocab_size; a++)
				578	for (b = 0; b < layer1_size; b++)
				579	syn1[a * layer1_size + b] = 0;
				580	for (a = 0; a < vocab_size; a++)
				581	for (b = 0; b < window_layer_size; b++)
				582	syn1_window[a * window_layer_size + b] = 0;
				583	for (a = 0; a < vocab_size; a++)
				584	for (b = 0; b < window_hidden_size; b++)
				585	syn_hidden_word[a * window_hidden_size + b] = 0;
				586	}
				587	if (negative > 0) {
Marc Kupietz	1006a27	2016-03-16 15:50:20 +0100	[diff] [blame]	588	if(type == 0) {
				589	a = posix_memalign((void **) &syn1neg, 128,
				590	(long long) vocab_size * layer1_size * sizeof(real));
				591	if (syn1neg == NULL) {
				592	printf("Memory allocation failed\n");
				593	exit(1);
				594	}
				595	for (a = 0; a < vocab_size; a++)
				596	for (b = 0; b < layer1_size; b++)
				597	syn1neg[a * layer1_size + b] = 0;
				598	} else if (type == 3) {
				599	a = posix_memalign((void **) &syn1neg_window, 128,
				600	(long long) vocab_size * window_layer_size * sizeof(real));
				601	if (syn1neg_window == NULL) {
				602	printf("Memory allocation failed\n");
				603	exit(1);
				604	}
				605	for (a = 0; a < vocab_size; a++)
				606	for (b = 0; b < window_layer_size; b++)
				607	syn1neg_window[a * window_layer_size + b] = 0;
				608	} else if (type == 4) {
				609	a = posix_memalign((void **) &syn_hidden_word_neg, 128,
				610	(long long) vocab_size * window_hidden_size * sizeof(real));
				611	if (syn_hidden_word_neg == NULL) {
				612	printf("Memory allocation failed\n");
				613	exit(1);
				614	}
				615	for (a = 0; a < vocab_size; a++)
				616	for (b = 0; b < window_hidden_size; b++)
				617	syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	618	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	619	}
				620	if (nce > 0) {
				621	a = posix_memalign((void **) &syn1nce, 128,
				622	(long long) vocab_size * layer1_size * sizeof(real));
				623	if (syn1nce == NULL) {
				624	printf("Memory allocation failed\n");
				625	exit(1);
				626	}
				627	a = posix_memalign((void **) &syn1nce_window, 128,
				628	(long long) vocab_size * window_layer_size * sizeof(real));
				629	if (syn1nce_window == NULL) {
				630	printf("Memory allocation failed\n");
				631	exit(1);
				632	}
				633	a = posix_memalign((void **) &syn_hidden_word_nce, 128,
				634	(long long) vocab_size * window_hidden_size * sizeof(real));
				635	if (syn_hidden_word_nce == NULL) {
				636	printf("Memory allocation failed\n");
				637	exit(1);
				638	}
				639
				640	for (a = 0; a < vocab_size; a++)
				641	for (b = 0; b < layer1_size; b++)
				642	syn1nce[a * layer1_size + b] = 0;
				643	for (a = 0; a < vocab_size; a++)
				644	for (b = 0; b < window_layer_size; b++)
				645	syn1nce_window[a * window_layer_size + b] = 0;
				646	for (a = 0; a < vocab_size; a++)
				647	for (b = 0; b < window_hidden_size; b++)
				648	syn_hidden_word_nce[a * window_hidden_size + b] = 0;
				649	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	650
Marc Kupietz	1006a27	2016-03-16 15:50:20 +0100	[diff] [blame]	651	if(type == 4) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	652	a = posix_memalign((void **) &syn_window_hidden, 128,
				653	window_hidden_size * window_layer_size * sizeof(real));
				654	if (syn_window_hidden == NULL) {
				655	printf("Memory allocation failed\n");
				656	exit(1);
				657	}
				658	for (a = 0; a < window_hidden_size * window_layer_size; a++) {
				659	next_random = next_random * (unsigned long long) 25214903917 + 11;
				660	syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
				661	- 0.5) / (window_hidden_size * window_layer_size);
				662	}
				663	}
Marc Kupietz	1006a27	2016-03-16 15:50:20 +0100	[diff] [blame]	664
				665	if (read_net_file[0] == 0) {
				666	for (a = 0; a < vocab_size; a++)
				667	for (b = 0; b < layer1_size; b++) {
				668	next_random = next_random * (unsigned long long) 25214903917
				669	+ 11;
				670	syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
				671	/ (real) 65536) - 0.5) / layer1_size;
				672	}
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	673	} else if(type == 3 && negative > 0) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	674	FILE *fnet = fopen(read_net_file, "rb");
				675	if (fnet == NULL) {
				676	printf("Net parameter file not found\n");
				677	exit(1);
				678	}
Marc Kupietz	57c0df1	2016-03-18 12:48:00 +0100	[diff] [blame]	679	printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
				680	read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
				681	if(read != vocab_size * layer1_size) {
				682	fprintf(stderr, "read-net failed %lld\n", read);
				683	exit(-1);
				684	}
				685	read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
				686	if(read != (long long) vocab_size * window_layer_size) {
				687	fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
				688	(long long) sizeof(real) * vocab_size * window_layer_size);
				689	exit(-1);
				690	}
				691	fgetc(fnet);
				692	if(!feof(fnet)) {
				693	fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
				694	exit(-1);
				695	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	696	fclose(fnet);
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	697	} else {
				698	fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
				699	exit(-1);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	700	}
				701
				702	CreateBinaryTree();
				703	}
				704
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	705	char currentDateTime(char buf, real offset) {
				706	time_t t;
				707	time(&t);
				708	t += (long) offset;
				709	struct tm tstruct;
				710	tstruct = *localtime(&t);
				711	strftime(buf, 80, "%c", &tstruct);
				712	return buf;
				713	}
				714
				715	void MonitorThread(void id) {
				716	char *timebuf = malloc(80);;
				717	int i, n=num_threads;
				718	long long sum;
				719	sleep(1);
				720	while(n > 0) {
				721	sleep(1);
				722	sum = n = 0;
				723	for(i=0; i < num_threads; i++) {
				724	if(threadPos[i] >= 0) {
				725	sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
				726	n++;
				727	} else {
				728	sum += iter * file_size / num_threads;
				729	}
				730	}
				731	if(n == 0)
				732	break;
				733	real finished_portion = (real) sum / (float) (file_size * iter);
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame^]	734	long long now = time(NULL);
				735	long long elapsed = (now - start);
				736	long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	737
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame^]	738	printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	739	alpha,
				740	finished_portion * 100,
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame^]	741	(float) sum / elapsed / 1000,
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	742	elapsed,
				743	ttg,
				744	currentDateTime(timebuf, ttg)
				745	);
				746	fflush(stdout);
				747	}
				748	pthread_exit(NULL);
				749	}
				750
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	751	void TrainModelThread(void id) {
				752	long long a, b, d, cw, word, last_word, sentence_length = 0,
				753	sentence_position = 0;
				754	long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
				755	long long l1, l2, c, target, label, local_iter = iter;
				756	unsigned long long next_random = (long long) id;
				757	real f, g;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	758	int input_len_1 = layer1_size;
				759	int window_offset = -1;
				760	if (type == 2 \|\| type == 4) {
				761	input_len_1 = window_layer_size;
				762	}
				763	real neu1 = (real ) calloc(input_len_1, sizeof(real));
				764	real neu1e = (real ) calloc(input_len_1, sizeof(real));
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	765	threadIters[(long) id] = iter;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	766
				767	int input_len_2 = 0;
				768	if (type == 4) {
				769	input_len_2 = window_hidden_size;
				770	}
				771	real neu2 = (real ) calloc(input_len_2, sizeof(real));
				772	real neu2e = (real ) calloc(input_len_2, sizeof(real));
				773
				774	FILE *fi = fopen(train_file, "rb");
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	775	long long start_pos = file_size / (long long) num_threads * (long long) id;
				776	long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
				777	long long current_pos = start_pos;
				778	long long last_pos = start_pos;;
				779	fseek(fi, start_pos, SEEK_SET);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	780	while (1) {
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	781	if ((current_pos - last_pos > 100000)) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	782	word_count_actual += word_count - last_word_count;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	783	last_pos = current_pos;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	784	last_word_count = word_count;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	785	alpha = starting_alpha
				786	* (1 - word_count_actual / (real) (iter * train_words + 1));
				787	if (alpha < starting_alpha * 0.0001)
				788	alpha = starting_alpha * 0.0001;
				789	}
				790	if (sentence_length == 0) {
				791	while (1) {
				792	word = ReadWordIndex(fi);
				793	if (feof(fi))
				794	break;
				795	if (word == -1)
				796	continue;
				797	word_count++;
				798	if (word == 0)
				799	break;
				800	// The subsampling randomly discards frequent words while keeping the ranking same
				801	if (sample > 0) {
				802	real ran = (sqrt(vocab[word].cn / (sample * train_words))
				803	+ 1) * (sample * train_words) / vocab[word].cn;
				804	next_random = next_random * (unsigned long long) 25214903917
				805	+ 11;
Marc Kupietz	ab4e5af	2016-03-22 14:24:03 +0100	[diff] [blame]	806	if (ran < (next_random & 0xFFFF) / (real) 65536) {
				807	if(type == 3) // in structured skipgrams
				808	word = -2; // keep the window position correct
				809	else
				810	continue;
				811	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	812	}
				813	sen[sentence_length] = word;
				814	sentence_length++;
				815	if (sentence_length >= MAX_SENTENCE_LENGTH)
				816	break;
				817	}
				818	sentence_position = 0;
				819	}
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	820	current_pos = threadPos[(long) id] = ftell(fi);
				821	if (feof(fi) \|\| current_pos >= end_pos ) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	822	word_count_actual += word_count - last_word_count;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	823	threadIters[(long) id]--;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	824	local_iter--;
				825	if (local_iter == 0)
				826	break;
Marc Kupietz	e423f73	2017-12-22 17:57:03 +0100	[diff] [blame]	827	if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
				828	printf("Magic stop file %s found. Stopping traing ...\n", magic_stop_file);
				829	break;
				830	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	831	word_count = 0;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	832	current_pos = last_pos = start_pos;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	833	last_word_count = 0;
				834	sentence_length = 0;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	835	fseek(fi, start_pos, SEEK_SET);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	836	continue;
				837	}
				838	word = sen[sentence_position];
Peter Fankhauser	66035a4	2016-04-20 13:29:33 +0200	[diff] [blame]	839	while (word == -2 && sentence_position<sentence_length)
				840	word = sen[++sentence_position];
				841	if (sentence_position>=sentence_length) {
				842	sentence_length=0;
				843	continue;
				844	}
				845	if (word < 0)
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	846	continue;
				847	for (c = 0; c < input_len_1; c++)
				848	neu1[c] = 0;
				849	for (c = 0; c < input_len_1; c++)
				850	neu1e[c] = 0;
				851	for (c = 0; c < input_len_2; c++)
				852	neu2[c] = 0;
				853	for (c = 0; c < input_len_2; c++)
				854	neu2e[c] = 0;
				855	next_random = next_random * (unsigned long long) 25214903917 + 11;
				856	b = next_random % window;
				857	if (type == 0) { //train the cbow architecture
				858	// in -> hidden
				859	cw = 0;
				860	for (a = b; a < window * 2 + 1 - b; a++)
				861	if (a != window) {
				862	c = sentence_position - window + a;
				863	if (c < 0)
				864	continue;
				865	if (c >= sentence_length)
				866	continue;
				867	last_word = sen[c];
				868	if (last_word == -1)
				869	continue;
				870	for (c = 0; c < layer1_size; c++)
				871	neu1[c] += syn0[c + last_word * layer1_size];
				872	cw++;
				873	}
				874	if (cw) {
				875	for (c = 0; c < layer1_size; c++)
				876	neu1[c] /= cw;
				877	if (hs)
				878	for (d = 0; d < vocab[word].codelen; d++) {
				879	f = 0;
				880	l2 = vocab[word].point[d] * layer1_size;
				881	// Propagate hidden -> output
				882	for (c = 0; c < layer1_size; c++)
				883	f += neu1[c] * syn1[c + l2];
				884	if (f <= -MAX_EXP)
				885	continue;
				886	else if (f >= MAX_EXP)
				887	continue;
				888	else
				889	f = expTable[(int) ((f + MAX_EXP)
				890	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				891	// 'g' is the gradient multiplied by the learning rate
				892	g = (1 - vocab[word].code[d] - f) * alpha;
				893	// Propagate errors output -> hidden
				894	for (c = 0; c < layer1_size; c++)
				895	neu1e[c] += g * syn1[c + l2];
				896	// Learn weights hidden -> output
				897	for (c = 0; c < layer1_size; c++)
				898	syn1[c + l2] += g * neu1[c];
				899	if (cap == 1)
				900	for (c = 0; c < layer1_size; c++)
				901	capParam(syn1, c + l2);
				902	}
				903	// NEGATIVE SAMPLING
				904	if (negative > 0)
				905	for (d = 0; d < negative + 1; d++) {
				906	if (d == 0) {
				907	target = word;
				908	label = 1;
				909	} else {
				910	next_random = next_random
				911	* (unsigned long long) 25214903917 + 11;
				912	if (word_to_group != NULL
				913	&& word_to_group[word] != -1) {
				914	target = word;
				915	while (target == word) {
				916	target = group_to_table[word_to_group[word]
				917	* table_size
				918	+ (next_random >> 16) % table_size];
				919	next_random = next_random
				920	* (unsigned long long) 25214903917
				921	+ 11;
				922	}
				923	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				924	} else {
				925	target =
				926	table[(next_random >> 16) % table_size];
				927	}
				928	if (target == 0)
				929	target = next_random % (vocab_size - 1) + 1;
				930	if (target == word)
				931	continue;
				932	label = 0;
				933	}
				934	l2 = target * layer1_size;
				935	f = 0;
				936	for (c = 0; c < layer1_size; c++)
				937	f += neu1[c] * syn1neg[c + l2];
				938	if (f > MAX_EXP)
				939	g = (label - 1) * alpha;
				940	else if (f < -MAX_EXP)
				941	g = (label - 0) * alpha;
				942	else
				943	g = (label
				944	- expTable[(int) ((f + MAX_EXP)
				945	* (EXP_TABLE_SIZE / MAX_EXP / 2))])
				946	* alpha;
				947	for (c = 0; c < layer1_size; c++)
				948	neu1e[c] += g * syn1neg[c + l2];
				949	for (c = 0; c < layer1_size; c++)
				950	syn1neg[c + l2] += g * neu1[c];
				951	if (cap == 1)
				952	for (c = 0; c < layer1_size; c++)
				953	capParam(syn1neg, c + l2);
				954	}
				955	// Noise Contrastive Estimation
				956	if (nce > 0)
				957	for (d = 0; d < nce + 1; d++) {
				958	if (d == 0) {
				959	target = word;
				960	label = 1;
				961	} else {
				962	next_random = next_random
				963	* (unsigned long long) 25214903917 + 11;
				964	if (word_to_group != NULL
				965	&& word_to_group[word] != -1) {
				966	target = word;
				967	while (target == word) {
				968	target = group_to_table[word_to_group[word]
				969	* table_size
				970	+ (next_random >> 16) % table_size];
				971	next_random = next_random
				972	* (unsigned long long) 25214903917
				973	+ 11;
				974	}
				975	} else {
				976	target =
				977	table[(next_random >> 16) % table_size];
				978	}
				979	if (target == 0)
				980	target = next_random % (vocab_size - 1) + 1;
				981	if (target == word)
				982	continue;
				983	label = 0;
				984	}
				985	l2 = target * layer1_size;
				986	f = 0;
				987
				988	for (c = 0; c < layer1_size; c++)
				989	f += neu1[c] * syn1nce[c + l2];
				990	if (f > MAX_EXP)
				991	g = (label - 1) * alpha;
				992	else if (f < -MAX_EXP)
				993	g = (label - 0) * alpha;
				994	else {
				995	f = exp(f);
				996	g =
				997	(label
				998	- f
				999	/ (noise_distribution[target]
				1000	* nce + f)) * alpha;
				1001	}
				1002	for (c = 0; c < layer1_size; c++)
				1003	neu1e[c] += g * syn1nce[c + l2];
				1004	for (c = 0; c < layer1_size; c++)
				1005	syn1nce[c + l2] += g * neu1[c];
				1006	if (cap == 1)
				1007	for (c = 0; c < layer1_size; c++)
				1008	capParam(syn1nce, c + l2);
				1009	}
				1010	// hidden -> in
				1011	for (a = b; a < window * 2 + 1 - b; a++)
				1012	if (a != window) {
				1013	c = sentence_position - window + a;
				1014	if (c < 0)
				1015	continue;
				1016	if (c >= sentence_length)
				1017	continue;
				1018	last_word = sen[c];
				1019	if (last_word == -1)
				1020	continue;
				1021	for (c = 0; c < layer1_size; c++)
				1022	syn0[c + last_word * layer1_size] += neu1e[c];
				1023	}
				1024	}
				1025	} else if (type == 1) { //train skip-gram
				1026	for (a = b; a < window * 2 + 1 - b; a++)
				1027	if (a != window) {
				1028	c = sentence_position - window + a;
				1029	if (c < 0)
				1030	continue;
				1031	if (c >= sentence_length)
				1032	continue;
				1033	last_word = sen[c];
				1034	if (last_word == -1)
				1035	continue;
				1036	l1 = last_word * layer1_size;
				1037	for (c = 0; c < layer1_size; c++)
				1038	neu1e[c] = 0;
				1039	// HIERARCHICAL SOFTMAX
				1040	if (hs)
				1041	for (d = 0; d < vocab[word].codelen; d++) {
				1042	f = 0;
				1043	l2 = vocab[word].point[d] * layer1_size;
				1044	// Propagate hidden -> output
				1045	for (c = 0; c < layer1_size; c++)
				1046	f += syn0[c + l1] * syn1[c + l2];
				1047	if (f <= -MAX_EXP)
				1048	continue;
				1049	else if (f >= MAX_EXP)
				1050	continue;
				1051	else
				1052	f = expTable[(int) ((f + MAX_EXP)
				1053	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1054	// 'g' is the gradient multiplied by the learning rate
				1055	g = (1 - vocab[word].code[d] - f) * alpha;
				1056	// Propagate errors output -> hidden
				1057	for (c = 0; c < layer1_size; c++)
				1058	neu1e[c] += g * syn1[c + l2];
				1059	// Learn weights hidden -> output
				1060	for (c = 0; c < layer1_size; c++)
				1061	syn1[c + l2] += g * syn0[c + l1];
				1062	if (cap == 1)
				1063	for (c = 0; c < layer1_size; c++)
				1064	capParam(syn1, c + l2);
				1065	}
				1066	// NEGATIVE SAMPLING
				1067	if (negative > 0)
				1068	for (d = 0; d < negative + 1; d++) {
				1069	if (d == 0) {
				1070	target = word;
				1071	label = 1;
				1072	} else {
				1073	next_random = next_random
				1074	* (unsigned long long) 25214903917 + 11;
				1075	if (word_to_group != NULL
				1076	&& word_to_group[word] != -1) {
				1077	target = word;
				1078	while (target == word) {
				1079	target =
				1080	group_to_table[word_to_group[word]
				1081	* table_size
				1082	+ (next_random >> 16)
				1083	% table_size];
				1084	next_random =
				1085	next_random
				1086	* (unsigned long long) 25214903917
				1087	+ 11;
				1088	}
				1089	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1090	} else {
				1091	target = table[(next_random >> 16)
				1092	% table_size];
				1093	}
				1094	if (target == 0)
				1095	target = next_random % (vocab_size - 1) + 1;
				1096	if (target == word)
				1097	continue;
				1098	label = 0;
				1099	}
				1100	l2 = target * layer1_size;
				1101	f = 0;
				1102	for (c = 0; c < layer1_size; c++)
				1103	f += syn0[c + l1] * syn1neg[c + l2];
				1104	if (f > MAX_EXP)
				1105	g = (label - 1) * alpha;
				1106	else if (f < -MAX_EXP)
				1107	g = (label - 0) * alpha;
				1108	else
				1109	g =
				1110	(label
				1111	- expTable[(int) ((f + MAX_EXP)
				1112	* (EXP_TABLE_SIZE
				1113	/ MAX_EXP / 2))])
				1114	* alpha;
				1115	for (c = 0; c < layer1_size; c++)
				1116	neu1e[c] += g * syn1neg[c + l2];
				1117	for (c = 0; c < layer1_size; c++)
				1118	syn1neg[c + l2] += g * syn0[c + l1];
				1119	if (cap == 1)
				1120	for (c = 0; c < layer1_size; c++)
				1121	capParam(syn1neg, c + l2);
				1122	}
				1123	//Noise Contrastive Estimation
				1124	if (nce > 0)
				1125	for (d = 0; d < nce + 1; d++) {
				1126	if (d == 0) {
				1127	target = word;
				1128	label = 1;
				1129	} else {
				1130	next_random = next_random
				1131	* (unsigned long long) 25214903917 + 11;
				1132	if (word_to_group != NULL
				1133	&& word_to_group[word] != -1) {
				1134	target = word;
				1135	while (target == word) {
				1136	target =
				1137	group_to_table[word_to_group[word]
				1138	* table_size
				1139	+ (next_random >> 16)
				1140	% table_size];
				1141	next_random =
				1142	next_random
				1143	* (unsigned long long) 25214903917
				1144	+ 11;
				1145	}
				1146	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1147	} else {
				1148	target = table[(next_random >> 16)
				1149	% table_size];
				1150	}
				1151	if (target == 0)
				1152	target = next_random % (vocab_size - 1) + 1;
				1153	if (target == word)
				1154	continue;
				1155	label = 0;
				1156	}
				1157	l2 = target * layer1_size;
				1158	f = 0;
				1159	for (c = 0; c < layer1_size; c++)
				1160	f += syn0[c + l1] * syn1nce[c + l2];
				1161	if (f > MAX_EXP)
				1162	g = (label - 1) * alpha;
				1163	else if (f < -MAX_EXP)
				1164	g = (label - 0) * alpha;
				1165	else {
				1166	f = exp(f);
				1167	g = (label
				1168	- f
				1169	/ (noise_distribution[target]
				1170	* nce + f)) * alpha;
				1171	}
				1172	for (c = 0; c < layer1_size; c++)
				1173	neu1e[c] += g * syn1nce[c + l2];
				1174	for (c = 0; c < layer1_size; c++)
				1175	syn1nce[c + l2] += g * syn0[c + l1];
				1176	if (cap == 1)
				1177	for (c = 0; c < layer1_size; c++)
				1178	capParam(syn1nce, c + l2);
				1179	}
				1180	// Learn weights input -> hidden
				1181	for (c = 0; c < layer1_size; c++)
				1182	syn0[c + l1] += neu1e[c];
				1183	}
				1184	} else if (type == 2) { //train the cwindow architecture
				1185	// in -> hidden
				1186	cw = 0;
				1187	for (a = 0; a < window * 2 + 1; a++)
				1188	if (a != window) {
				1189	c = sentence_position - window + a;
				1190	if (c < 0)
				1191	continue;
				1192	if (c >= sentence_length)
				1193	continue;
				1194	last_word = sen[c];
				1195	if (last_word == -1)
				1196	continue;
				1197	window_offset = a * layer1_size;
				1198	if (a > window)
				1199	window_offset -= layer1_size;
				1200	for (c = 0; c < layer1_size; c++)
				1201	neu1[c + window_offset] += syn0[c
				1202	+ last_word * layer1_size];
				1203	cw++;
				1204	}
				1205	if (cw) {
				1206	if (hs)
				1207	for (d = 0; d < vocab[word].codelen; d++) {
				1208	f = 0;
				1209	l2 = vocab[word].point[d] * window_layer_size;
				1210	// Propagate hidden -> output
				1211	for (c = 0; c < window_layer_size; c++)
				1212	f += neu1[c] * syn1_window[c + l2];
				1213	if (f <= -MAX_EXP)
				1214	continue;
				1215	else if (f >= MAX_EXP)
				1216	continue;
				1217	else
				1218	f = expTable[(int) ((f + MAX_EXP)
				1219	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1220	// 'g' is the gradient multiplied by the learning rate
				1221	g = (1 - vocab[word].code[d] - f) * alpha;
				1222	// Propagate errors output -> hidden
				1223	for (c = 0; c < window_layer_size; c++)
				1224	neu1e[c] += g * syn1_window[c + l2];
				1225	// Learn weights hidden -> output
				1226	for (c = 0; c < window_layer_size; c++)
				1227	syn1_window[c + l2] += g * neu1[c];
				1228	if (cap == 1)
				1229	for (c = 0; c < window_layer_size; c++)
				1230	capParam(syn1_window, c + l2);
				1231	}
				1232	// NEGATIVE SAMPLING
				1233	if (negative > 0)
				1234	for (d = 0; d < negative + 1; d++) {
				1235	if (d == 0) {
				1236	target = word;
				1237	label = 1;
				1238	} else {
				1239	next_random = next_random
				1240	* (unsigned long long) 25214903917 + 11;
				1241	if (word_to_group != NULL
				1242	&& word_to_group[word] != -1) {
				1243	target = word;
				1244	while (target == word) {
				1245	target = group_to_table[word_to_group[word]
				1246	* table_size
				1247	+ (next_random >> 16) % table_size];
				1248	next_random = next_random
				1249	* (unsigned long long) 25214903917
				1250	+ 11;
				1251	}
				1252	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1253	} else {
				1254	target =
				1255	table[(next_random >> 16) % table_size];
				1256	}
				1257	if (target == 0)
				1258	target = next_random % (vocab_size - 1) + 1;
				1259	if (target == word)
				1260	continue;
				1261	label = 0;
				1262	}
				1263	l2 = target * window_layer_size;
				1264	f = 0;
				1265	for (c = 0; c < window_layer_size; c++)
				1266	f += neu1[c] * syn1neg_window[c + l2];
				1267	if (f > MAX_EXP)
				1268	g = (label - 1) * alpha;
				1269	else if (f < -MAX_EXP)
				1270	g = (label - 0) * alpha;
				1271	else
				1272	g = (label
				1273	- expTable[(int) ((f + MAX_EXP)
				1274	* (EXP_TABLE_SIZE / MAX_EXP / 2))])
				1275	* alpha;
				1276	for (c = 0; c < window_layer_size; c++)
				1277	neu1e[c] += g * syn1neg_window[c + l2];
				1278	for (c = 0; c < window_layer_size; c++)
				1279	syn1neg_window[c + l2] += g * neu1[c];
				1280	if (cap == 1)
				1281	for (c = 0; c < window_layer_size; c++)
				1282	capParam(syn1neg_window, c + l2);
				1283	}
				1284	// Noise Contrastive Estimation
				1285	if (nce > 0)
				1286	for (d = 0; d < nce + 1; d++) {
				1287	if (d == 0) {
				1288	target = word;
				1289	label = 1;
				1290	} else {
				1291	next_random = next_random
				1292	* (unsigned long long) 25214903917 + 11;
				1293	if (word_to_group != NULL
				1294	&& word_to_group[word] != -1) {
				1295	target = word;
				1296	while (target == word) {
				1297	target = group_to_table[word_to_group[word]
				1298	* table_size
				1299	+ (next_random >> 16) % table_size];
				1300	next_random = next_random
				1301	* (unsigned long long) 25214903917
				1302	+ 11;
				1303	}
				1304	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1305	} else {
				1306	target =
				1307	table[(next_random >> 16) % table_size];
				1308	}
				1309	if (target == 0)
				1310	target = next_random % (vocab_size - 1) + 1;
				1311	if (target == word)
				1312	continue;
				1313	label = 0;
				1314	}
				1315	l2 = target * window_layer_size;
				1316	f = 0;
				1317	for (c = 0; c < window_layer_size; c++)
				1318	f += neu1[c] * syn1nce_window[c + l2];
				1319	if (f > MAX_EXP)
				1320	g = (label - 1) * alpha;
				1321	else if (f < -MAX_EXP)
				1322	g = (label - 0) * alpha;
				1323	else {
				1324	f = exp(f);
				1325	g =
				1326	(label
				1327	- f
				1328	/ (noise_distribution[target]
				1329	* nce + f)) * alpha;
				1330	}
				1331	for (c = 0; c < window_layer_size; c++)
				1332	neu1e[c] += g * syn1nce_window[c + l2];
				1333	for (c = 0; c < window_layer_size; c++)
				1334	syn1nce_window[c + l2] += g * neu1[c];
				1335	if (cap == 1)
				1336	for (c = 0; c < window_layer_size; c++)
				1337	capParam(syn1nce_window, c + l2);
				1338	}
				1339	// hidden -> in
				1340	for (a = 0; a < window * 2 + 1; a++)
				1341	if (a != window) {
				1342	c = sentence_position - window + a;
				1343	if (c < 0)
				1344	continue;
				1345	if (c >= sentence_length)
				1346	continue;
				1347	last_word = sen[c];
				1348	if (last_word == -1)
				1349	continue;
				1350	window_offset = a * layer1_size;
				1351	if (a > window)
				1352	window_offset -= layer1_size;
				1353	for (c = 0; c < layer1_size; c++)
				1354	syn0[c + last_word * layer1_size] += neu1e[c
				1355	+ window_offset];
				1356	}
				1357	}
				1358	} else if (type == 3) { //train structured skip-gram
				1359	for (a = 0; a < window * 2 + 1; a++)
				1360	if (a != window) {
				1361	c = sentence_position - window + a;
				1362	if (c < 0)
				1363	continue;
				1364	if (c >= sentence_length)
				1365	continue;
				1366	last_word = sen[c];
Peter Fankhauser	66035a4	2016-04-20 13:29:33 +0200	[diff] [blame]	1367	if (last_word < 0)
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1368	continue;
				1369	l1 = last_word * layer1_size;
				1370	window_offset = a * layer1_size;
				1371	if (a > window)
				1372	window_offset -= layer1_size;
				1373	for (c = 0; c < layer1_size; c++)
				1374	neu1e[c] = 0;
				1375	// HIERARCHICAL SOFTMAX
				1376	if (hs)
				1377	for (d = 0; d < vocab[word].codelen; d++) {
				1378	f = 0;
				1379	l2 = vocab[word].point[d] * window_layer_size;
				1380	// Propagate hidden -> output
				1381	for (c = 0; c < layer1_size; c++)
				1382	f += syn0[c + l1]
				1383	* syn1_window[c + l2 + window_offset];
				1384	if (f <= -MAX_EXP)
				1385	continue;
				1386	else if (f >= MAX_EXP)
				1387	continue;
				1388	else
				1389	f = expTable[(int) ((f + MAX_EXP)
				1390	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1391	// 'g' is the gradient multiplied by the learning rate
				1392	g = (1 - vocab[word].code[d] - f) * alpha;
				1393	// Propagate errors output -> hidden
				1394	for (c = 0; c < layer1_size; c++)
				1395	neu1e[c] += g
				1396	* syn1_window[c + l2 + window_offset];
				1397	// Learn weights hidden -> output
				1398	for (c = 0; c < layer1_size; c++)
				1399	syn1[c + l2 + window_offset] += g
				1400	* syn0[c + l1];
				1401	if (cap == 1)
				1402	for (c = 0; c < layer1_size; c++)
				1403	capParam(syn1, c + l2 + window_offset);
				1404	}
				1405	// NEGATIVE SAMPLING
				1406	if (negative > 0)
				1407	for (d = 0; d < negative + 1; d++) {
				1408	if (d == 0) {
				1409	target = word;
				1410	label = 1;
				1411	} else {
				1412	next_random = next_random
				1413	* (unsigned long long) 25214903917 + 11;
				1414	if (word_to_group != NULL
				1415	&& word_to_group[word] != -1) {
				1416	target = word;
				1417	while (target == word) {
				1418	target =
				1419	group_to_table[word_to_group[word]
				1420	* table_size
				1421	+ (next_random >> 16)
				1422	% table_size];
				1423	next_random =
				1424	next_random
				1425	* (unsigned long long) 25214903917
				1426	+ 11;
				1427	}
				1428	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1429	} else {
				1430	target = table[(next_random >> 16)
				1431	% table_size];
				1432	}
				1433	if (target == 0)
				1434	target = next_random % (vocab_size - 1) + 1;
				1435	if (target == word)
				1436	continue;
				1437	label = 0;
				1438	}
				1439	l2 = target * window_layer_size;
				1440	f = 0;
				1441	for (c = 0; c < layer1_size; c++)
				1442	f +=
				1443	syn0[c + l1]
				1444	* syn1neg_window[c + l2
				1445	+ window_offset];
				1446	if (f > MAX_EXP)
				1447	g = (label - 1) * alpha;
				1448	else if (f < -MAX_EXP)
				1449	g = (label - 0) * alpha;
				1450	else
				1451	g =
				1452	(label
				1453	- expTable[(int) ((f + MAX_EXP)
				1454	* (EXP_TABLE_SIZE
				1455	/ MAX_EXP / 2))])
				1456	* alpha;
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1457	if(debug_mode > 2 && ((long long) id) == 0) {
				1458	printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
				1459	printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
				1460	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1461	for (c = 0; c < layer1_size; c++)
				1462	neu1e[c] +=
				1463	g
				1464	* syn1neg_window[c + l2
				1465	+ window_offset];
				1466	for (c = 0; c < layer1_size; c++)
				1467	syn1neg_window[c + l2 + window_offset] += g
				1468	* syn0[c + l1];
				1469	if (cap == 1)
				1470	for (c = 0; c < layer1_size; c++)
				1471	capParam(syn1neg_window,
				1472	c + l2 + window_offset);
				1473	}
				1474	// Noise Contrastive Estimation
				1475	if (nce > 0)
				1476	for (d = 0; d < nce + 1; d++) {
				1477	if (d == 0) {
				1478	target = word;
				1479	label = 1;
				1480	} else {
				1481	next_random = next_random
				1482	* (unsigned long long) 25214903917 + 11;
				1483	if (word_to_group != NULL
				1484	&& word_to_group[word] != -1) {
				1485	target = word;
				1486	while (target == word) {
				1487	target =
				1488	group_to_table[word_to_group[word]
				1489	* table_size
				1490	+ (next_random >> 16)
				1491	% table_size];
				1492	next_random =
				1493	next_random
				1494	* (unsigned long long) 25214903917
				1495	+ 11;
				1496	}
				1497	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1498	} else {
				1499	target = table[(next_random >> 16)
				1500	% table_size];
				1501	}
				1502	if (target == 0)
				1503	target = next_random % (vocab_size - 1) + 1;
				1504	if (target == word)
				1505	continue;
				1506	label = 0;
				1507	}
				1508	l2 = target * window_layer_size;
				1509	f = 0;
				1510	for (c = 0; c < layer1_size; c++)
				1511	f +=
				1512	syn0[c + l1]
				1513	* syn1nce_window[c + l2
				1514	+ window_offset];
				1515	if (f > MAX_EXP)
				1516	g = (label - 1) * alpha;
				1517	else if (f < -MAX_EXP)
				1518	g = (label - 0) * alpha;
				1519	else {
				1520	f = exp(f);
				1521	g = (label
				1522	- f
				1523	/ (noise_distribution[target]
				1524	* nce + f)) * alpha;
				1525	}
				1526	for (c = 0; c < layer1_size; c++)
				1527	neu1e[c] +=
				1528	g
				1529	* syn1nce_window[c + l2
				1530	+ window_offset];
				1531	for (c = 0; c < layer1_size; c++)
				1532	syn1nce_window[c + l2 + window_offset] += g
				1533	* syn0[c + l1];
				1534	if (cap == 1)
				1535	for (c = 0; c < layer1_size; c++)
				1536	capParam(syn1nce_window,
				1537	c + l2 + window_offset);
				1538	}
				1539	// Learn weights input -> hidden
				1540	for (c = 0; c < layer1_size; c++) {
				1541	syn0[c + l1] += neu1e[c];
				1542	if (syn0[c + l1] > 50)
				1543	syn0[c + l1] = 50;
				1544	if (syn0[c + l1] < -50)
				1545	syn0[c + l1] = -50;
				1546	}
				1547	}
				1548	} else if (type == 4) { //training senna
				1549	// in -> hidden
				1550	cw = 0;
				1551	for (a = 0; a < window * 2 + 1; a++)
				1552	if (a != window) {
				1553	c = sentence_position - window + a;
				1554	if (c < 0)
				1555	continue;
				1556	if (c >= sentence_length)
				1557	continue;
				1558	last_word = sen[c];
				1559	if (last_word == -1)
				1560	continue;
				1561	window_offset = a * layer1_size;
				1562	if (a > window)
				1563	window_offset -= layer1_size;
				1564	for (c = 0; c < layer1_size; c++)
				1565	neu1[c + window_offset] += syn0[c
				1566	+ last_word * layer1_size];
				1567	cw++;
				1568	}
				1569	if (cw) {
				1570	for (a = 0; a < window_hidden_size; a++) {
				1571	c = a * window_layer_size;
				1572	for (b = 0; b < window_layer_size; b++) {
				1573	neu2[a] += syn_window_hidden[c + b] * neu1[b];
				1574	}
				1575	}
				1576	if (hs)
				1577	for (d = 0; d < vocab[word].codelen; d++) {
				1578	f = 0;
				1579	l2 = vocab[word].point[d] * window_hidden_size;
				1580	// Propagate hidden -> output
				1581	for (c = 0; c < window_hidden_size; c++)
				1582	f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
				1583	if (f <= -MAX_EXP)
				1584	continue;
				1585	else if (f >= MAX_EXP)
				1586	continue;
				1587	else
				1588	f = expTable[(int) ((f + MAX_EXP)
				1589	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1590	// 'g' is the gradient multiplied by the learning rate
				1591	g = (1 - vocab[word].code[d] - f) * alpha;
				1592	// Propagate errors output -> hidden
				1593	for (c = 0; c < window_hidden_size; c++)
				1594	neu2e[c] += dHardTanh(neu2[c], g) * g
				1595	* syn_hidden_word[c + l2];
				1596	// Learn weights hidden -> output
				1597	for (c = 0; c < window_hidden_size; c++)
				1598	syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
				1599	* neu2[c];
				1600	}
				1601	// NEGATIVE SAMPLING
				1602	if (negative > 0)
				1603	for (d = 0; d < negative + 1; d++) {
				1604	if (d == 0) {
				1605	target = word;
				1606	label = 1;
				1607	} else {
				1608	next_random = next_random
				1609	* (unsigned long long) 25214903917 + 11;
				1610	if (word_to_group != NULL
				1611	&& word_to_group[word] != -1) {
				1612	target = word;
				1613	while (target == word) {
				1614	target = group_to_table[word_to_group[word]
				1615	* table_size
				1616	+ (next_random >> 16) % table_size];
				1617	next_random = next_random
				1618	* (unsigned long long) 25214903917
				1619	+ 11;
				1620	}
				1621	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1622	} else {
				1623	target =
				1624	table[(next_random >> 16) % table_size];
				1625	}
				1626	if (target == 0)
				1627	target = next_random % (vocab_size - 1) + 1;
				1628	if (target == word)
				1629	continue;
				1630	label = 0;
				1631	}
				1632	l2 = target * window_hidden_size;
				1633	f = 0;
				1634	for (c = 0; c < window_hidden_size; c++)
				1635	f += hardTanh(neu2[c])
				1636	* syn_hidden_word_neg[c + l2];
				1637	if (f > MAX_EXP)
				1638	g = (label - 1) * alpha / negative;
				1639	else if (f < -MAX_EXP)
				1640	g = (label - 0) * alpha / negative;
				1641	else
				1642	g = (label
				1643	- expTable[(int) ((f + MAX_EXP)
				1644	* (EXP_TABLE_SIZE / MAX_EXP / 2))])
				1645	* alpha / negative;
				1646	for (c = 0; c < window_hidden_size; c++)
				1647	neu2e[c] += dHardTanh(neu2[c], g) * g
				1648	* syn_hidden_word_neg[c + l2];
				1649	for (c = 0; c < window_hidden_size; c++)
				1650	syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
				1651	* g * neu2[c];
				1652	}
				1653	for (a = 0; a < window_hidden_size; a++)
				1654	for (b = 0; b < window_layer_size; b++)
				1655	neu1e[b] += neu2e[a]
				1656	* syn_window_hidden[a * window_layer_size + b];
				1657	for (a = 0; a < window_hidden_size; a++)
				1658	for (b = 0; b < window_layer_size; b++)
				1659	syn_window_hidden[a * window_layer_size + b] += neu2e[a]
				1660	* neu1[b];
				1661	// hidden -> in
				1662	for (a = 0; a < window * 2 + 1; a++)
				1663	if (a != window) {
				1664	c = sentence_position - window + a;
				1665	if (c < 0)
				1666	continue;
				1667	if (c >= sentence_length)
				1668	continue;
				1669	last_word = sen[c];
				1670	if (last_word == -1)
				1671	continue;
				1672	window_offset = a * layer1_size;
				1673	if (a > window)
				1674	window_offset -= layer1_size;
				1675	for (c = 0; c < layer1_size; c++)
				1676	syn0[c + last_word * layer1_size] += neu1e[c
				1677	+ window_offset];
				1678	}
				1679	}
				1680	} else {
				1681	printf("unknown type %i", type);
				1682	exit(0);
				1683	}
				1684	sentence_position++;
				1685	if (sentence_position >= sentence_length) {
				1686	sentence_length = 0;
				1687	continue;
				1688	}
				1689	}
				1690	fclose(fi);
				1691	free(neu1);
				1692	free(neu1e);
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1693	threadPos[(long) id] = -1;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1694	pthread_exit(NULL);
				1695	}
				1696
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1697	void ShowCollocations() {
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1698	long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1699	real f, max_f, maxmax_f;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1700	real *target_sums, bestf[MAX_CC], worstbest;
				1701	long besti[MAX_CC];
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1702	int N = 10, bestp[MAX_CC];
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1703	a = posix_memalign((void *) &target_sums, 128, vocab_size sizeof(real));
				1704
				1705	for (d = cc; d < vocab_size; d++) {
				1706	for (b = 0; b < vocab_size; b++)
				1707	target_sums[b]=0;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1708	for (b = 0; b < N; b++)
				1709	bestf[b]=-1;
				1710	worstbest = -1;
				1711
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1712	maxmax_f = -1;
				1713	maxmax_target = 0;
Marc Kupietz	0a664c1	2016-03-18 13:18:22 +0100	[diff] [blame]	1714	for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1715	if (a != window) {
				1716	max_f = -1;
				1717	window_offset = a * layer1_size;
				1718	if (a > window)
				1719	window_offset -= layer1_size;
				1720	for(target = 0; target < vocab_size; target ++) {
				1721	if(target == d)
				1722	continue;
				1723	f = 0;
				1724	for (c = 0; c < layer1_size; c++)
				1725	f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
				1726	if (f < -MAX_EXP)
				1727	continue;
				1728	else if (f > MAX_EXP)
				1729	continue;
				1730	else
				1731	f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1732	if(f > max_f) {
				1733	max_f = f;
				1734	max_target = target;
				1735	}
Marc Kupietz	0fb5d61	2016-03-18 11:01:21 +0100	[diff] [blame]	1736	target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1737	if(f > worstbest) {
				1738	for (b = 0; b < N; b++) {
				1739	if (f > bestf[b]) {
				1740	for (e = N - 1; e > b; e--) {
				1741	bestf[e] = bestf[e - 1];
				1742	besti[e] = besti[e - 1];
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1743	bestp[e] = bestp[e - 1];
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1744	}
				1745	bestf[b] = f;
				1746	besti[b] = target;
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1747	bestp[b] = window-a;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1748	break;
				1749	}
				1750	}
				1751	worstbest = bestf[N-1];
				1752	}
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1753	}
				1754	printf("%s (%.2f) ", vocab[max_target].word, max_f);
				1755	if(max_f > maxmax_f) {
				1756	maxmax_f = max_f;
				1757	maxmax_target = max_target;
				1758	}
				1759	} else {
				1760	printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
				1761	}
				1762	}
				1763	max_f = -1;
				1764	for (b = 0; b < vocab_size; b++) {
				1765	if(target_sums[b] > max_f) {
				1766	max_f = target_sums[b];
				1767	max_target = b;
				1768	}
				1769	}
				1770	printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz	0fb5d61	2016-03-18 11:01:21 +0100	[diff] [blame]	1771	vocab[max_target].word, max_f,
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1772	vocab[maxmax_target].word, maxmax_f);
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1773	for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1774	printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1775	printf("\n");
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1776	}
				1777	}
				1778
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1779	void TrainModel() {
				1780	long a, b, c, d;
				1781	FILE *fo;
				1782	pthread_t pt = (pthread_t ) malloc(num_threads * sizeof(pthread_t));
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1783	threadPos = malloc(num_threads * sizeof(long long));
				1784	threadIters = malloc(num_threads * sizeof(int));
				1785	char *timebuf = malloc(80);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1786	printf("Starting training using file %s\n", train_file);
				1787	starting_alpha = alpha;
				1788	if (read_vocab_file[0] != 0)
				1789	ReadVocab();
				1790	else
				1791	LearnVocabFromTrainFile();
				1792	if (save_vocab_file[0] != 0)
				1793	SaveVocab();
				1794	if (output_file[0] == 0)
				1795	return;
				1796	InitNet();
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1797	if(cc > 0)
				1798	ShowCollocations();
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1799	if (negative > 0 \|\| nce > 0)
				1800	InitUnigramTable();
				1801	if (negative_classes_file[0] != 0)
				1802	InitClassUnigramTable();
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame^]	1803	start = time(NULL);
				1804	start_clock = clock();
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1805	for (a = 0; a < num_threads; a++)
				1806	pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1807	if(debug_mode > 1)
				1808	pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1809	for (a = 0; a < num_threads; a++)
				1810	pthread_join(pt[a], NULL);
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1811	if(debug_mode > 1) {
				1812	pthread_join(pt[num_threads], NULL);
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame^]	1813	clock_t now = time(NULL);
				1814	clock_t now_clock = clock();
				1815	printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
				1816	if(type == 5) {
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1817	printf("Saving vectors to %s ...", output_file);
				1818	fflush(stdout);
				1819	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1820	fo = fopen(output_file, "wb");
				1821	if (classes == 0) {
				1822	// Save the word vectors
				1823	fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
				1824	for (a = 0; a < vocab_size; a++) {
				1825	fprintf(fo, "%s ", vocab[a].word);
				1826	if (binary)
				1827	for (b = 0; b < layer1_size; b++)
				1828	fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
				1829	else
				1830	for (b = 0; b < layer1_size; b++)
				1831	fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
				1832	fprintf(fo, "\n");
				1833	}
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1834	if(debug_mode > 1)
				1835	fprintf(stderr, "\n");
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1836	} else {
				1837	// Run K-means on the word vectors
				1838	int clcn = classes, iter = 10, closeid;
				1839	int centcn = (int ) malloc(classes * sizeof(int));
				1840	int cl = (int ) calloc(vocab_size, sizeof(int));
				1841	real closev, x;
				1842	real cent = (real ) calloc(classes * layer1_size, sizeof(real));
				1843	for (a = 0; a < vocab_size; a++)
				1844	cl[a] = a % clcn;
				1845	for (a = 0; a < iter; a++) {
				1846	for (b = 0; b < clcn * layer1_size; b++)
				1847	cent[b] = 0;
				1848	for (b = 0; b < clcn; b++)
				1849	centcn[b] = 1;
				1850	for (c = 0; c < vocab_size; c++) {
				1851	for (d = 0; d < layer1_size; d++)
				1852	cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
				1853	centcn[cl[c]]++;
				1854	}
				1855	for (b = 0; b < clcn; b++) {
				1856	closev = 0;
				1857	for (c = 0; c < layer1_size; c++) {
				1858	cent[layer1_size * b + c] /= centcn[b];
				1859	closev += cent[layer1_size * b + c]
				1860	* cent[layer1_size * b + c];
				1861	}
				1862	closev = sqrt(closev);
				1863	for (c = 0; c < layer1_size; c++)
				1864	cent[layer1_size * b + c] /= closev;
				1865	}
				1866	for (c = 0; c < vocab_size; c++) {
				1867	closev = -10;
				1868	closeid = 0;
				1869	for (d = 0; d < clcn; d++) {
				1870	x = 0;
				1871	for (b = 0; b < layer1_size; b++)
				1872	x += cent[layer1_size * d + b]
				1873	* syn0[c * layer1_size + b];
				1874	if (x > closev) {
				1875	closev = x;
				1876	closeid = d;
				1877	}
				1878	}
				1879	cl[c] = closeid;
				1880	}
				1881	}
				1882	// Save the K-means classes
				1883	for (a = 0; a < vocab_size; a++)
				1884	fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
				1885	free(centcn);
				1886	free(cent);
				1887	free(cl);
				1888	}
				1889	fclose(fo);
				1890	if (save_net_file[0] != 0)
				1891	SaveNet();
				1892	}
				1893
				1894	int ArgPos(char str, int argc, char *argv) {
				1895	int a;
				1896	for (a = 1; a < argc; a++)
				1897	if (!strcmp(str, argv[a])) {
				1898	if (a == argc - 1) {
				1899	printf("Argument missing for %s\n", str);
				1900	exit(1);
				1901	}
				1902	return a;
				1903	}
				1904	return -1;
				1905	}
				1906
/*
 * Write the usage text (option list and one example invocation) to stdout.
 * The text contains no conversion specifications, so it is emitted verbatim
 * with fputs().
 */
void print_help() {
  static const char *const help_lines[] = {
    "WORD VECTOR estimation toolkit v 0.1c\n\n",
    "Options:\n",
    "Parameters for training:\n",
    "\t-train <file>\n",
    "\t\tUse text data from <file> to train the model\n",
    "\t-output <file>\n",
    "\t\tUse <file> to save the resulting word vectors / word clusters\n",
    "\t-size <int>\n",
    "\t\tSet size of word vectors; default is 100\n",
    "\t-window <int>\n",
    "\t\tSet max skip length between words; default is 5\n",
    "\t-sample <float>\n",
    "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n",
    "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n",
    "\t-hs <int>\n",
    "\t\tUse Hierarchical Softmax; default is 0 (not used)\n",
    "\t-negative <int>\n",
    "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n",
    "\t-negative-classes <file>\n",
    "\t\tNegative classes to sample from\n",
    "\t-nce <int>\n",
    "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n",
    "\t-threads <int>\n",
    "\t\tUse <int> threads (default 12)\n",
    "\t-iter <int>\n",
    "\t\tRun more training iterations (default 5)\n",
    "\t-min-count <int>\n",
    "\t\tThis will discard words that appear less than <int> times; default is 5\n",
    "\t-alpha <float>\n",
    "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n",
    "\t-classes <int>\n",
    "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n",
    "\t-debug <int>\n",
    "\t\tSet the debug mode (default = 2 = more info during training)\n",
    "\t-binary <int>\n",
    "\t\tSave the resulting vectors in binary mode; default is 0 (off)\n",
    "\t-save-vocab <file>\n",
    "\t\tThe vocabulary will be saved to <file>\n",
    "\t-read-vocab <file>\n",
    "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n",
    "\t-read-net <file>\n",
    "\t\tThe net parameters will be read from <file>, not initialized randomly\n",
    "\t-save-net <file>\n",
    "\t\tThe net parameters will be saved to <file>\n",
    "\t-magic-stop-file <file>\n",
    "\t\tIf the magic file <file> exists training will stop after the current cycle.\n",
    "\t-show-cc <int>\n",
    "\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n",
    "\t-type <int>\n",
    "\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type)\n",
    "\t-cap <int>\n",
    "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n",
    "\nExamples:\n",
    "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n",
  };
  size_t k;
  for (k = 0; k < sizeof(help_lines) / sizeof(help_lines[0]); k++)
    fputs(help_lines[k], stdout);
}
				1978
				1979	int main(int argc, char **argv) {
				1980	int i;
				1981	setlocale(LC_ALL, "");
				1982	if (argc == 1) {
				1983	print_help();
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1984	return 0;
				1985	}
				1986	output_file[0] = 0;
				1987	save_vocab_file[0] = 0;
				1988	read_vocab_file[0] = 0;
				1989	save_net_file[0] = 0;
				1990	read_net_file[0] = 0;
				1991	negative_classes_file[0] = 0;
Marc Kupietz	c7f773b	2017-12-02 12:04:03 +0100	[diff] [blame]	1992	if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
				1993	print_help();
				1994	return(0);
				1995	}
				1996	if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
				1997	print_help();
				1998	return(0);
				1999	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2000	if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
				2001	layer1_size = atoi(argv[i + 1]);
				2002	if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
				2003	strcpy(train_file, argv[i + 1]);
				2004	if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
				2005	strcpy(save_vocab_file, argv[i + 1]);
				2006	if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
				2007	strcpy(read_vocab_file, argv[i + 1]);
				2008	if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
				2009	strcpy(save_net_file, argv[i + 1]);
				2010	if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
				2011	strcpy(read_net_file, argv[i + 1]);
Marc Kupietz	e423f73	2017-12-22 17:57:03 +0100	[diff] [blame]	2012	if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
				2013	strcpy(magic_stop_file, argv[i + 1]);
				2014	if (access(magic_stop_file, F_OK ) != -1) {
				2015	printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
				2016	exit(1);
				2017	}
				2018	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2019	if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
				2020	debug_mode = atoi(argv[i + 1]);
				2021	if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
				2022	binary = atoi(argv[i + 1]);
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	2023	if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
				2024	cc = atoi(argv[i + 1]);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2025	if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
				2026	type = atoi(argv[i + 1]);
				2027	if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
				2028	strcpy(output_file, argv[i + 1]);
				2029	if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
				2030	window = atoi(argv[i + 1]);
				2031	if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
				2032	sample = atof(argv[i + 1]);
				2033	if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
				2034	hs = atoi(argv[i + 1]);
				2035	if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
				2036	negative = atoi(argv[i + 1]);
				2037	if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
				2038	strcpy(negative_classes_file, argv[i + 1]);
				2039	if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
				2040	nce = atoi(argv[i + 1]);
				2041	if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
				2042	num_threads = atoi(argv[i + 1]);
				2043	if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
				2044	iter = atoi(argv[i + 1]);
				2045	if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
				2046	min_count = atoi(argv[i + 1]);
				2047	if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
				2048	classes = atoi(argv[i + 1]);
				2049	if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
				2050	cap = atoi(argv[i + 1]);
				2051	if (type == 0 \|\| type == 2 \|\| type == 4)
				2052	alpha = 0.05;
				2053	if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
				2054	alpha = atof(argv[i + 1]);
				2055	vocab = (struct vocab_word *) calloc(vocab_max_size,
				2056	sizeof(struct vocab_word));
				2057	vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
				2058	expTable = (real ) malloc((EXP_TABLE_SIZE + 1) sizeof(real));
				2059	for (i = 0; i < EXP_TABLE_SIZE; i++) {
				2060	expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
				2061	expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
				2062	}
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	2063	SaveArgs(argc, argv);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2064	TrainModel();
				2065	return 0;
				2066	}
				2067