1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <unistd.h>
19#include <math.h>
20#include <pthread.h>
21
22#define MAX_STRING 100
23#define EXP_TABLE_SIZE 1000
24#define MAX_EXP 6
25#define MAX_SENTENCE_LENGTH 1000
26#define MAX_CC 100
27#define MAX_CODE_LENGTH 40
28
29const int vocab_hash_size = 30000000; // Maximum 30M * 0.7 = 21M words in the vocabulary
30
31typedef float real; // Precision of float numbers
32
33struct vocab_word {
34 long long cn;
35 int *point;
36 char *word, *code, codelen;
37};
38
39char train_file[MAX_STRING], output_file[MAX_STRING];
40char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
41char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
42struct vocab_word *vocab;
43int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
44 num_threads = 12, min_reduce = 1;
45int *vocab_hash;
46long long *threadPos;
47int *threadIters;
48long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
49long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
50 classes = 0;
51real alpha = 0.025, starting_alpha, sample = 1e-3;
52real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
53real avgWordLength = 0;
54clock_t start;
55
56real *syn1_window, *syn1neg_window, *syn1nce_window;
57int w_offset, window_layer_size;
58
59int window_hidden_size = 500;
60real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
61 *syn_hidden_word_nce;
62
63int hs = 0, negative = 5;
64const int table_size = 1e8;
65int *table;
66
67long cc = 0;
68
69// contrastive negative sampling
70char negative_classes_file[MAX_STRING];
71int *word_to_group;
72int *group_to_table; //group_size*table_size
73int class_number;
74
75//nce
76real* noise_distribution;
77int nce = 0;
78
79//param caps
80real CAP_VALUE = 50;
81int cap = 0;
82
83void capParam(real* array, int index) {
84 if (array[index] > CAP_VALUE)
85 array[index] = CAP_VALUE;
86 else if (array[index] < -CAP_VALUE)
87 array[index] = -CAP_VALUE;
88}
89
90real hardTanh(real x) {
91 if (x >= 1) {
92 return 1;
93 } else if (x <= -1) {
94 return -1;
95 } else {
96 return x;
97 }
98}
99
100real dHardTanh(real x, real g) {
101 if (x > 1 && g > 0) {
102 return 0;
103 }
104 if (x < -1 && g < 0) {
105 return 0;
106 }
107 return 1;
108}
109
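// Builds the global negative-sampling table: each word occupies a share of the
// table_size slots proportional to count^0.75 / sum(count^0.75), so drawing a
// uniform random slot samples words from the smoothed unigram distribution.
// The same smoothed distribution is kept in noise_distribution for NCE.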
110void InitUnigramTable() {
111 int a, i;
112 long long train_words_pow = 0;
113 real d1, power = 0.75;
114 table = (int *) malloc(table_size * sizeof(int));
115 for (a = 0; a < vocab_size; a++)
116 train_words_pow += pow(vocab[a].cn, power);
117 i = 0;
118 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
119 for (a = 0; a < table_size; a++) {
120 table[a] = i;
121 if (a / (real) table_size > d1) {
122 i++;
123 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
124 }
125 if (i >= vocab_size)
126 i = vocab_size - 1;
127 }
128
129 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
130 for (a = 0; a < vocab_size; a++)
131 noise_distribution[a] = pow(vocab[a].cn, power)
132 / (real) train_words_pow;
133}
134
135// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
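// Carriage returns are skipped, a newline is returned as the sentence marker "</s>"
// (and pushed back if it terminates a word), and words longer than MAX_STRING - 1
// characters are truncated.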
136void ReadWord(char *word, FILE *fin) {
137 int a = 0, ch;
138 while (!feof(fin)) {
139 ch = fgetc(fin);
140 if (ch == 13)
141 continue;
142 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
143 if (a > 0) {
144 if (ch == '\n')
145 ungetc(ch, fin);
146 break;
147 }
148 if (ch == '\n') {
149 strcpy(word, (char *) "</s>");
150 return;
151 } else
152 continue;
153 }
154 word[a] = ch;
155 a++;
156 if (a >= MAX_STRING - 1)
157 a--; // Truncate words that are too long
158 }
159 word[a] = 0;
160}
161
162// Returns hash value of a word
163int GetWordHash(char *word) {
164 unsigned long long a, hash = 0;
165 for (a = 0; a < strlen(word); a++)
166 hash = hash * 257 + word[a];
167 hash = hash % vocab_hash_size;
168 return hash;
169}
170
171// Returns position of a word in the vocabulary; if the word is not found, returns -1
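// The vocabulary hash table uses open addressing with linear probing: on a
// collision the search advances to the next slot until the word or an empty
// slot (-1) is found.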
172int SearchVocab(char *word) {
173 unsigned int hash = GetWordHash(word);
174 while (1) {
175 if (vocab_hash[hash] == -1)
176 return -1;
177 if (!strcmp(word, vocab[vocab_hash[hash]].word))
178 return vocab_hash[hash];
179 hash = (hash + 1) % vocab_hash_size;
180 }
181 return -1;
182}
183
184// Reads a word and returns its index in the vocabulary
185int ReadWordIndex(FILE *fin) {
186 char word[MAX_STRING];
187 ReadWord(word, fin);
188 if (feof(fin))
189 return -1;
190 return SearchVocab(word);
191}
192
193// Adds a word to the vocabulary
194int AddWordToVocab(char *word) {
195 unsigned int hash, length = strlen(word) + 1;
196 if (length > MAX_STRING)
197 length = MAX_STRING;
198 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
199 strcpy(vocab[vocab_size].word, word);
200 vocab[vocab_size].cn = 0;
201 vocab_size++;
202 // Reallocate memory if needed
203 if (vocab_size + 2 >= vocab_max_size) {
204 vocab_max_size += 1000;
205 vocab = (struct vocab_word *) realloc(vocab,
206 vocab_max_size * sizeof(struct vocab_word));
207 }
208 hash = GetWordHash(word);
209 while (vocab_hash[hash] != -1)
210 hash = (hash + 1) % vocab_hash_size;
211 vocab_hash[hash] = vocab_size - 1;
212 return vocab_size - 1;
213}
214
215// Used later for sorting by word counts
216int VocabCompare(const void *a, const void *b) {
217 return ((struct vocab_word *) b)->cn - ((struct vocab_word *) a)->cn;
218}
219
220// Sorts the vocabulary by frequency using word counts
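// Also discards words occurring fewer than min_count times, rebuilds the hash
// table, and accumulates avgWordLength (the count-weighted average token length
// including one separator character), which ReadVocab later uses to estimate the
// number of tokens in the training file.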
221void SortVocab() {
222 int a, size;
223 unsigned int hash;
224 // Sort the vocabulary and keep </s> at the first position
225 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
226 for (a = 0; a < vocab_hash_size; a++)
227 vocab_hash[a] = -1;
228 size = vocab_size;
229 train_words = 0;
230 for (a = 0; a < size; a++) {
231 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
232 // Words occurring less than min_count times will be discarded from the vocab
233 if ((vocab[a].cn < min_count) && (a != 0)) {
234 vocab_size--;
235 free(vocab[a].word);
236 } else {
237 // Hash will be recomputed, as it is no longer valid after sorting
238 hash = GetWordHash(vocab[a].word);
239 while (vocab_hash[hash] != -1)
240 hash = (hash + 1) % vocab_hash_size;
241 vocab_hash[hash] = a;
242 train_words += vocab[a].cn;
243 }
244 }
245 avgWordLength /= train_words;
246 vocab = (struct vocab_word *) realloc(vocab,
247 (vocab_size + 1) * sizeof(struct vocab_word));
248 // Allocate memory for the binary tree construction
249 for (a = 0; a < vocab_size; a++) {
250 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
251 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
252 }
253}
254
255// Reduces the vocabulary by removing infrequent tokens
256void ReduceVocab() {
257 int a, b = 0;
258 unsigned int hash;
259 for (a = 0; a < vocab_size; a++)
260 if (vocab[a].cn > min_reduce) {
261 vocab[b].cn = vocab[a].cn;
262 vocab[b].word = vocab[a].word;
263 b++;
264 } else
265 free(vocab[a].word);
266 vocab_size = b;
267 for (a = 0; a < vocab_hash_size; a++)
268 vocab_hash[a] = -1;
269 for (a = 0; a < vocab_size; a++) {
270 // Hash will be recomputed, as it is no longer valid
271 hash = GetWordHash(vocab[a].word);
272 while (vocab_hash[hash] != -1)
273 hash = (hash + 1) % vocab_hash_size;
274 vocab_hash[hash] = a;
275 }
276 fflush(stdout);
277 min_reduce++;
278}
279
280// Create binary Huffman tree using the word counts
281// Frequent words will have short, unique binary codes
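// The tree is built with the classic two-pointer merge over the count-sorted
// vocabulary: pos1 walks the word counts from smallest to largest, pos2 walks the
// newly created internal nodes, and at every step the two smallest available nodes
// are merged. vocab[a].code stores the left/right decisions and vocab[a].point the
// internal-node indices on the path from the root.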
282void CreateBinaryTree() {
283 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
284 char code[MAX_CODE_LENGTH];
285 long long *count = (long long *) calloc(vocab_size * 2 + 1,
286 sizeof(long long));
287 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
288 sizeof(long long));
289 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
290 sizeof(long long));
291 for (a = 0; a < vocab_size; a++)
292 count[a] = vocab[a].cn;
293 for (a = vocab_size; a < vocab_size * 2; a++)
294 count[a] = 1e15;
295 pos1 = vocab_size - 1;
296 pos2 = vocab_size;
297 // The following algorithm constructs the Huffman tree by adding one node at a time
298 for (a = 0; a < vocab_size - 1; a++) {
299 // First, find two smallest nodes 'min1, min2'
300 if (pos1 >= 0) {
301 if (count[pos1] < count[pos2]) {
302 min1i = pos1;
303 pos1--;
304 } else {
305 min1i = pos2;
306 pos2++;
307 }
308 } else {
309 min1i = pos2;
310 pos2++;
311 }
312 if (pos1 >= 0) {
313 if (count[pos1] < count[pos2]) {
314 min2i = pos1;
315 pos1--;
316 } else {
317 min2i = pos2;
318 pos2++;
319 }
320 } else {
321 min2i = pos2;
322 pos2++;
323 }
324 count[vocab_size + a] = count[min1i] + count[min2i];
325 parent_node[min1i] = vocab_size + a;
326 parent_node[min2i] = vocab_size + a;
327 binary[min2i] = 1;
328 }
329 // Now assign binary code to each vocabulary word
330 for (a = 0; a < vocab_size; a++) {
331 b = a;
332 i = 0;
333 while (1) {
334 code[i] = binary[b];
335 point[i] = b;
336 i++;
337 b = parent_node[b];
338 if (b == vocab_size * 2 - 2)
339 break;
340 }
341 vocab[a].codelen = i;
342 vocab[a].point[0] = vocab_size - 2;
343 for (b = 0; b < i; b++) {
344 vocab[a].code[i - b - 1] = code[b];
345 vocab[a].point[i - b] = point[b] - vocab_size;
346 }
347 }
348 free(count);
349 free(binary);
350 free(parent_node);
351}
352
353void LearnVocabFromTrainFile() {
354 char word[MAX_STRING];
355 FILE *fin;
356 long long a, i;
357 for (a = 0; a < vocab_hash_size; a++)
358 vocab_hash[a] = -1;
359 fin = fopen(train_file, "rb");
360 if (fin == NULL) {
361 printf("ERROR: training data file not found!\n");
362 exit(1);
363 }
364 vocab_size = 0;
365 AddWordToVocab((char *) "</s>");
366 while (1) {
367 ReadWord(word, fin);
368 if (feof(fin))
369 break;
370 train_words++;
371 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
372 printf("%lldK%c", train_words / 1000, 13);
373 fflush(stdout);
374 }
375 i = SearchVocab(word);
376 if (i == -1) {
377 a = AddWordToVocab(word);
378 vocab[a].cn = 1;
379 } else
380 vocab[i].cn++;
381 if (vocab_size > vocab_hash_size * 0.7)
382 ReduceVocab();
383 }
384 SortVocab();
385 if (debug_mode > 0) {
386 printf("Vocab size: %lld\n", vocab_size);
387 printf("Words in train file: %lld\n", train_words);
388 }
389 file_size = ftell(fin);
390 fclose(fin);
391}
392
393void SaveVocab() {
394 long long i;
395 FILE *fo = fopen(save_vocab_file, "wb");
396 for (i = 0; i < vocab_size; i++)
397 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
398 fclose(fo);
399}
400
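// Reads a precomputed "word count" list written by SaveVocab. Because those counts
// refer to the corpus the vocabulary was built from, the number of words in the
// current training file is afterwards estimated as file_size / avgWordLength.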
401void ReadVocab() {
402 long long a, i = 0;
403 char c;
404 char word[MAX_STRING];
405 FILE *fin = fopen(read_vocab_file, "rb");
406 if (fin == NULL) {
407 printf("Vocabulary file not found\n");
408 exit(1);
409 }
410 for (a = 0; a < vocab_hash_size; a++)
411 vocab_hash[a] = -1;
412 vocab_size = 0;
413 while (1) {
414 ReadWord(word, fin);
415 if (feof(fin))
416 break;
417 a = AddWordToVocab(word);
418 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
419 i++;
420 }
421 fclose(fin);
422 fin = fopen(train_file, "rb");
423 if (fin == NULL) {
424 printf("ERROR: training data file not found!\n");
425 exit(1);
426 }
427 fseek(fin, 0, SEEK_END);
428 file_size = ftell(fin);
429 fclose(fin);
430 SortVocab();
431 if (debug_mode > 0) {
432 printf("Vocab size: %lld\n", vocab_size);
433 if(*read_vocab_file) {
434 printf("Words in vocab's train file: %lld\n", train_words);
435 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
436 } else {
437 printf("Words in train file: %lld\n", train_words);
438 }
439 }
440 if(*read_vocab_file) {
441 train_words = file_size / avgWordLength;
442 if(debug_mode > 0)
443 printf("Estimated words in train file: %lld\n", train_words);
444 }
445}
446
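// Reads negative_classes_file and builds one negative-sampling table per class, so
// that negative samples for a word can be drawn from that word's own class. Each
// record appears to consist of a class label, a word, and a third field that is
// read but ignored (presumably a count); this format is inferred from the parsing
// code below.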
447void InitClassUnigramTable() {
448 long long a, c;
449 printf("loading class unigrams \n");
450 FILE *fin = fopen(negative_classes_file, "rb");
451 if (fin == NULL) {
452 printf("ERROR: class file not found!\n");
453 exit(1);
454 }
455 word_to_group = (int *) malloc(vocab_size * sizeof(int));
456 for (a = 0; a < vocab_size; a++)
457 word_to_group[a] = -1;
458 char class[MAX_STRING];
459 char prev_class[MAX_STRING];
460 prev_class[0] = 0;
461 char word[MAX_STRING];
462 class_number = -1;
463 while (1) {
464 if (feof(fin))
465 break;
466 ReadWord(class, fin);
467 ReadWord(word, fin);
468 int word_index = SearchVocab(word);
469 if (word_index != -1) {
470 if (strcmp(class, prev_class) != 0) {
471 class_number++;
472 strcpy(prev_class, class);
473 }
474 word_to_group[word_index] = class_number;
475 }
476 ReadWord(word, fin);
477 }
478 class_number++;
479 fclose(fin);
480
481 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
482 long long train_words_pow = 0;
483 real d1, power = 0.75;
484
485 for (c = 0; c < class_number; c++) {
486 long long offset = c * table_size;
487 train_words_pow = 0;
488 for (a = 0; a < vocab_size; a++)
489 if (word_to_group[a] == c)
490 train_words_pow += pow(vocab[a].cn, power);
491 int i = 0;
492 while (word_to_group[i] != c && i < vocab_size)
493 i++;
494 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
495 for (a = 0; a < table_size; a++) {
496 //printf("index %lld , word %d\n", a, i);
497 group_to_table[offset + a] = i;
498 if (a / (real) table_size > d1) {
499 i++;
500 while (word_to_group[i] != c && i < vocab_size)
501 i++;
502 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
503 }
504 if (i >= vocab_size)
505 while (word_to_group[i] != c && i >= 0)
506 i--;
507 }
508 }
509}
510
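// Writes the command-line arguments of the run to "<output_file>.args" so the
// training configuration can be looked up later.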
511void SaveArgs(int argc, char **argv) {
512 unsigned int i;
513 size_t len = 0;
514 char *_all_args, *all_args;
515 char *args_file = malloc(strlen(output_file) + strlen(".args") + 1);
516 sprintf(args_file, "%s.args", output_file);
517 FILE *fargs = fopen(args_file, "w");
518 if (fargs == NULL) {
519 printf("Cannot save args to %s.\n", args_file);
520 return;
521 }
522
523 for(i=1; i<argc; i++) {
524 len += strlen(argv[i]);
525 }
526
527 _all_args = all_args = (char *)malloc(len+argc-1);
528
529 for(i=1; i<argc; i++) {
530 memcpy(_all_args, argv[i], strlen(argv[i]));
531 _all_args += strlen(argv[i])+1;
532 *(_all_args-1) = ' ';
533 }
534 *(_all_args-1) = 0;
535
536 fprintf(fargs, "%s\n", all_args);
537 fclose(fargs);
538
539 free(all_args);
540
541 return;
542}
543
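// Dumps the raw network parameters (syn0 followed by syn1neg_window) so they can
// be reloaded later via read_net_file; only supported for type 3 (structured
// skip-gram) with negative sampling.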
544void SaveNet() {
545 if(type != 3 || negative <= 0) {
546 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
547 return;
548 }
549
550 FILE *fnet = fopen(save_net_file, "wb");
551 if (fnet == NULL) {
552 printf("Cannot open net parameter file for writing\n");
553 exit(1);
554 }
555 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
556 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
557 fclose(fnet);
558}
559
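// Allocates and initializes the network. syn0 holds one layer1_size-dimensional
// input vector per vocabulary word. The output-side matrices depend on the
// architecture; window_layer_size = layer1_size * window * 2 because the
// window-based models keep a separate output vector for each of the 2 * window
// context positions. syn0 is filled with small random values unless parameters
// are read back from read_net_file (type 3 with negative sampling only).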
560void InitNet() {
561 long long a, b;
562 unsigned long long next_random = 1;
563 long long read;
564
565 window_layer_size = layer1_size * window * 2;
566 a = posix_memalign((void **) &syn0, 128,
567 (long long) vocab_size * layer1_size * sizeof(real));
568 if (syn0 == NULL) {
569 printf("Memory allocation failed\n");
570 exit(1);
571 }
572
573 if (hs) {
574 a = posix_memalign((void **) &syn1, 128,
575 (long long) vocab_size * layer1_size * sizeof(real));
576 if (syn1 == NULL) {
577 printf("Memory allocation failed\n");
578 exit(1);
579 }
580 a = posix_memalign((void **) &syn1_window, 128,
581 (long long) vocab_size * window_layer_size * sizeof(real));
582 if (syn1_window == NULL) {
583 printf("Memory allocation failed\n");
584 exit(1);
585 }
586 a = posix_memalign((void **) &syn_hidden_word, 128,
587 (long long) vocab_size * window_hidden_size * sizeof(real));
588 if (syn_hidden_word == NULL) {
589 printf("Memory allocation failed\n");
590 exit(1);
591 }
592
593 for (a = 0; a < vocab_size; a++)
594 for (b = 0; b < layer1_size; b++)
595 syn1[a * layer1_size + b] = 0;
596 for (a = 0; a < vocab_size; a++)
597 for (b = 0; b < window_layer_size; b++)
598 syn1_window[a * window_layer_size + b] = 0;
599 for (a = 0; a < vocab_size; a++)
600 for (b = 0; b < window_hidden_size; b++)
601 syn_hidden_word[a * window_hidden_size + b] = 0;
602 }
603 if (negative > 0) {
604 if(type == 0) {
605 a = posix_memalign((void **) &syn1neg, 128,
606 (long long) vocab_size * layer1_size * sizeof(real));
607 if (syn1neg == NULL) {
608 printf("Memory allocation failed\n");
609 exit(1);
610 }
611 for (a = 0; a < vocab_size; a++)
612 for (b = 0; b < layer1_size; b++)
613 syn1neg[a * layer1_size + b] = 0;
614 } else if (type == 3) {
615 a = posix_memalign((void **) &syn1neg_window, 128,
616 (long long) vocab_size * window_layer_size * sizeof(real));
617 if (syn1neg_window == NULL) {
618 printf("Memory allocation failed\n");
619 exit(1);
620 }
621 for (a = 0; a < vocab_size; a++)
622 for (b = 0; b < window_layer_size; b++)
623 syn1neg_window[a * window_layer_size + b] = 0;
624 } else if (type == 4) {
625 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
626 (long long) vocab_size * window_hidden_size * sizeof(real));
627 if (syn_hidden_word_neg == NULL) {
628 printf("Memory allocation failed\n");
629 exit(1);
630 }
631 for (a = 0; a < vocab_size; a++)
632 for (b = 0; b < window_hidden_size; b++)
633 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
634 }
635 }
636 if (nce > 0) {
637 a = posix_memalign((void **) &syn1nce, 128,
638 (long long) vocab_size * layer1_size * sizeof(real));
639 if (syn1nce == NULL) {
640 printf("Memory allocation failed\n");
641 exit(1);
642 }
643 a = posix_memalign((void **) &syn1nce_window, 128,
644 (long long) vocab_size * window_layer_size * sizeof(real));
645 if (syn1nce_window == NULL) {
646 printf("Memory allocation failed\n");
647 exit(1);
648 }
649 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
650 (long long) vocab_size * window_hidden_size * sizeof(real));
651 if (syn_hidden_word_nce == NULL) {
652 printf("Memory allocation failed\n");
653 exit(1);
654 }
655
656 for (a = 0; a < vocab_size; a++)
657 for (b = 0; b < layer1_size; b++)
658 syn1nce[a * layer1_size + b] = 0;
659 for (a = 0; a < vocab_size; a++)
660 for (b = 0; b < window_layer_size; b++)
661 syn1nce_window[a * window_layer_size + b] = 0;
662 for (a = 0; a < vocab_size; a++)
663 for (b = 0; b < window_hidden_size; b++)
664 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
665 }
666
667 if(type == 4) {
668 a = posix_memalign((void **) &syn_window_hidden, 128,
669 window_hidden_size * window_layer_size * sizeof(real));
670 if (syn_window_hidden == NULL) {
671 printf("Memory allocation failed\n");
672 exit(1);
673 }
674 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
675 next_random = next_random * (unsigned long long) 25214903917 + 11;
676 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
677 - 0.5) / (window_hidden_size * window_layer_size);
678 }
679 }
680
681 if (read_net_file[0] == 0) {
682 for (a = 0; a < vocab_size; a++)
683 for (b = 0; b < layer1_size; b++) {
684 next_random = next_random * (unsigned long long) 25214903917
685 + 11;
686 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
687 / (real) 65536) - 0.5) / layer1_size;
688 }
689 } else if(type == 3 && negative > 0) {
690 FILE *fnet = fopen(read_net_file, "rb");
691 if (fnet == NULL) {
692 printf("Net parameter file not found\n");
693 exit(1);
694 }
695 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
696 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
697 if(read != vocab_size * layer1_size) {
698 fprintf(stderr, "read-net failed %lld\n", read);
699 exit(-1);
700 }
701 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
702 if(read != (long long) vocab_size * window_layer_size) {
703 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
704 (long long) sizeof(real) * vocab_size * window_layer_size);
705 exit(-1);
706 }
707 fgetc(fnet);
708 if(!feof(fnet)) {
709 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
710 exit(-1);
711 }
712 fclose(fnet);
713 } else {
714 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
715 exit(-1);
716 }
717
718 CreateBinaryTree();
719}
720
721char *currentDateTime(char *buf, real offset) {
722 time_t t;
723 time(&t);
724 t += (long) offset;
725 struct tm tstruct;
726 tstruct = *localtime(&t);
727 strftime(buf, 80, "%c", &tstruct);
728 return buf;
729}
730
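// Progress monitor: once per second it sums, over all live training threads, the
// bytes consumed in completed and current passes, and prints the learning rate,
// the portion done, throughput, elapsed time, estimated time to go and an ETA.
// It terminates once every thread has marked itself finished (threadPos == -1).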
731void *MonitorThread(void *id) {
732 char *timebuf = malloc(80);
733 int i, n=num_threads;
734 long long sum;
735 sleep(1);
736 while(n > 0) {
737 sleep(1);
738 sum = n = 0;
739 for(i=0; i < num_threads; i++) {
740 if(threadPos[i] >= 0) {
741 sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
742 n++;
743 } else {
744 sum += iter * file_size / num_threads;
745 }
746 }
747 if(n == 0)
748 break;
749 real finished_portion = (real) sum / (float) (file_size * iter);
750 long long now = clock();
751 long long elapsed = (now - start) / CLOCKS_PER_SEC / num_threads;
752 long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed) * ((real) num_threads / n) ;
753
754 printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/t/s TE: %llds TTG: %llds ETA: %s\033[K",
755 alpha,
756 finished_portion * 100,
757 (float) sum / elapsed / num_threads / 1000,
758 elapsed,
759 ttg,
760 currentDateTime(timebuf, ttg)
761 );
762 fflush(stdout);
763 }
764 pthread_exit(NULL);
765}
766
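// One training worker. Each thread processes its own slice [start_pos, end_pos) of
// the training file for iter passes, decaying alpha linearly with the global word
// count. The architecture is selected by type:
//   0 = CBOW, 1 = skip-gram, 2 = continuous window (cwindow),
//   3 = structured skip-gram (position-dependent output vectors),
//   4 = cwindow with a non-linear hidden layer ("senna").
// Each of them can be combined with hierarchical softmax (hs), negative sampling
// (negative) and/or noise-contrastive estimation (nce).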
767void *TrainModelThread(void *id) {
768 long long a, b, d, cw, word, last_word, sentence_length = 0,
769 sentence_position = 0;
770 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
771 long long l1, l2, c, target, label, local_iter = iter;
772 unsigned long long next_random = (long long) id;
773 real f, g;
774 int input_len_1 = layer1_size;
775 int window_offset = -1;
776 if (type == 2 || type == 4) {
777 input_len_1 = window_layer_size;
778 }
779 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
780 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
781 threadIters[(long) id] = iter;
782
783 int input_len_2 = 0;
784 if (type == 4) {
785 input_len_2 = window_hidden_size;
786 }
787 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
788 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
789
790 FILE *fi = fopen(train_file, "rb");
791 long long start_pos = file_size / (long long) num_threads * (long long) id;
792 long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) - 1;
793 long long current_pos = start_pos;
794 long long last_pos = start_pos;
795 fseek(fi, start_pos, SEEK_SET);
796 while (1) {
797 if (current_pos - last_pos > 100000) {
798 word_count_actual += word_count - last_word_count;
799 last_pos = current_pos;
800 last_word_count = word_count;
801 alpha = starting_alpha
802 * (1 - word_count_actual / (real) (iter * train_words + 1));
803 if (alpha < starting_alpha * 0.0001)
804 alpha = starting_alpha * 0.0001;
805 }
806 if (sentence_length == 0) {
807 while (1) {
808 word = ReadWordIndex(fi);
809 if (feof(fi))
810 break;
811 if (word == -1)
812 continue;
813 word_count++;
814 if (word == 0)
815 break;
816 // The subsampling randomly discards frequent words while keeping the ranking the same
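				// With f = cn / train_words and t = sample, the keep probability
				// implemented below is sqrt(t / f) + t / f (capped at 1). For type 3 a
				// discarded word is replaced by the marker -2 so that window positions
				// stay aligned.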
817 if (sample > 0) {
818 real ran = (sqrt(vocab[word].cn / (sample * train_words))
819 + 1) * (sample * train_words) / vocab[word].cn;
820 next_random = next_random * (unsigned long long) 25214903917
821 + 11;
822 if (ran < (next_random & 0xFFFF) / (real) 65536) {
823 if(type == 3) // in structured skipgrams
824 word = -2; // keep the window position correct
825 else
826 continue;
827 }
828 }
829 sen[sentence_length] = word;
830 sentence_length++;
831 if (sentence_length >= MAX_SENTENCE_LENGTH)
832 break;
833 }
834 sentence_position = 0;
835 }
836 current_pos = threadPos[(long) id] = ftell(fi);
837 if (feof(fi) || current_pos >= end_pos) {
838 word_count_actual += word_count - last_word_count;
839 threadIters[(long) id]--;
840 local_iter--;
841 if (local_iter == 0)
842 break;
843 word_count = 0;
844 current_pos = last_pos = start_pos;
845 last_word_count = 0;
846 sentence_length = 0;
847 fseek(fi, start_pos, SEEK_SET);
848 continue;
849 }
850 word = sen[sentence_position];
851 while (word == -2 && sentence_position < sentence_length)
852 word = sen[++sentence_position];
853 if (sentence_position >= sentence_length) {
854 sentence_length = 0;
855 continue;
856 }
857 if (word < 0)
858 continue;
859 for (c = 0; c < input_len_1; c++)
860 neu1[c] = 0;
861 for (c = 0; c < input_len_1; c++)
862 neu1e[c] = 0;
863 for (c = 0; c < input_len_2; c++)
864 neu2[c] = 0;
865 for (c = 0; c < input_len_2; c++)
866 neu2e[c] = 0;
867 next_random = next_random * (unsigned long long) 25214903917 + 11;
868 b = next_random % window;
869 if (type == 0) { //train the cbow architecture
870 // in -> hidden
871 cw = 0;
872 for (a = b; a < window * 2 + 1 - b; a++)
873 if (a != window) {
874 c = sentence_position - window + a;
875 if (c < 0)
876 continue;
877 if (c >= sentence_length)
878 continue;
879 last_word = sen[c];
880 if (last_word == -1)
881 continue;
882 for (c = 0; c < layer1_size; c++)
883 neu1[c] += syn0[c + last_word * layer1_size];
884 cw++;
885 }
886 if (cw) {
887 for (c = 0; c < layer1_size; c++)
888 neu1[c] /= cw;
889 if (hs)
890 for (d = 0; d < vocab[word].codelen; d++) {
891 f = 0;
892 l2 = vocab[word].point[d] * layer1_size;
893 // Propagate hidden -> output
894 for (c = 0; c < layer1_size; c++)
895 f += neu1[c] * syn1[c + l2];
896 if (f <= -MAX_EXP)
897 continue;
898 else if (f >= MAX_EXP)
899 continue;
900 else
901 f = expTable[(int) ((f + MAX_EXP)
902 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
903 // 'g' is the gradient multiplied by the learning rate
904 g = (1 - vocab[word].code[d] - f) * alpha;
905 // Propagate errors output -> hidden
906 for (c = 0; c < layer1_size; c++)
907 neu1e[c] += g * syn1[c + l2];
908 // Learn weights hidden -> output
909 for (c = 0; c < layer1_size; c++)
910 syn1[c + l2] += g * neu1[c];
911 if (cap == 1)
912 for (c = 0; c < layer1_size; c++)
913 capParam(syn1, c + l2);
914 }
915 // NEGATIVE SAMPLING
916 if (negative > 0)
917 for (d = 0; d < negative + 1; d++) {
918 if (d == 0) {
919 target = word;
920 label = 1;
921 } else {
922 next_random = next_random
923 * (unsigned long long) 25214903917 + 11;
924 if (word_to_group != NULL
925 && word_to_group[word] != -1) {
926 target = word;
927 while (target == word) {
928 target = group_to_table[word_to_group[word]
929 * table_size
930 + (next_random >> 16) % table_size];
931 next_random = next_random
932 * (unsigned long long) 25214903917
933 + 11;
934 }
935 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
936 } else {
937 target =
938 table[(next_random >> 16) % table_size];
939 }
940 if (target == 0)
941 target = next_random % (vocab_size - 1) + 1;
942 if (target == word)
943 continue;
944 label = 0;
945 }
946 l2 = target * layer1_size;
947 f = 0;
948 for (c = 0; c < layer1_size; c++)
949 f += neu1[c] * syn1neg[c + l2];
950 if (f > MAX_EXP)
951 g = (label - 1) * alpha;
952 else if (f < -MAX_EXP)
953 g = (label - 0) * alpha;
954 else
955 g = (label
956 - expTable[(int) ((f + MAX_EXP)
957 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
958 * alpha;
959 for (c = 0; c < layer1_size; c++)
960 neu1e[c] += g * syn1neg[c + l2];
961 for (c = 0; c < layer1_size; c++)
962 syn1neg[c + l2] += g * neu1[c];
963 if (cap == 1)
964 for (c = 0; c < layer1_size; c++)
965 capParam(syn1neg, c + l2);
966 }
967 // Noise Contrastive Estimation
968 if (nce > 0)
969 for (d = 0; d < nce + 1; d++) {
970 if (d == 0) {
971 target = word;
972 label = 1;
973 } else {
974 next_random = next_random
975 * (unsigned long long) 25214903917 + 11;
976 if (word_to_group != NULL
977 && word_to_group[word] != -1) {
978 target = word;
979 while (target == word) {
980 target = group_to_table[word_to_group[word]
981 * table_size
982 + (next_random >> 16) % table_size];
983 next_random = next_random
984 * (unsigned long long) 25214903917
985 + 11;
986 }
987 } else {
988 target =
989 table[(next_random >> 16) % table_size];
990 }
991 if (target == 0)
992 target = next_random % (vocab_size - 1) + 1;
993 if (target == word)
994 continue;
995 label = 0;
996 }
997 l2 = target * layer1_size;
998 f = 0;
999
1000 for (c = 0; c < layer1_size; c++)
1001 f += neu1[c] * syn1nce[c + l2];
1002 if (f > MAX_EXP)
1003 g = (label - 1) * alpha;
1004 else if (f < -MAX_EXP)
1005 g = (label - 0) * alpha;
1006 else {
1007 f = exp(f);
1008 g =
1009 (label
1010 - f
1011 / (noise_distribution[target]
1012 * nce + f)) * alpha;
1013 }
1014 for (c = 0; c < layer1_size; c++)
1015 neu1e[c] += g * syn1nce[c + l2];
1016 for (c = 0; c < layer1_size; c++)
1017 syn1nce[c + l2] += g * neu1[c];
1018 if (cap == 1)
1019 for (c = 0; c < layer1_size; c++)
1020 capParam(syn1nce, c + l2);
1021 }
1022 // hidden -> in
1023 for (a = b; a < window * 2 + 1 - b; a++)
1024 if (a != window) {
1025 c = sentence_position - window + a;
1026 if (c < 0)
1027 continue;
1028 if (c >= sentence_length)
1029 continue;
1030 last_word = sen[c];
1031 if (last_word == -1)
1032 continue;
1033 for (c = 0; c < layer1_size; c++)
1034 syn0[c + last_word * layer1_size] += neu1e[c];
1035 }
1036 }
1037 } else if (type == 1) { //train skip-gram
1038 for (a = b; a < window * 2 + 1 - b; a++)
1039 if (a != window) {
1040 c = sentence_position - window + a;
1041 if (c < 0)
1042 continue;
1043 if (c >= sentence_length)
1044 continue;
1045 last_word = sen[c];
1046 if (last_word == -1)
1047 continue;
1048 l1 = last_word * layer1_size;
1049 for (c = 0; c < layer1_size; c++)
1050 neu1e[c] = 0;
1051 // HIERARCHICAL SOFTMAX
1052 if (hs)
1053 for (d = 0; d < vocab[word].codelen; d++) {
1054 f = 0;
1055 l2 = vocab[word].point[d] * layer1_size;
1056 // Propagate hidden -> output
1057 for (c = 0; c < layer1_size; c++)
1058 f += syn0[c + l1] * syn1[c + l2];
1059 if (f <= -MAX_EXP)
1060 continue;
1061 else if (f >= MAX_EXP)
1062 continue;
1063 else
1064 f = expTable[(int) ((f + MAX_EXP)
1065 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1066 // 'g' is the gradient multiplied by the learning rate
1067 g = (1 - vocab[word].code[d] - f) * alpha;
1068 // Propagate errors output -> hidden
1069 for (c = 0; c < layer1_size; c++)
1070 neu1e[c] += g * syn1[c + l2];
1071 // Learn weights hidden -> output
1072 for (c = 0; c < layer1_size; c++)
1073 syn1[c + l2] += g * syn0[c + l1];
1074 if (cap == 1)
1075 for (c = 0; c < layer1_size; c++)
1076 capParam(syn1, c + l2);
1077 }
1078 // NEGATIVE SAMPLING
1079 if (negative > 0)
1080 for (d = 0; d < negative + 1; d++) {
1081 if (d == 0) {
1082 target = word;
1083 label = 1;
1084 } else {
1085 next_random = next_random
1086 * (unsigned long long) 25214903917 + 11;
1087 if (word_to_group != NULL
1088 && word_to_group[word] != -1) {
1089 target = word;
1090 while (target == word) {
1091 target =
1092 group_to_table[word_to_group[word]
1093 * table_size
1094 + (next_random >> 16)
1095 % table_size];
1096 next_random =
1097 next_random
1098 * (unsigned long long) 25214903917
1099 + 11;
1100 }
1101 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1102 } else {
1103 target = table[(next_random >> 16)
1104 % table_size];
1105 }
1106 if (target == 0)
1107 target = next_random % (vocab_size - 1) + 1;
1108 if (target == word)
1109 continue;
1110 label = 0;
1111 }
1112 l2 = target * layer1_size;
1113 f = 0;
1114 for (c = 0; c < layer1_size; c++)
1115 f += syn0[c + l1] * syn1neg[c + l2];
1116 if (f > MAX_EXP)
1117 g = (label - 1) * alpha;
1118 else if (f < -MAX_EXP)
1119 g = (label - 0) * alpha;
1120 else
1121 g =
1122 (label
1123 - expTable[(int) ((f + MAX_EXP)
1124 * (EXP_TABLE_SIZE
1125 / MAX_EXP / 2))])
1126 * alpha;
1127 for (c = 0; c < layer1_size; c++)
1128 neu1e[c] += g * syn1neg[c + l2];
1129 for (c = 0; c < layer1_size; c++)
1130 syn1neg[c + l2] += g * syn0[c + l1];
1131 if (cap == 1)
1132 for (c = 0; c < layer1_size; c++)
1133 capParam(syn1neg, c + l2);
1134 }
1135 //Noise Contrastive Estimation
1136 if (nce > 0)
1137 for (d = 0; d < nce + 1; d++) {
1138 if (d == 0) {
1139 target = word;
1140 label = 1;
1141 } else {
1142 next_random = next_random
1143 * (unsigned long long) 25214903917 + 11;
1144 if (word_to_group != NULL
1145 && word_to_group[word] != -1) {
1146 target = word;
1147 while (target == word) {
1148 target =
1149 group_to_table[word_to_group[word]
1150 * table_size
1151 + (next_random >> 16)
1152 % table_size];
1153 next_random =
1154 next_random
1155 * (unsigned long long) 25214903917
1156 + 11;
1157 }
1158 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1159 } else {
1160 target = table[(next_random >> 16)
1161 % table_size];
1162 }
1163 if (target == 0)
1164 target = next_random % (vocab_size - 1) + 1;
1165 if (target == word)
1166 continue;
1167 label = 0;
1168 }
1169 l2 = target * layer1_size;
1170 f = 0;
1171 for (c = 0; c < layer1_size; c++)
1172 f += syn0[c + l1] * syn1nce[c + l2];
1173 if (f > MAX_EXP)
1174 g = (label - 1) * alpha;
1175 else if (f < -MAX_EXP)
1176 g = (label - 0) * alpha;
1177 else {
1178 f = exp(f);
1179 g = (label
1180 - f
1181 / (noise_distribution[target]
1182 * nce + f)) * alpha;
1183 }
1184 for (c = 0; c < layer1_size; c++)
1185 neu1e[c] += g * syn1nce[c + l2];
1186 for (c = 0; c < layer1_size; c++)
1187 syn1nce[c + l2] += g * syn0[c + l1];
1188 if (cap == 1)
1189 for (c = 0; c < layer1_size; c++)
1190 capParam(syn1nce, c + l2);
1191 }
1192 // Learn weights input -> hidden
1193 for (c = 0; c < layer1_size; c++)
1194 syn0[c + l1] += neu1e[c];
1195 }
1196 } else if (type == 2) { //train the cwindow architecture
1197 // in -> hidden
1198 cw = 0;
1199 for (a = 0; a < window * 2 + 1; a++)
1200 if (a != window) {
1201 c = sentence_position - window + a;
1202 if (c < 0)
1203 continue;
1204 if (c >= sentence_length)
1205 continue;
1206 last_word = sen[c];
1207 if (last_word == -1)
1208 continue;
1209 window_offset = a * layer1_size;
1210 if (a > window)
1211 window_offset -= layer1_size;
1212 for (c = 0; c < layer1_size; c++)
1213 neu1[c + window_offset] += syn0[c
1214 + last_word * layer1_size];
1215 cw++;
1216 }
1217 if (cw) {
1218 if (hs)
1219 for (d = 0; d < vocab[word].codelen; d++) {
1220 f = 0;
1221 l2 = vocab[word].point[d] * window_layer_size;
1222 // Propagate hidden -> output
1223 for (c = 0; c < window_layer_size; c++)
1224 f += neu1[c] * syn1_window[c + l2];
1225 if (f <= -MAX_EXP)
1226 continue;
1227 else if (f >= MAX_EXP)
1228 continue;
1229 else
1230 f = expTable[(int) ((f + MAX_EXP)
1231 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1232 // 'g' is the gradient multiplied by the learning rate
1233 g = (1 - vocab[word].code[d] - f) * alpha;
1234 // Propagate errors output -> hidden
1235 for (c = 0; c < window_layer_size; c++)
1236 neu1e[c] += g * syn1_window[c + l2];
1237 // Learn weights hidden -> output
1238 for (c = 0; c < window_layer_size; c++)
1239 syn1_window[c + l2] += g * neu1[c];
1240 if (cap == 1)
1241 for (c = 0; c < window_layer_size; c++)
1242 capParam(syn1_window, c + l2);
1243 }
1244 // NEGATIVE SAMPLING
1245 if (negative > 0)
1246 for (d = 0; d < negative + 1; d++) {
1247 if (d == 0) {
1248 target = word;
1249 label = 1;
1250 } else {
1251 next_random = next_random
1252 * (unsigned long long) 25214903917 + 11;
1253 if (word_to_group != NULL
1254 && word_to_group[word] != -1) {
1255 target = word;
1256 while (target == word) {
1257 target = group_to_table[word_to_group[word]
1258 * table_size
1259 + (next_random >> 16) % table_size];
1260 next_random = next_random
1261 * (unsigned long long) 25214903917
1262 + 11;
1263 }
1264 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1265 } else {
1266 target =
1267 table[(next_random >> 16) % table_size];
1268 }
1269 if (target == 0)
1270 target = next_random % (vocab_size - 1) + 1;
1271 if (target == word)
1272 continue;
1273 label = 0;
1274 }
1275 l2 = target * window_layer_size;
1276 f = 0;
1277 for (c = 0; c < window_layer_size; c++)
1278 f += neu1[c] * syn1neg_window[c + l2];
1279 if (f > MAX_EXP)
1280 g = (label - 1) * alpha;
1281 else if (f < -MAX_EXP)
1282 g = (label - 0) * alpha;
1283 else
1284 g = (label
1285 - expTable[(int) ((f + MAX_EXP)
1286 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1287 * alpha;
1288 for (c = 0; c < window_layer_size; c++)
1289 neu1e[c] += g * syn1neg_window[c + l2];
1290 for (c = 0; c < window_layer_size; c++)
1291 syn1neg_window[c + l2] += g * neu1[c];
1292 if (cap == 1)
1293 for (c = 0; c < window_layer_size; c++)
1294 capParam(syn1neg_window, c + l2);
1295 }
1296 // Noise Contrastive Estimation
1297 if (nce > 0)
1298 for (d = 0; d < nce + 1; d++) {
1299 if (d == 0) {
1300 target = word;
1301 label = 1;
1302 } else {
1303 next_random = next_random
1304 * (unsigned long long) 25214903917 + 11;
1305 if (word_to_group != NULL
1306 && word_to_group[word] != -1) {
1307 target = word;
1308 while (target == word) {
1309 target = group_to_table[word_to_group[word]
1310 * table_size
1311 + (next_random >> 16) % table_size];
1312 next_random = next_random
1313 * (unsigned long long) 25214903917
1314 + 11;
1315 }
1316 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1317 } else {
1318 target =
1319 table[(next_random >> 16) % table_size];
1320 }
1321 if (target == 0)
1322 target = next_random % (vocab_size - 1) + 1;
1323 if (target == word)
1324 continue;
1325 label = 0;
1326 }
1327 l2 = target * window_layer_size;
1328 f = 0;
1329 for (c = 0; c < window_layer_size; c++)
1330 f += neu1[c] * syn1nce_window[c + l2];
1331 if (f > MAX_EXP)
1332 g = (label - 1) * alpha;
1333 else if (f < -MAX_EXP)
1334 g = (label - 0) * alpha;
1335 else {
1336 f = exp(f);
1337 g =
1338 (label
1339 - f
1340 / (noise_distribution[target]
1341 * nce + f)) * alpha;
1342 }
1343 for (c = 0; c < window_layer_size; c++)
1344 neu1e[c] += g * syn1nce_window[c + l2];
1345 for (c = 0; c < window_layer_size; c++)
1346 syn1nce_window[c + l2] += g * neu1[c];
1347 if (cap == 1)
1348 for (c = 0; c < window_layer_size; c++)
1349 capParam(syn1nce_window, c + l2);
1350 }
1351 // hidden -> in
1352 for (a = 0; a < window * 2 + 1; a++)
1353 if (a != window) {
1354 c = sentence_position - window + a;
1355 if (c < 0)
1356 continue;
1357 if (c >= sentence_length)
1358 continue;
1359 last_word = sen[c];
1360 if (last_word == -1)
1361 continue;
1362 window_offset = a * layer1_size;
1363 if (a > window)
1364 window_offset -= layer1_size;
1365 for (c = 0; c < layer1_size; c++)
1366 syn0[c + last_word * layer1_size] += neu1e[c
1367 + window_offset];
1368 }
1369 }
1370 } else if (type == 3) { //train structured skip-gram
1371 for (a = 0; a < window * 2 + 1; a++)
1372 if (a != window) {
1373 c = sentence_position - window + a;
1374 if (c < 0)
1375 continue;
1376 if (c >= sentence_length)
1377 continue;
1378 last_word = sen[c];
1379 if (last_word < 0)
1380 continue;
1381 l1 = last_word * layer1_size;
1382 window_offset = a * layer1_size;
1383 if (a > window)
1384 window_offset -= layer1_size;
1385 for (c = 0; c < layer1_size; c++)
1386 neu1e[c] = 0;
1387 // HIERARCHICAL SOFTMAX
1388 if (hs)
1389 for (d = 0; d < vocab[word].codelen; d++) {
1390 f = 0;
1391 l2 = vocab[word].point[d] * window_layer_size;
1392 // Propagate hidden -> output
1393 for (c = 0; c < layer1_size; c++)
1394 f += syn0[c + l1]
1395 * syn1_window[c + l2 + window_offset];
1396 if (f <= -MAX_EXP)
1397 continue;
1398 else if (f >= MAX_EXP)
1399 continue;
1400 else
1401 f = expTable[(int) ((f + MAX_EXP)
1402 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1403 // 'g' is the gradient multiplied by the learning rate
1404 g = (1 - vocab[word].code[d] - f) * alpha;
1405 // Propagate errors output -> hidden
1406 for (c = 0; c < layer1_size; c++)
1407 neu1e[c] += g
1408 * syn1_window[c + l2 + window_offset];
1409 // Learn weights hidden -> output
1410 for (c = 0; c < layer1_size; c++)
1411 syn1_window[c + l2 + window_offset] += g
1412 * syn0[c + l1];
1413 if (cap == 1)
1414 for (c = 0; c < layer1_size; c++)
1415 capParam(syn1_window, c + l2 + window_offset);
1416 }
1417 // NEGATIVE SAMPLING
1418 if (negative > 0)
1419 for (d = 0; d < negative + 1; d++) {
1420 if (d == 0) {
1421 target = word;
1422 label = 1;
1423 } else {
1424 next_random = next_random
1425 * (unsigned long long) 25214903917 + 11;
1426 if (word_to_group != NULL
1427 && word_to_group[word] != -1) {
1428 target = word;
1429 while (target == word) {
1430 target =
1431 group_to_table[word_to_group[word]
1432 * table_size
1433 + (next_random >> 16)
1434 % table_size];
1435 next_random =
1436 next_random
1437 * (unsigned long long) 25214903917
1438 + 11;
1439 }
1440 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1441 } else {
1442 target = table[(next_random >> 16)
1443 % table_size];
1444 }
1445 if (target == 0)
1446 target = next_random % (vocab_size - 1) + 1;
1447 if (target == word)
1448 continue;
1449 label = 0;
1450 }
1451 l2 = target * window_layer_size;
1452 f = 0;
1453 for (c = 0; c < layer1_size; c++)
1454 f +=
1455 syn0[c + l1]
1456 * syn1neg_window[c + l2
1457 + window_offset];
1458 if (f > MAX_EXP)
1459 g = (label - 1) * alpha;
1460 else if (f < -MAX_EXP)
1461 g = (label - 0) * alpha;
1462 else
1463 g =
1464 (label
1465 - expTable[(int) ((f + MAX_EXP)
1466 * (EXP_TABLE_SIZE
1467 / MAX_EXP / 2))])
1468 * alpha;
1469 if(debug_mode > 2 && ((long long) id) == 0) {
1470 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1471 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1472 }
1473 for (c = 0; c < layer1_size; c++)
1474 neu1e[c] +=
1475 g
1476 * syn1neg_window[c + l2
1477 + window_offset];
1478 for (c = 0; c < layer1_size; c++)
1479 syn1neg_window[c + l2 + window_offset] += g
1480 * syn0[c + l1];
1481 if (cap == 1)
1482 for (c = 0; c < layer1_size; c++)
1483 capParam(syn1neg_window,
1484 c + l2 + window_offset);
1485 }
1486 // Noise Contrastive Estimation
1487 if (nce > 0)
1488 for (d = 0; d < nce + 1; d++) {
1489 if (d == 0) {
1490 target = word;
1491 label = 1;
1492 } else {
1493 next_random = next_random
1494 * (unsigned long long) 25214903917 + 11;
1495 if (word_to_group != NULL
1496 && word_to_group[word] != -1) {
1497 target = word;
1498 while (target == word) {
1499 target =
1500 group_to_table[word_to_group[word]
1501 * table_size
1502 + (next_random >> 16)
1503 % table_size];
1504 next_random =
1505 next_random
1506 * (unsigned long long) 25214903917
1507 + 11;
1508 }
1509 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1510 } else {
1511 target = table[(next_random >> 16)
1512 % table_size];
1513 }
1514 if (target == 0)
1515 target = next_random % (vocab_size - 1) + 1;
1516 if (target == word)
1517 continue;
1518 label = 0;
1519 }
1520 l2 = target * window_layer_size;
1521 f = 0;
1522 for (c = 0; c < layer1_size; c++)
1523 f +=
1524 syn0[c + l1]
1525 * syn1nce_window[c + l2
1526 + window_offset];
1527 if (f > MAX_EXP)
1528 g = (label - 1) * alpha;
1529 else if (f < -MAX_EXP)
1530 g = (label - 0) * alpha;
1531 else {
1532 f = exp(f);
1533 g = (label
1534 - f
1535 / (noise_distribution[target]
1536 * nce + f)) * alpha;
1537 }
1538 for (c = 0; c < layer1_size; c++)
1539 neu1e[c] +=
1540 g
1541 * syn1nce_window[c + l2
1542 + window_offset];
1543 for (c = 0; c < layer1_size; c++)
1544 syn1nce_window[c + l2 + window_offset] += g
1545 * syn0[c + l1];
1546 if (cap == 1)
1547 for (c = 0; c < layer1_size; c++)
1548 capParam(syn1nce_window,
1549 c + l2 + window_offset);
1550 }
1551 // Learn weights input -> hidden
1552 for (c = 0; c < layer1_size; c++) {
1553 syn0[c + l1] += neu1e[c];
1554 if (syn0[c + l1] > 50)
1555 syn0[c + l1] = 50;
1556 if (syn0[c + l1] < -50)
1557 syn0[c + l1] = -50;
1558 }
1559 }
1560 } else if (type == 4) { //training senna
1561 // in -> hidden
1562 cw = 0;
1563 for (a = 0; a < window * 2 + 1; a++)
1564 if (a != window) {
1565 c = sentence_position - window + a;
1566 if (c < 0)
1567 continue;
1568 if (c >= sentence_length)
1569 continue;
1570 last_word = sen[c];
1571 if (last_word == -1)
1572 continue;
1573 window_offset = a * layer1_size;
1574 if (a > window)
1575 window_offset -= layer1_size;
1576 for (c = 0; c < layer1_size; c++)
1577 neu1[c + window_offset] += syn0[c
1578 + last_word * layer1_size];
1579 cw++;
1580 }
1581 if (cw) {
1582 for (a = 0; a < window_hidden_size; a++) {
1583 c = a * window_layer_size;
1584 for (b = 0; b < window_layer_size; b++) {
1585 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1586 }
1587 }
1588 if (hs)
1589 for (d = 0; d < vocab[word].codelen; d++) {
1590 f = 0;
1591 l2 = vocab[word].point[d] * window_hidden_size;
1592 // Propagate hidden -> output
1593 for (c = 0; c < window_hidden_size; c++)
1594 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1595 if (f <= -MAX_EXP)
1596 continue;
1597 else if (f >= MAX_EXP)
1598 continue;
1599 else
1600 f = expTable[(int) ((f + MAX_EXP)
1601 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1602 // 'g' is the gradient multiplied by the learning rate
1603 g = (1 - vocab[word].code[d] - f) * alpha;
1604 // Propagate errors output -> hidden
1605 for (c = 0; c < window_hidden_size; c++)
1606 neu2e[c] += dHardTanh(neu2[c], g) * g
1607 * syn_hidden_word[c + l2];
1608 // Learn weights hidden -> output
1609 for (c = 0; c < window_hidden_size; c++)
1610 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1611 * neu2[c];
1612 }
1613 // NEGATIVE SAMPLING
1614 if (negative > 0)
1615 for (d = 0; d < negative + 1; d++) {
1616 if (d == 0) {
1617 target = word;
1618 label = 1;
1619 } else {
1620 next_random = next_random
1621 * (unsigned long long) 25214903917 + 11;
1622 if (word_to_group != NULL
1623 && word_to_group[word] != -1) {
1624 target = word;
1625 while (target == word) {
1626 target = group_to_table[word_to_group[word]
1627 * table_size
1628 + (next_random >> 16) % table_size];
1629 next_random = next_random
1630 * (unsigned long long) 25214903917
1631 + 11;
1632 }
1633 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1634 } else {
1635 target =
1636 table[(next_random >> 16) % table_size];
1637 }
1638 if (target == 0)
1639 target = next_random % (vocab_size - 1) + 1;
1640 if (target == word)
1641 continue;
1642 label = 0;
1643 }
1644 l2 = target * window_hidden_size;
1645 f = 0;
1646 for (c = 0; c < window_hidden_size; c++)
1647 f += hardTanh(neu2[c])
1648 * syn_hidden_word_neg[c + l2];
1649 if (f > MAX_EXP)
1650 g = (label - 1) * alpha / negative;
1651 else if (f < -MAX_EXP)
1652 g = (label - 0) * alpha / negative;
1653 else
1654 g = (label
1655 - expTable[(int) ((f + MAX_EXP)
1656 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1657 * alpha / negative;
1658 for (c = 0; c < window_hidden_size; c++)
1659 neu2e[c] += dHardTanh(neu2[c], g) * g
1660 * syn_hidden_word_neg[c + l2];
1661 for (c = 0; c < window_hidden_size; c++)
1662 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1663 * g * neu2[c];
1664 }
1665 for (a = 0; a < window_hidden_size; a++)
1666 for (b = 0; b < window_layer_size; b++)
1667 neu1e[b] += neu2e[a]
1668 * syn_window_hidden[a * window_layer_size + b];
1669 for (a = 0; a < window_hidden_size; a++)
1670 for (b = 0; b < window_layer_size; b++)
1671 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1672 * neu1[b];
1673 // hidden -> in
1674 for (a = 0; a < window * 2 + 1; a++)
1675 if (a != window) {
1676 c = sentence_position - window + a;
1677 if (c < 0)
1678 continue;
1679 if (c >= sentence_length)
1680 continue;
1681 last_word = sen[c];
1682 if (last_word == -1)
1683 continue;
1684 window_offset = a * layer1_size;
1685 if (a > window)
1686 window_offset -= layer1_size;
1687 for (c = 0; c < layer1_size; c++)
1688 syn0[c + last_word * layer1_size] += neu1e[c
1689 + window_offset];
1690 }
1691 }
1692 } else {
1693 printf("unknown type %i", type);
1694 exit(0);
1695 }
1696 sentence_position++;
1697 if (sentence_position >= sentence_length) {
1698 sentence_length = 0;
1699 continue;
1700 }
1701 }
1702 fclose(fi);
1703 free(neu1);
1704 free(neu1e);
1705 threadPos[(long) id] = -1;
1706 pthread_exit(NULL);
1707}
1708
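// Prints, for every vocabulary word from index cc onwards, the most probable
// collocate at each window position (tabulated logistic of syn0[word] dot
// syn1neg_window[collocate, position]), an aggregate over all positions
// accumulated in noisy-OR fashion (target_sums), and the N strongest
// word/position pairs. Requires the type 3 negative-sampling weights.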
1709void ShowCollocations() {
1710 long a, b, c, d, e, window_offset, target, max_target = 0, maxmax_target;
1711 real f, max_f, maxmax_f;
1712 real *target_sums, bestf[MAX_CC], worstbest;
1713 long besti[MAX_CC];
1714 int N = 10, bestp[MAX_CC];
1715 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1716
1717 for (d = cc; d < vocab_size; d++) {
1718 for (b = 0; b < vocab_size; b++)
1719 target_sums[b]=0;
1720 for (b = 0; b < N; b++)
1721 bestf[b] = -1;
1722 worstbest = -1;
1723
1724 maxmax_f = -1;
1725 maxmax_target = 0;
1726 for (a = window * 2 + 1; a >= 0; a--) {
1727 if (a != window) {
1728 max_f = -1;
1729 window_offset = a * layer1_size;
1730 if (a > window)
1731 window_offset -= layer1_size;
1732 for(target = 0; target < vocab_size; target ++) {
1733 if(target == d)
1734 continue;
1735 f = 0;
1736 for (c = 0; c < layer1_size; c++)
1737 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1738 if (f < -MAX_EXP)
1739 continue;
1740 else if (f > MAX_EXP)
1741 continue;
1742 else
1743 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1744 if(f > max_f) {
1745 max_f = f;
1746 max_target = target;
1747 }
1748 target_sums[target] += (1 - target_sums[target]) * f;
1749 if(f > worstbest) {
1750 for (b = 0; b < N; b++) {
1751 if (f > bestf[b]) {
1752 for (e = N - 1; e > b; e--) {
1753 bestf[e] = bestf[e - 1];
1754 besti[e] = besti[e - 1];
1755 bestp[e] = bestp[e - 1];
1756 }
1757 bestf[b] = f;
1758 besti[b] = target;
1759 bestp[b] = window - a;
1760 break;
1761 }
1762 }
1763 worstbest = bestf[N-1];
1764 }
1765 }
1766 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1767 if(max_f > maxmax_f) {
1768 maxmax_f = max_f;
1769 maxmax_target = max_target;
1770 }
1771 } else {
1772 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1773 }
1774 }
1775 max_f = -1;
1776 for (b = 0; b < vocab_size; b++) {
1777 if(target_sums[b] > max_f) {
1778 max_f = target_sums[b];
1779 max_target = b;
1780 }
1781 }
1782 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1783 vocab[max_target].word, max_f,
1784 vocab[maxmax_target].word, maxmax_f);
1785 for(b = 0; b < N && bestf[b] > -1; b++)
1786 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1787 printf("\n");
1788 }
1789}
1790
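// Training driver: build or load the vocabulary, initialize the network,
// optionally print collocations (cc > 0), build the unigram tables, run the
// training threads plus the progress monitor, and finally write either the word
// vectors or, when classes > 0, K-means cluster assignments.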
1791void TrainModel() {
1792 long a, b, c, d;
1793 FILE *fo;
1794 pthread_t *pt = (pthread_t *) malloc((num_threads + 1) * sizeof(pthread_t)); // last slot holds the monitor thread
1795 threadPos = malloc(num_threads * sizeof(long long));
1796 threadIters = malloc(num_threads * sizeof(int));
1797 char *timebuf = malloc(80);
1798 printf("Starting training using file %s\n", train_file);
1799 starting_alpha = alpha;
1800 if (read_vocab_file[0] != 0)
1801 ReadVocab();
1802 else
1803 LearnVocabFromTrainFile();
1804 if (save_vocab_file[0] != 0)
1805 SaveVocab();
1806 if (output_file[0] == 0)
1807 return;
1808 InitNet();
1809 if(cc > 0)
1810 ShowCollocations();
1811 if (negative > 0 || nce > 0)
1812 InitUnigramTable();
1813 if (negative_classes_file[0] != 0)
1814 InitClassUnigramTable();
1815 start = clock();
1816 for (a = 0; a < num_threads; a++)
1817 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1818 if(debug_mode > 1)
1819 pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
1820 for (a = 0; a < num_threads; a++)
1821 pthread_join(pt[a], NULL);
1822 if(debug_mode > 1) {
1823 pthread_join(pt[num_threads], NULL);
1824 clock_t now = clock();
1825 printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now-start) / CLOCKS_PER_SEC, (now-start) / CLOCKS_PER_SEC / num_threads); // clock() sums CPU time over all threads: total = user, /num_threads ~ real
1826 printf("Saving vectors to %s ...", output_file);
1827 fflush(stdout);
1828 }
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001829 fo = fopen(output_file, "wb");
1830 if (classes == 0) {
1831 // Save the word vectors
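		// Vector file format: a header line "vocab_size layer1_size", then one record per word
		// consisting of the word itself followed by layer1_size values, either as raw floats
		// (-binary 1) or as space-separated text.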
1832 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1833 for (a = 0; a < vocab_size; a++) {
1834 fprintf(fo, "%s ", vocab[a].word);
1835 if (binary)
1836 for (b = 0; b < layer1_size; b++)
1837 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1838 else
1839 for (b = 0; b < layer1_size; b++)
1840 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1841 fprintf(fo, "\n");
1842 }
Marc Kupietz202723e2016-07-14 09:12:00 +02001843 if(debug_mode > 1)
1844 fprintf(stderr, "\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001845 } else {
1846 // Run K-means on the word vectors
1847 int clcn = classes, iter = 10, closeid;
1848 int *centcn = (int *) malloc(classes * sizeof(int));
1849 int *cl = (int *) calloc(vocab_size, sizeof(int));
1850 real closev, x;
1851 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
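		// K-means over the rows of syn0: words start out assigned round-robin; each of the 10 local
		// iterations (1) sums the member vectors of every cluster into cent, (2) averages and
		// L2-normalizes the centroids (centcn starts at 1, so an empty cluster cannot divide by zero),
		// and (3) reassigns every word to the centroid with the largest dot product.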
1852 for (a = 0; a < vocab_size; a++)
1853 cl[a] = a % clcn;
1854 for (a = 0; a < iter; a++) {
1855 for (b = 0; b < clcn * layer1_size; b++)
1856 cent[b] = 0;
1857 for (b = 0; b < clcn; b++)
1858 centcn[b] = 1;
1859 for (c = 0; c < vocab_size; c++) {
1860 for (d = 0; d < layer1_size; d++)
1861 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1862 centcn[cl[c]]++;
1863 }
1864 for (b = 0; b < clcn; b++) {
1865 closev = 0;
1866 for (c = 0; c < layer1_size; c++) {
1867 cent[layer1_size * b + c] /= centcn[b];
1868 closev += cent[layer1_size * b + c]
1869 * cent[layer1_size * b + c];
1870 }
1871 closev = sqrt(closev);
1872 for (c = 0; c < layer1_size; c++)
1873 cent[layer1_size * b + c] /= closev;
1874 }
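			// Reassignment step: since the centroids were normalized to unit length above, the dot
			// product below is the cosine similarity between word vector and centroid.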
1875 for (c = 0; c < vocab_size; c++) {
1876 closev = -10;
1877 closeid = 0;
1878 for (d = 0; d < clcn; d++) {
1879 x = 0;
1880 for (b = 0; b < layer1_size; b++)
1881 x += cent[layer1_size * d + b]
1882 * syn0[c * layer1_size + b];
1883 if (x > closev) {
1884 closev = x;
1885 closeid = d;
1886 }
1887 }
1888 cl[c] = closeid;
1889 }
1890 }
1891 // Save the K-means classes
1892 for (a = 0; a < vocab_size; a++)
1893 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1894 free(centcn);
1895 free(cent);
1896 free(cl);
1897 }
1898 fclose(fo);
1899 if (save_net_file[0] != 0)
1900 SaveNet();
1901}
1902
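// ArgPos: return the index of option <str> in argv so its value can be read from argv[index + 1];
// exits if the option is the last token (its value would be missing), returns -1 if it is absent.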
1903int ArgPos(char *str, int argc, char **argv) {
1904 int a;
1905 for (a = 1; a < argc; a++)
1906 if (!strcmp(str, argv[a])) {
1907 if (a == argc - 1) {
1908 printf("Argument missing for %s\n", str);
1909 exit(1);
1910 }
1911 return a;
1912 }
1913 return -1;
1914}
1915
1916int main(int argc, char **argv) {
1917 int i;
1918 if (argc == 1) {
1919 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1920 printf("Options:\n");
1921 printf("Parameters for training:\n");
1922 printf("\t-train <file>\n");
1923 printf("\t\tUse text data from <file> to train the model\n");
1924 printf("\t-output <file>\n");
1925 printf(
1926 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1927 printf("\t-size <int>\n");
1928 printf("\t\tSet size of word vectors; default is 100\n");
1929 printf("\t-window <int>\n");
1930 printf("\t\tSet max skip length between words; default is 5\n");
1931 printf("\t-sample <float>\n");
1932 printf(
1933 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1934 printf(
1935 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1936 printf("\t-hs <int>\n");
1937 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1938 printf("\t-negative <int>\n");
1939 printf(
1940 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1941 printf("\t-negative-classes <file>\n");
1942 printf("\t\tNegative classes to sample from\n");
1943 printf("\t-nce <int>\n");
1944 printf(
1945 "\t\tNumber of negative examples for noise-contrastive estimation (NCE); default is 0, common values are 3 - 10 (0 = not used)\n");
1946 printf("\t-threads <int>\n");
1947 printf("\t\tUse <int> threads (default 12)\n");
1948 printf("\t-iter <int>\n");
1949 printf("\t\tRun more training iterations (default 5)\n");
1950 printf("\t-min-count <int>\n");
1951 printf(
1952 "\t\tDiscard words that appear fewer than <int> times; default is 5\n");
1953 printf("\t-alpha <float>\n");
1954 printf(
1955 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1956 printf("\t-classes <int>\n");
1957 printf(
1958 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1959 printf("\t-debug <int>\n");
1960 printf(
1961 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1962 printf("\t-binary <int>\n");
1963 printf(
1964 "\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1965 printf("\t-save-vocab <file>\n");
1966 printf("\t\tThe vocabulary will be saved to <file>\n");
1967 printf("\t-read-vocab <file>\n");
1968 printf(
1969 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1970 printf("\t-read-net <file>\n");
1971 printf(
1972 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1973 printf("\t-save-net <file>\n");
1974 printf("\t\tThe net parameters will be saved to <file>\n");
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01001975 printf("\t-show-cc <int>\n");
1976 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
Marc Kupietzd6f9c712016-03-16 11:50:56 +01001977 printf("\t-type <int>\n");
1978 printf(
1979 "\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
1980 printf("\t-cap <int>\n");
1981 printf(
1982 "\t\tLimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1983 printf("\nExamples:\n");
1984 printf(
1985 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1986 return 0;
1987 }
1988 output_file[0] = 0;
1989 save_vocab_file[0] = 0;
1990 read_vocab_file[0] = 0;
1991 save_net_file[0] = 0;
1992 read_net_file[0] = 0;
1993 negative_classes_file[0] = 0;
1994 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1995 layer1_size = atoi(argv[i + 1]);
1996 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1997 strcpy(train_file, argv[i + 1]);
1998 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1999 strcpy(save_vocab_file, argv[i + 1]);
2000 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
2001 strcpy(read_vocab_file, argv[i + 1]);
2002 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
2003 strcpy(save_net_file, argv[i + 1]);
2004 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
2005 strcpy(read_net_file, argv[i + 1]);
2006 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
2007 debug_mode = atoi(argv[i + 1]);
2008 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
2009 binary = atoi(argv[i + 1]);
Marc Kupietz6b1f2ba2016-03-17 21:17:42 +01002010 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
2011 cc = atoi(argv[i + 1]);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002012 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
2013 type = atoi(argv[i + 1]);
2014 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
2015 strcpy(output_file, argv[i + 1]);
2016 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
2017 window = atoi(argv[i + 1]);
2018 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
2019 sample = atof(argv[i + 1]);
2020 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
2021 hs = atoi(argv[i + 1]);
2022 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
2023 negative = atoi(argv[i + 1]);
2024 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
2025 strcpy(negative_classes_file, argv[i + 1]);
2026 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
2027 nce = atoi(argv[i + 1]);
2028 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
2029 num_threads = atoi(argv[i + 1]);
2030 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
2031 iter = atoi(argv[i + 1]);
2032 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
2033 min_count = atoi(argv[i + 1]);
2034 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
2035 classes = atoi(argv[i + 1]);
2036 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
2037 cap = atoi(argv[i + 1]);
2038 if (type == 0 || type == 2 || type == 4)
2039 alpha = 0.05;
2040 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
2041 alpha = atof(argv[i + 1]);
2042 vocab = (struct vocab_word *) calloc(vocab_max_size,
2043 sizeof(struct vocab_word));
2044 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
2045 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
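	// The +1 slot keeps an index of exactly EXP_TABLE_SIZE (reached when f == MAX_EXP) inside the
	// allocation; the loop below fills indices 0 .. EXP_TABLE_SIZE-1.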
2046 for (i = 0; i < EXP_TABLE_SIZE; i++) {
2047 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
2048 expTable[i] = expTable[i] / (expTable[i] + 1); // Turn it into the sigmoid: exp(x) / (exp(x) + 1)
2049 }
Marc Kupietz210b9d52016-04-02 21:48:13 +02002050 SaveArgs(argc, argv);
Marc Kupietzd6f9c712016-03-16 11:50:56 +01002051 TrainModel();
2052 return 0;
2053}
2054