1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
#include <time.h> // clock(), clock_t and CLOCKS_PER_SEC are used in TrainModelThread; often pulled in indirectly, but required for portability
20
21#define MAX_STRING 100
22#define EXP_TABLE_SIZE 1000
23#define MAX_EXP 6
24#define MAX_SENTENCE_LENGTH 1000
25#define MAX_CC 100
26#define MAX_CODE_LENGTH 40
27
28const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
29
30typedef float real; // Precision of float numbers
31
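// One vocabulary entry: cn is the raw corpus count and word the zero-terminated
// string; point, code and codelen are filled in by CreateBinaryTree() and hold
// the Huffman path (inner-node indices), the binary code and its length, which
// the hierarchical-softmax branches below rely on.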
32struct vocab_word {
33 long long cn;
34 int *point;
35 char *word, *code, codelen;
36};
37
38char train_file[MAX_STRING], output_file[MAX_STRING];
39char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
40char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
41struct vocab_word *vocab;
42int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
43 num_threads = 12, min_reduce = 1;
44int *vocab_hash;
45long long *threadPos;
46 int *threadIters;
47long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
48long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
49 classes = 0;
50real alpha = 0.025, starting_alpha, sample = 1e-3;
51real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
52real avgWordLength=0;
53clock_t start;
54
55real *syn1_window, *syn1neg_window, *syn1nce_window;
56int w_offset, window_layer_size;
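// For the window-based models (types 2 and 3) every word has one output vector
// per context position: window_layer_size = layer1_size * window * 2 (set in
// InitNet), and syn1_window / syn1neg_window are indexed as
// word * window_layer_size + position_offset + component.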
57
58int window_hidden_size = 500;
59real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg,
60 *syn_hidden_word_nce;
61
62int hs = 0, negative = 5;
63const int table_size = 1e8;
64int *table;
65
66long cc = 0;
67
68// contrastive negative sampling
69char negative_classes_file[MAX_STRING];
70int *word_to_group;
71int *group_to_table; //group_size*table_size
72int class_number;
73
74//nce
75real* noise_distribution;
76int nce = 0;
77
78//param caps
79real CAP_VALUE = 50;
80int cap = 0;
81
82void capParam(real* array, int index) {
83 if (array[index] > CAP_VALUE)
84 array[index] = CAP_VALUE;
85 else if (array[index] < -CAP_VALUE)
86 array[index] = -CAP_VALUE;
87}
88
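// hardTanh/dHardTanh implement the clipped-linear activation used by the
// senna-style model (type 4): the forward pass clamps to [-1, 1], and the
// "derivative" below simply gates the gradient to 0 whenever the unit is
// saturated in the direction the gradient would push it further.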
89real hardTanh(real x) {
90 if (x >= 1) {
91 return 1;
92 } else if (x <= -1) {
93 return -1;
94 } else {
95 return x;
96 }
97}
98
99real dHardTanh(real x, real g) {
100 if (x > 1 && g > 0) {
101 return 0;
102 }
103 if (x < -1 && g < 0) {
104 return 0;
105 }
106 return 1;
107}
108
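// Builds the table used for negative sampling: each word w gets a share of the
// table_size (1e8) slots proportional to cn(w)^0.75, so a draw
//   target = table[(next_random >> 16) % table_size];
// samples from the smoothed unigram distribution P(w) = cn(w)^0.75 / sum_v cn(v)^0.75.
// noise_distribution stores the same probabilities in normalized form for NCE.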
109void InitUnigramTable() {
110 int a, i;
111 long long train_words_pow = 0;
112 real d1, power = 0.75;
113 table = (int *) malloc(table_size * sizeof(int));
114 for (a = 0; a < vocab_size; a++)
115 train_words_pow += pow(vocab[a].cn, power);
116 i = 0;
117 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
118 for (a = 0; a < table_size; a++) {
119 table[a] = i;
120 if (a / (real) table_size > d1) {
121 i++;
122 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
123 }
124 if (i >= vocab_size)
125 i = vocab_size - 1;
126 }
127
128 noise_distribution = (real *) calloc(vocab_size, sizeof(real));
129 for (a = 0; a < vocab_size; a++)
130 noise_distribution[a] = pow(vocab[a].cn, power)
131 / (real) train_words_pow;
132}
133
134// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
135void ReadWord(char *word, FILE *fin) {
136 int a = 0, ch;
137 while (!feof(fin)) {
138 ch = fgetc(fin);
139 if (ch == 13)
140 continue;
141 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
142 if (a > 0) {
143 if (ch == '\n')
144 ungetc(ch, fin);
145 break;
146 }
147 if (ch == '\n') {
148 strcpy(word, (char *) "</s>");
149 return;
150 } else
151 continue;
152 }
153 word[a] = ch;
154 a++;
155 if (a >= MAX_STRING - 1)
156 a--; // Truncate too long words
157 }
158 word[a] = 0;
159}
160
161// Returns hash value of a word
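// (Simple polynomial rolling hash with base 257; collisions in vocab_hash are
// resolved by linear probing, see SearchVocab and AddWordToVocab.)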
162int GetWordHash(char *word) {
163 unsigned long long a, hash = 0;
164 for (a = 0; a < strlen(word); a++)
165 hash = hash * 257 + word[a];
166 hash = hash % vocab_hash_size;
167 return hash;
168}
169
170// Returns position of a word in the vocabulary; if the word is not found, returns -1
171int SearchVocab(char *word) {
172 unsigned int hash = GetWordHash(word);
173 while (1) {
174 if (vocab_hash[hash] == -1)
175 return -1;
176 if (!strcmp(word, vocab[vocab_hash[hash]].word))
177 return vocab_hash[hash];
178 hash = (hash + 1) % vocab_hash_size;
179 }
180 return -1;
181}
182
183// Reads a word and returns its index in the vocabulary
184int ReadWordIndex(FILE *fin) {
185 char word[MAX_STRING];
186 ReadWord(word, fin);
187 if (feof(fin))
188 return -1;
189 return SearchVocab(word);
190}
191
192// Adds a word to the vocabulary
193int AddWordToVocab(char *word) {
194 unsigned int hash, length = strlen(word) + 1;
195 if (length > MAX_STRING)
196 length = MAX_STRING;
197 vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
198 strcpy(vocab[vocab_size].word, word);
199 vocab[vocab_size].cn = 0;
200 vocab_size++;
201 // Reallocate memory if needed
202 if (vocab_size + 2 >= vocab_max_size) {
203 vocab_max_size += 1000;
204 vocab = (struct vocab_word *) realloc(vocab,
205 vocab_max_size * sizeof(struct vocab_word));
206 }
207 hash = GetWordHash(word);
208 while (vocab_hash[hash] != -1)
209 hash = (hash + 1) % vocab_hash_size;
210 vocab_hash[hash] = vocab_size - 1;
211 return vocab_size - 1;
212}
213
214// Used later for sorting by word counts
215int VocabCompare(const void *a, const void *b) {
216 return (((struct vocab_word *) b)->cn > ((struct vocab_word *) a)->cn) - (((struct vocab_word *) b)->cn < ((struct vocab_word *) a)->cn); // sign comparison avoids truncating the long long difference to int
217}
218
219// Sorts the vocabulary by frequency using word counts
220void SortVocab() {
221 int a, size;
222 unsigned int hash;
223 // Sort the vocabulary and keep </s> at the first position
224 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
225 for (a = 0; a < vocab_hash_size; a++)
226 vocab_hash[a] = -1;
227 size = vocab_size;
228 train_words = 0;
229 for (a = 0; a < size; a++) {
230 avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
231 // Words occurring less than min_count times will be discarded from the vocab
232 if ((vocab[a].cn < min_count) && (a != 0)) {
233 vocab_size--;
234 free(vocab[a].word);
235 } else {
236 // Hash has to be re-computed, as it is no longer valid after sorting
237 hash = GetWordHash(vocab[a].word);
238 while (vocab_hash[hash] != -1)
239 hash = (hash + 1) % vocab_hash_size;
240 vocab_hash[hash] = a;
241 train_words += vocab[a].cn;
242 }
243 }
244 avgWordLength /= train_words;
245 vocab = (struct vocab_word *) realloc(vocab,
246 (vocab_size + 1) * sizeof(struct vocab_word));
247 // Allocate memory for the binary tree construction
248 for (a = 0; a < vocab_size; a++) {
249 vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
250 vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
251 }
252}
253
254// Reduces the vocabulary by removing infrequent tokens
255void ReduceVocab() {
256 int a, b = 0;
257 unsigned int hash;
258 for (a = 0; a < vocab_size; a++)
259 if (vocab[a].cn > min_reduce) {
260 vocab[b].cn = vocab[a].cn;
261 vocab[b].word = vocab[a].word;
262 b++;
263 } else
264 free(vocab[a].word);
265 vocab_size = b;
266 for (a = 0; a < vocab_hash_size; a++)
267 vocab_hash[a] = -1;
268 for (a = 0; a < vocab_size; a++) {
269 // Hash has to be re-computed, as it is no longer valid
270 hash = GetWordHash(vocab[a].word);
271 while (vocab_hash[hash] != -1)
272 hash = (hash + 1) % vocab_hash_size;
273 vocab_hash[hash] = a;
274 }
275 fflush(stdout);
276 min_reduce++;
277}
278
279// Create binary Huffman tree using the word counts
280 // Frequent words will have short unique binary codes
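// The two cheapest remaining nodes are merged repeatedly (pos1 walks the sorted
// word counts, pos2 the newly created inner nodes); each word then reads off its
// code (left/right decisions) and point (inner-node indices) along the path to
// the root, so frequent words end up with short codes and few hierarchical-softmax
// updates per occurrence.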
281void CreateBinaryTree() {
282 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
283 char code[MAX_CODE_LENGTH];
284 long long *count = (long long *) calloc(vocab_size * 2 + 1,
285 sizeof(long long));
286 long long *binary = (long long *) calloc(vocab_size * 2 + 1,
287 sizeof(long long));
288 long long *parent_node = (long long *) calloc(vocab_size * 2 + 1,
289 sizeof(long long));
290 for (a = 0; a < vocab_size; a++)
291 count[a] = vocab[a].cn;
292 for (a = vocab_size; a < vocab_size * 2; a++)
293 count[a] = 1e15;
294 pos1 = vocab_size - 1;
295 pos2 = vocab_size;
296 // Following algorithm constructs the Huffman tree by adding one node at a time
297 for (a = 0; a < vocab_size - 1; a++) {
298 // First, find two smallest nodes 'min1, min2'
299 if (pos1 >= 0) {
300 if (count[pos1] < count[pos2]) {
301 min1i = pos1;
302 pos1--;
303 } else {
304 min1i = pos2;
305 pos2++;
306 }
307 } else {
308 min1i = pos2;
309 pos2++;
310 }
311 if (pos1 >= 0) {
312 if (count[pos1] < count[pos2]) {
313 min2i = pos1;
314 pos1--;
315 } else {
316 min2i = pos2;
317 pos2++;
318 }
319 } else {
320 min2i = pos2;
321 pos2++;
322 }
323 count[vocab_size + a] = count[min1i] + count[min2i];
324 parent_node[min1i] = vocab_size + a;
325 parent_node[min2i] = vocab_size + a;
326 binary[min2i] = 1;
327 }
328 // Now assign binary code to each vocabulary word
329 for (a = 0; a < vocab_size; a++) {
330 b = a;
331 i = 0;
332 while (1) {
333 code[i] = binary[b];
334 point[i] = b;
335 i++;
336 b = parent_node[b];
337 if (b == vocab_size * 2 - 2)
338 break;
339 }
340 vocab[a].codelen = i;
341 vocab[a].point[0] = vocab_size - 2;
342 for (b = 0; b < i; b++) {
343 vocab[a].code[i - b - 1] = code[b];
344 vocab[a].point[i - b] = point[b] - vocab_size;
345 }
346 }
347 free(count);
348 free(binary);
349 free(parent_node);
350}
351
352void LearnVocabFromTrainFile() {
353 char word[MAX_STRING];
354 FILE *fin;
355 long long a, i;
356 for (a = 0; a < vocab_hash_size; a++)
357 vocab_hash[a] = -1;
358 fin = fopen(train_file, "rb");
359 if (fin == NULL) {
360 printf("ERROR: training data file not found!\n");
361 exit(1);
362 }
363 vocab_size = 0;
364 AddWordToVocab((char *) "</s>");
365 while (1) {
366 ReadWord(word, fin);
367 if (feof(fin))
368 break;
369 train_words++;
370 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
371 printf("%lldK%c", train_words / 1000, 13);
372 fflush(stdout);
373 }
374 i = SearchVocab(word);
375 if (i == -1) {
376 a = AddWordToVocab(word);
377 vocab[a].cn = 1;
378 } else
379 vocab[i].cn++;
380 if (vocab_size > vocab_hash_size * 0.7)
381 ReduceVocab();
382 }
383 SortVocab();
384 if (debug_mode > 0) {
385 printf("Vocab size: %lld\n", vocab_size);
386 printf("Words in train file: %lld\n", train_words);
387 }
388 file_size = ftell(fin);
389 fclose(fin);
390}
391
392void SaveVocab() {
393 long long i;
394 FILE *fo = fopen(save_vocab_file, "wb");
395 for (i = 0; i < vocab_size; i++)
396 fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
397 fclose(fo);
398}
399
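// Reads "<word> <count>" pairs from -read-vocab. Because that vocabulary was
// presumably built on a (possibly different or larger) corpus than -train, the
// number of tokens to process is afterwards re-estimated as
// file_size / avgWordLength, where avgWordLength is the count-weighted average
// token length (including the separator) computed in SortVocab.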
400void ReadVocab() {
401 long long a, i = 0;
402 char c;
403 char word[MAX_STRING];
404 FILE *fin = fopen(read_vocab_file, "rb");
405 if (fin == NULL) {
406 printf("Vocabulary file not found\n");
407 exit(1);
408 }
409 for (a = 0; a < vocab_hash_size; a++)
410 vocab_hash[a] = -1;
411 vocab_size = 0;
412 while (1) {
413 ReadWord(word, fin);
414 if (feof(fin))
415 break;
416 a = AddWordToVocab(word);
417 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
418 i++;
419 }
420 fclose(fin);
421 fin = fopen(train_file, "rb");
422 if (fin == NULL) {
423 printf("ERROR: training data file not found!\n");
424 exit(1);
425 }
426 fseek(fin, 0, SEEK_END);
427 file_size = ftell(fin);
428 fclose(fin);
429 SortVocab();
430 if (debug_mode > 0) {
431 printf("Vocab size: %lld\n", vocab_size);
432 if(*read_vocab_file) {
433 printf("Words in vocab's train file: %lld\n", train_words);
434 printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
435 } else {
436 printf("Words in train file: %lld\n", train_words);
437 }
438 }
439 if(*read_vocab_file) {
440 train_words = file_size / avgWordLength;
441 if(debug_mode > 0)
442 printf("Estimated words in train file: %lld\n", train_words);
443 }
444}
445
446void InitClassUnigramTable() {
447 long long a, c;
448 printf("loading class unigrams \n");
449 FILE *fin = fopen(negative_classes_file, "rb");
450 if (fin == NULL) {
451 printf("ERROR: class file not found!\n");
452 exit(1);
453 }
454 word_to_group = (int *) malloc(vocab_size * sizeof(int));
455 for (a = 0; a < vocab_size; a++)
456 word_to_group[a] = -1;
457 char class[MAX_STRING];
458 char prev_class[MAX_STRING];
459 prev_class[0] = 0;
460 char word[MAX_STRING];
461 class_number = -1;
462 while (1) {
463 if (feof(fin))
464 break;
465 ReadWord(class, fin);
466 ReadWord(word, fin);
467 int word_index = SearchVocab(word);
468 if (word_index != -1) {
469 if (strcmp(class, prev_class) != 0) {
470 class_number++;
471 strcpy(prev_class, class);
472 }
473 word_to_group[word_index] = class_number;
474 }
475 ReadWord(word, fin);
476 }
477 class_number++;
478 fclose(fin);
479
480 group_to_table = (int *) malloc(table_size * class_number * sizeof(int));
481 long long train_words_pow = 0;
482 real d1, power = 0.75;
483
484 for (c = 0; c < class_number; c++) {
485 long long offset = c * table_size;
486 train_words_pow = 0;
487 for (a = 0; a < vocab_size; a++)
488 if (word_to_group[a] == c)
489 train_words_pow += pow(vocab[a].cn, power);
490 int i = 0;
491 while (i < vocab_size && word_to_group[i] != c) // check the bound first to avoid reading past the array
492 i++;
493 d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
494 for (a = 0; a < table_size; a++) {
495 //printf("index %lld , word %d\n", a, i);
496 group_to_table[offset + a] = i;
497 if (a / (real) table_size > d1) {
498 i++;
499 while (i < vocab_size && word_to_group[i] != c)
500 i++;
501 d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
502 }
503 if (i >= vocab_size) {
 i = vocab_size - 1; // step back inside the vocabulary before scanning downwards
504 while (i >= 0 && word_to_group[i] != c)
505 i--;
 }
506 }
507 }
508}
509
510void SaveArgs(int argc, char **argv) {
511 unsigned int i;
512 size_t len = 0;
513 char *_all_args, *all_args;
514 char *args_file = (char *) malloc(strlen(output_file) + 6); // room for ".args" and '\0' (strdup alone would overflow on strcat)
515 strcpy(args_file, output_file); strcat(args_file, ".args");
516 FILE *fargs = fopen(args_file, "w");
517 if (fargs == NULL) {
518 printf("Cannot save args to %s.\n", args_file);
519 return;
520 }
521
522 for(i=1; i<argc; i++) {
523 len += strlen(argv[i]);
524 }
525
526 _all_args = all_args = (char *)malloc(len+argc-1);
527
528 for(i=1; i<argc; i++) {
529 memcpy(_all_args, argv[i], strlen(argv[i]));
530 _all_args += strlen(argv[i])+1;
531 *(_all_args-1) = ' ';
532 }
533 *(_all_args-1) = 0;
534
535 fprintf(fargs, "%s\n", all_args);
536 fclose(fargs);
537
538 free(all_args);
539
540 return;
541}
542
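// Dumps the raw weights (only supported for type 3 with negative sampling):
// syn0 as vocab_size * layer1_size reals, followed by syn1neg_window as
// vocab_size * window_layer_size reals. No header is written, so the file can
// only be loaded again via -read-net with the same vocabulary, -size and
// -window settings.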
543void SaveNet() {
544 if(type != 3 || negative <= 0) {
545 fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
546 return;
547 }
548
549 FILE *fnet = fopen(save_net_file, "wb");
550 if (fnet == NULL) {
551 printf("Cannot open net parameter file for writing\n");
552 exit(1);
553 }
554 fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
555 fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
556 fclose(fnet);
557}
558
559void InitNet() {
560 long long a, b;
561 unsigned long long next_random = 1;
562 long long read;
563
564 window_layer_size = layer1_size * window * 2;
565 a = posix_memalign((void **) &syn0, 128,
566 (long long) vocab_size * layer1_size * sizeof(real));
567 if (syn0 == NULL) {
568 printf("Memory allocation failed\n");
569 exit(1);
570 }
571
572 if (hs) {
573 a = posix_memalign((void **) &syn1, 128,
574 (long long) vocab_size * layer1_size * sizeof(real));
575 if (syn1 == NULL) {
576 printf("Memory allocation failed\n");
577 exit(1);
578 }
579 a = posix_memalign((void **) &syn1_window, 128,
580 (long long) vocab_size * window_layer_size * sizeof(real));
581 if (syn1_window == NULL) {
582 printf("Memory allocation failed\n");
583 exit(1);
584 }
585 a = posix_memalign((void **) &syn_hidden_word, 128,
586 (long long) vocab_size * window_hidden_size * sizeof(real));
587 if (syn_hidden_word == NULL) {
588 printf("Memory allocation failed\n");
589 exit(1);
590 }
591
592 for (a = 0; a < vocab_size; a++)
593 for (b = 0; b < layer1_size; b++)
594 syn1[a * layer1_size + b] = 0;
595 for (a = 0; a < vocab_size; a++)
596 for (b = 0; b < window_layer_size; b++)
597 syn1_window[a * window_layer_size + b] = 0;
598 for (a = 0; a < vocab_size; a++)
599 for (b = 0; b < window_hidden_size; b++)
600 syn_hidden_word[a * window_hidden_size + b] = 0;
601 }
602 if (negative > 0) {
603 if(type == 0) {
604 a = posix_memalign((void **) &syn1neg, 128,
605 (long long) vocab_size * layer1_size * sizeof(real));
606 if (syn1neg == NULL) {
607 printf("Memory allocation failed\n");
608 exit(1);
609 }
610 for (a = 0; a < vocab_size; a++)
611 for (b = 0; b < layer1_size; b++)
612 syn1neg[a * layer1_size + b] = 0;
613 } else if (type == 3) {
614 a = posix_memalign((void **) &syn1neg_window, 128,
615 (long long) vocab_size * window_layer_size * sizeof(real));
616 if (syn1neg_window == NULL) {
617 printf("Memory allocation failed\n");
618 exit(1);
619 }
620 for (a = 0; a < vocab_size; a++)
621 for (b = 0; b < window_layer_size; b++)
622 syn1neg_window[a * window_layer_size + b] = 0;
623 } else if (type == 4) {
624 a = posix_memalign((void **) &syn_hidden_word_neg, 128,
625 (long long) vocab_size * window_hidden_size * sizeof(real));
626 if (syn_hidden_word_neg == NULL) {
627 printf("Memory allocation failed\n");
628 exit(1);
629 }
630 for (a = 0; a < vocab_size; a++)
631 for (b = 0; b < window_hidden_size; b++)
632 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
633 }
634 }
635 if (nce > 0) {
636 a = posix_memalign((void **) &syn1nce, 128,
637 (long long) vocab_size * layer1_size * sizeof(real));
638 if (syn1nce == NULL) {
639 printf("Memory allocation failed\n");
640 exit(1);
641 }
642 a = posix_memalign((void **) &syn1nce_window, 128,
643 (long long) vocab_size * window_layer_size * sizeof(real));
644 if (syn1nce_window == NULL) {
645 printf("Memory allocation failed\n");
646 exit(1);
647 }
648 a = posix_memalign((void **) &syn_hidden_word_nce, 128,
649 (long long) vocab_size * window_hidden_size * sizeof(real));
650 if (syn_hidden_word_nce == NULL) {
651 printf("Memory allocation failed\n");
652 exit(1);
653 }
654
655 for (a = 0; a < vocab_size; a++)
656 for (b = 0; b < layer1_size; b++)
657 syn1nce[a * layer1_size + b] = 0;
658 for (a = 0; a < vocab_size; a++)
659 for (b = 0; b < window_layer_size; b++)
660 syn1nce_window[a * window_layer_size + b] = 0;
661 for (a = 0; a < vocab_size; a++)
662 for (b = 0; b < window_hidden_size; b++)
663 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
664 }
665
666 if(type == 4) {
667 a = posix_memalign((void **) &syn_window_hidden, 128,
668 window_hidden_size * window_layer_size * sizeof(real));
669 if (syn_window_hidden == NULL) {
670 printf("Memory allocation failed\n");
671 exit(1);
672 }
673 for (a = 0; a < window_hidden_size * window_layer_size; a++) {
674 next_random = next_random * (unsigned long long) 25214903917 + 11;
675 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
676 - 0.5) / (window_hidden_size * window_layer_size);
677 }
678 }
679
680 if (read_net_file[0] == 0) {
681 for (a = 0; a < vocab_size; a++)
682 for (b = 0; b < layer1_size; b++) {
683 next_random = next_random * (unsigned long long) 25214903917
684 + 11;
685 syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
686 / (real) 65536) - 0.5) / layer1_size;
687 }
688 } else if(type == 3 && negative > 0) {
689 FILE *fnet = fopen(read_net_file, "rb");
690 if (fnet == NULL) {
691 printf("Net parameter file not found\n");
692 exit(1);
693 }
694 printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
695 read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
696 if(read != vocab_size * layer1_size) {
697 fprintf(stderr, "read-net failed %lld\n", read);
698 exit(-1);
699 }
700 read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
701 if(read != (long long) vocab_size * window_layer_size) {
702 fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
703 (long long) sizeof(real) * vocab_size * window_layer_size);
704 exit(-1);
705 }
706 fgetc(fnet);
707 if(!feof(fnet)) {
708 fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
709 exit(-1);
710 }
711 fclose(fnet);
712 } else {
713 fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
714 exit(-1);
715 }
716
717 CreateBinaryTree();
718}
719
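// One training thread: thread i starts reading at byte offset
// file_size / num_threads * i and processes roughly train_words / num_threads
// tokens per iteration. Sub-sampled tokens are stored as -2 in the sentence
// buffer for the structured skip-gram (type 3) so that window positions stay
// aligned; they are skipped again when they become the centre word.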
720void *TrainModelThread(void *id) {
721 long long a, b, d, cw, word, last_word, sentence_length = 0,
722 sentence_position = 0;
723 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
724 long long l1, l2, c, target, label, local_iter = iter;
725 unsigned long long next_random = (long long) id;
726 real f, g;
727 clock_t now;
728 int input_len_1 = layer1_size;
729 int window_offset = -1;
730 if (type == 2 || type == 4) {
731 input_len_1 = window_layer_size;
732 }
733 real *neu1 = (real *) calloc(input_len_1, sizeof(real));
734 real *neu1e = (real *) calloc(input_len_1, sizeof(real));
735
736 int input_len_2 = 0;
737 if (type == 4) {
738 input_len_2 = window_hidden_size;
739 }
740 real *neu2 = (real *) calloc(input_len_2, sizeof(real));
741 real *neu2e = (real *) calloc(input_len_2, sizeof(real));
742
743 FILE *fi = fopen(train_file, "rb");
744 fseek(fi, file_size / (long long) num_threads * (long long) id, SEEK_SET);
745 while (1) {
746 if (word_count - last_word_count > 10000) {
747 word_count_actual += word_count - last_word_count;
748 last_word_count = word_count;
749 if ((debug_mode > 1)) {
750 now = clock();
751 printf(
752 "%cCycles ahead: %lld, Alpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ",
753 13, local_iter, alpha,
754 word_count_actual / (real) (iter * train_words + 1)
755 * 100,
756 word_count_actual
757 / ((real) (now - start + 1)
758 / (real) CLOCKS_PER_SEC * 1000));
759 fflush(stdout);
760 }
761 alpha = starting_alpha
762 * (1 - word_count_actual / (real) (iter * train_words + 1));
763 if (alpha < starting_alpha * 0.0001)
764 alpha = starting_alpha * 0.0001;
765 }
766 if (sentence_length == 0) {
767 while (1) {
768 word = ReadWordIndex(fi);
769 if (feof(fi))
770 break;
771 if (word == -1)
772 continue;
773 word_count++;
774 if (word == 0)
775 break;
776 // The subsampling randomly discards frequent words while keeping the ranking same
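// With f = cn(word) / train_words and t = sample, the keep probability below
// works out to min(1, sqrt(t/f) + t/f): very frequent words are discarded most
// of the time, while words with f <= t are always kept.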
777 if (sample > 0) {
778 real ran = (sqrt(vocab[word].cn / (sample * train_words))
779 + 1) * (sample * train_words) / vocab[word].cn;
780 next_random = next_random * (unsigned long long) 25214903917
781 + 11;
782 if (ran < (next_random & 0xFFFF) / (real) 65536) {
783 if(type == 3) // in structured skipgrams
784 word = -2; // keep the window position correct
785 else
786 continue;
787 }
788 }
789 sen[sentence_length] = word;
790 sentence_length++;
791 if (sentence_length >= MAX_SENTENCE_LENGTH)
792 break;
793 }
794 sentence_position = 0;
795 }
796 if (feof(fi) || (word_count > train_words / num_threads)) {
797 word_count_actual += word_count - last_word_count;
798 local_iter--;
799 if (local_iter == 0)
800 break;
801 word_count = 0;
802 last_word_count = 0;
803 sentence_length = 0;
804 fseek(fi, file_size / (long long) num_threads * (long long) id,
805 SEEK_SET);
806 continue;
807 }
808 word = sen[sentence_position];
809 while (word == -2 && sentence_position<sentence_length)
810 word = sen[++sentence_position];
811 if (sentence_position>=sentence_length) {
812 sentence_length=0;
813 continue;
814 }
815 if (word < 0)
816 continue;
817 for (c = 0; c < input_len_1; c++)
818 neu1[c] = 0;
819 for (c = 0; c < input_len_1; c++)
820 neu1e[c] = 0;
821 for (c = 0; c < input_len_2; c++)
822 neu2[c] = 0;
823 for (c = 0; c < input_len_2; c++)
824 neu2e[c] = 0;
825 next_random = next_random * (unsigned long long) 25214903917 + 11;
826 b = next_random % window;
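// b randomly shrinks the effective window for this centre word: only positions
// within window - b on either side are used, so nearby context words are
// sampled more often than distant ones (the same trick as in the original word2vec).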
827 if (type == 0) { //train the cbow architecture
828 // in -> hidden
829 cw = 0;
830 for (a = b; a < window * 2 + 1 - b; a++)
831 if (a != window) {
832 c = sentence_position - window + a;
833 if (c < 0)
834 continue;
835 if (c >= sentence_length)
836 continue;
837 last_word = sen[c];
838 if (last_word == -1)
839 continue;
840 for (c = 0; c < layer1_size; c++)
841 neu1[c] += syn0[c + last_word * layer1_size];
842 cw++;
843 }
844 if (cw) {
845 for (c = 0; c < layer1_size; c++)
846 neu1[c] /= cw;
847 if (hs)
848 for (d = 0; d < vocab[word].codelen; d++) {
849 f = 0;
850 l2 = vocab[word].point[d] * layer1_size;
851 // Propagate hidden -> output
852 for (c = 0; c < layer1_size; c++)
853 f += neu1[c] * syn1[c + l2];
854 if (f <= -MAX_EXP)
855 continue;
856 else if (f >= MAX_EXP)
857 continue;
858 else
859 f = expTable[(int) ((f + MAX_EXP)
860 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
861 // 'g' is the gradient multiplied by the learning rate
862 g = (1 - vocab[word].code[d] - f) * alpha;
863 // Propagate errors output -> hidden
864 for (c = 0; c < layer1_size; c++)
865 neu1e[c] += g * syn1[c + l2];
866 // Learn weights hidden -> output
867 for (c = 0; c < layer1_size; c++)
868 syn1[c + l2] += g * neu1[c];
869 if (cap == 1)
870 for (c = 0; c < layer1_size; c++)
871 capParam(syn1, c + l2);
872 }
873 // NEGATIVE SAMPLING
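// One positive update (d == 0: target is the current word, label 1) plus
// 'negative' draws from the unigram^0.75 table with label 0; if a class file
// was given, the noise word is drawn from the current word's own class table.
// Each pair is updated with g = (label - sigmoid(f)) * alpha, the gradient of
// the logistic classifier that separates the observed word from the noise samples.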
874 if (negative > 0)
875 for (d = 0; d < negative + 1; d++) {
876 if (d == 0) {
877 target = word;
878 label = 1;
879 } else {
880 next_random = next_random
881 * (unsigned long long) 25214903917 + 11;
882 if (word_to_group != NULL
883 && word_to_group[word] != -1) {
884 target = word;
885 while (target == word) {
886 target = group_to_table[word_to_group[word]
887 * table_size
888 + (next_random >> 16) % table_size];
889 next_random = next_random
890 * (unsigned long long) 25214903917
891 + 11;
892 }
893 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
894 } else {
895 target =
896 table[(next_random >> 16) % table_size];
897 }
898 if (target == 0)
899 target = next_random % (vocab_size - 1) + 1;
900 if (target == word)
901 continue;
902 label = 0;
903 }
904 l2 = target * layer1_size;
905 f = 0;
906 for (c = 0; c < layer1_size; c++)
907 f += neu1[c] * syn1neg[c + l2];
908 if (f > MAX_EXP)
909 g = (label - 1) * alpha;
910 else if (f < -MAX_EXP)
911 g = (label - 0) * alpha;
912 else
913 g = (label
914 - expTable[(int) ((f + MAX_EXP)
915 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
916 * alpha;
917 for (c = 0; c < layer1_size; c++)
918 neu1e[c] += g * syn1neg[c + l2];
919 for (c = 0; c < layer1_size; c++)
920 syn1neg[c + l2] += g * neu1[c];
921 if (cap == 1)
922 for (c = 0; c < layer1_size; c++)
923 capParam(syn1neg, c + l2);
924 }
925 // Noise Contrastive Estimation
926 if (nce > 0)
927 for (d = 0; d < nce + 1; d++) {
928 if (d == 0) {
929 target = word;
930 label = 1;
931 } else {
932 next_random = next_random
933 * (unsigned long long) 25214903917 + 11;
934 if (word_to_group != NULL
935 && word_to_group[word] != -1) {
936 target = word;
937 while (target == word) {
938 target = group_to_table[word_to_group[word]
939 * table_size
940 + (next_random >> 16) % table_size];
941 next_random = next_random
942 * (unsigned long long) 25214903917
943 + 11;
944 }
945 } else {
946 target =
947 table[(next_random >> 16) % table_size];
948 }
949 if (target == 0)
950 target = next_random % (vocab_size - 1) + 1;
951 if (target == word)
952 continue;
953 label = 0;
954 }
955 l2 = target * layer1_size;
956 f = 0;
957
958 for (c = 0; c < layer1_size; c++)
959 f += neu1[c] * syn1nce[c + l2];
960 if (f > MAX_EXP)
961 g = (label - 1) * alpha;
962 else if (f < -MAX_EXP)
963 g = (label - 0) * alpha;
964 else {
965 f = exp(f);
966 g =
967 (label
968 - f
969 / (noise_distribution[target]
970 * nce + f)) * alpha;
971 }
972 for (c = 0; c < layer1_size; c++)
973 neu1e[c] += g * syn1nce[c + l2];
974 for (c = 0; c < layer1_size; c++)
975 syn1nce[c + l2] += g * neu1[c];
976 if (cap == 1)
977 for (c = 0; c < layer1_size; c++)
978 capParam(syn1nce, c + l2);
979 }
980 // hidden -> in
981 for (a = b; a < window * 2 + 1 - b; a++)
982 if (a != window) {
983 c = sentence_position - window + a;
984 if (c < 0)
985 continue;
986 if (c >= sentence_length)
987 continue;
988 last_word = sen[c];
989 if (last_word == -1)
990 continue;
991 for (c = 0; c < layer1_size; c++)
992 syn0[c + last_word * layer1_size] += neu1e[c];
993 }
994 }
995 } else if (type == 1) { //train skip-gram
996 for (a = b; a < window * 2 + 1 - b; a++)
997 if (a != window) {
998 c = sentence_position - window + a;
999 if (c < 0)
1000 continue;
1001 if (c >= sentence_length)
1002 continue;
1003 last_word = sen[c];
1004 if (last_word == -1)
1005 continue;
1006 l1 = last_word * layer1_size;
1007 for (c = 0; c < layer1_size; c++)
1008 neu1e[c] = 0;
1009 // HIERARCHICAL SOFTMAX
1010 if (hs)
1011 for (d = 0; d < vocab[word].codelen; d++) {
1012 f = 0;
1013 l2 = vocab[word].point[d] * layer1_size;
1014 // Propagate hidden -> output
1015 for (c = 0; c < layer1_size; c++)
1016 f += syn0[c + l1] * syn1[c + l2];
1017 if (f <= -MAX_EXP)
1018 continue;
1019 else if (f >= MAX_EXP)
1020 continue;
1021 else
1022 f = expTable[(int) ((f + MAX_EXP)
1023 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1024 // 'g' is the gradient multiplied by the learning rate
1025 g = (1 - vocab[word].code[d] - f) * alpha;
1026 // Propagate errors output -> hidden
1027 for (c = 0; c < layer1_size; c++)
1028 neu1e[c] += g * syn1[c + l2];
1029 // Learn weights hidden -> output
1030 for (c = 0; c < layer1_size; c++)
1031 syn1[c + l2] += g * syn0[c + l1];
1032 if (cap == 1)
1033 for (c = 0; c < layer1_size; c++)
1034 capParam(syn1, c + l2);
1035 }
1036 // NEGATIVE SAMPLING
1037 if (negative > 0)
1038 for (d = 0; d < negative + 1; d++) {
1039 if (d == 0) {
1040 target = word;
1041 label = 1;
1042 } else {
1043 next_random = next_random
1044 * (unsigned long long) 25214903917 + 11;
1045 if (word_to_group != NULL
1046 && word_to_group[word] != -1) {
1047 target = word;
1048 while (target == word) {
1049 target =
1050 group_to_table[word_to_group[word]
1051 * table_size
1052 + (next_random >> 16)
1053 % table_size];
1054 next_random =
1055 next_random
1056 * (unsigned long long) 25214903917
1057 + 11;
1058 }
1059 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1060 } else {
1061 target = table[(next_random >> 16)
1062 % table_size];
1063 }
1064 if (target == 0)
1065 target = next_random % (vocab_size - 1) + 1;
1066 if (target == word)
1067 continue;
1068 label = 0;
1069 }
1070 l2 = target * layer1_size;
1071 f = 0;
1072 for (c = 0; c < layer1_size; c++)
1073 f += syn0[c + l1] * syn1neg[c + l2];
1074 if (f > MAX_EXP)
1075 g = (label - 1) * alpha;
1076 else if (f < -MAX_EXP)
1077 g = (label - 0) * alpha;
1078 else
1079 g =
1080 (label
1081 - expTable[(int) ((f + MAX_EXP)
1082 * (EXP_TABLE_SIZE
1083 / MAX_EXP / 2))])
1084 * alpha;
1085 for (c = 0; c < layer1_size; c++)
1086 neu1e[c] += g * syn1neg[c + l2];
1087 for (c = 0; c < layer1_size; c++)
1088 syn1neg[c + l2] += g * syn0[c + l1];
1089 if (cap == 1)
1090 for (c = 0; c < layer1_size; c++)
1091 capParam(syn1neg, c + l2);
1092 }
1093 //Noise Contrastive Estimation
1094 if (nce > 0)
1095 for (d = 0; d < nce + 1; d++) {
1096 if (d == 0) {
1097 target = word;
1098 label = 1;
1099 } else {
1100 next_random = next_random
1101 * (unsigned long long) 25214903917 + 11;
1102 if (word_to_group != NULL
1103 && word_to_group[word] != -1) {
1104 target = word;
1105 while (target == word) {
1106 target =
1107 group_to_table[word_to_group[word]
1108 * table_size
1109 + (next_random >> 16)
1110 % table_size];
1111 next_random =
1112 next_random
1113 * (unsigned long long) 25214903917
1114 + 11;
1115 }
1116 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1117 } else {
1118 target = table[(next_random >> 16)
1119 % table_size];
1120 }
1121 if (target == 0)
1122 target = next_random % (vocab_size - 1) + 1;
1123 if (target == word)
1124 continue;
1125 label = 0;
1126 }
1127 l2 = target * layer1_size;
1128 f = 0;
1129 for (c = 0; c < layer1_size; c++)
1130 f += syn0[c + l1] * syn1nce[c + l2];
1131 if (f > MAX_EXP)
1132 g = (label - 1) * alpha;
1133 else if (f < -MAX_EXP)
1134 g = (label - 0) * alpha;
1135 else {
1136 f = exp(f);
1137 g = (label
1138 - f
1139 / (noise_distribution[target]
1140 * nce + f)) * alpha;
1141 }
1142 for (c = 0; c < layer1_size; c++)
1143 neu1e[c] += g * syn1nce[c + l2];
1144 for (c = 0; c < layer1_size; c++)
1145 syn1nce[c + l2] += g * syn0[c + l1];
1146 if (cap == 1)
1147 for (c = 0; c < layer1_size; c++)
1148 capParam(syn1nce, c + l2);
1149 }
1150 // Learn weights input -> hidden
1151 for (c = 0; c < layer1_size; c++)
1152 syn0[c + l1] += neu1e[c];
1153 }
1154 } else if (type == 2) { //train the cwindow architecture
1155 // in -> hidden
1156 cw = 0;
1157 for (a = 0; a < window * 2 + 1; a++)
1158 if (a != window) {
1159 c = sentence_position - window + a;
1160 if (c < 0)
1161 continue;
1162 if (c >= sentence_length)
1163 continue;
1164 last_word = sen[c];
1165 if (last_word == -1)
1166 continue;
1167 window_offset = a * layer1_size;
1168 if (a > window)
1169 window_offset -= layer1_size;
1170 for (c = 0; c < layer1_size; c++)
1171 neu1[c + window_offset] += syn0[c
1172 + last_word * layer1_size];
1173 cw++;
1174 }
1175 if (cw) {
1176 if (hs)
1177 for (d = 0; d < vocab[word].codelen; d++) {
1178 f = 0;
1179 l2 = vocab[word].point[d] * window_layer_size;
1180 // Propagate hidden -> output
1181 for (c = 0; c < window_layer_size; c++)
1182 f += neu1[c] * syn1_window[c + l2];
1183 if (f <= -MAX_EXP)
1184 continue;
1185 else if (f >= MAX_EXP)
1186 continue;
1187 else
1188 f = expTable[(int) ((f + MAX_EXP)
1189 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1190 // 'g' is the gradient multiplied by the learning rate
1191 g = (1 - vocab[word].code[d] - f) * alpha;
1192 // Propagate errors output -> hidden
1193 for (c = 0; c < window_layer_size; c++)
1194 neu1e[c] += g * syn1_window[c + l2];
1195 // Learn weights hidden -> output
1196 for (c = 0; c < window_layer_size; c++)
1197 syn1_window[c + l2] += g * neu1[c];
1198 if (cap == 1)
1199 for (c = 0; c < window_layer_size; c++)
1200 capParam(syn1_window, c + l2);
1201 }
1202 // NEGATIVE SAMPLING
1203 if (negative > 0)
1204 for (d = 0; d < negative + 1; d++) {
1205 if (d == 0) {
1206 target = word;
1207 label = 1;
1208 } else {
1209 next_random = next_random
1210 * (unsigned long long) 25214903917 + 11;
1211 if (word_to_group != NULL
1212 && word_to_group[word] != -1) {
1213 target = word;
1214 while (target == word) {
1215 target = group_to_table[word_to_group[word]
1216 * table_size
1217 + (next_random >> 16) % table_size];
1218 next_random = next_random
1219 * (unsigned long long) 25214903917
1220 + 11;
1221 }
1222 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1223 } else {
1224 target =
1225 table[(next_random >> 16) % table_size];
1226 }
1227 if (target == 0)
1228 target = next_random % (vocab_size - 1) + 1;
1229 if (target == word)
1230 continue;
1231 label = 0;
1232 }
1233 l2 = target * window_layer_size;
1234 f = 0;
1235 for (c = 0; c < window_layer_size; c++)
1236 f += neu1[c] * syn1neg_window[c + l2];
1237 if (f > MAX_EXP)
1238 g = (label - 1) * alpha;
1239 else if (f < -MAX_EXP)
1240 g = (label - 0) * alpha;
1241 else
1242 g = (label
1243 - expTable[(int) ((f + MAX_EXP)
1244 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1245 * alpha;
1246 for (c = 0; c < window_layer_size; c++)
1247 neu1e[c] += g * syn1neg_window[c + l2];
1248 for (c = 0; c < window_layer_size; c++)
1249 syn1neg_window[c + l2] += g * neu1[c];
1250 if (cap == 1)
1251 for (c = 0; c < window_layer_size; c++)
1252 capParam(syn1neg_window, c + l2);
1253 }
1254 // Noise Contrastive Estimation
1255 if (nce > 0)
1256 for (d = 0; d < nce + 1; d++) {
1257 if (d == 0) {
1258 target = word;
1259 label = 1;
1260 } else {
1261 next_random = next_random
1262 * (unsigned long long) 25214903917 + 11;
1263 if (word_to_group != NULL
1264 && word_to_group[word] != -1) {
1265 target = word;
1266 while (target == word) {
1267 target = group_to_table[word_to_group[word]
1268 * table_size
1269 + (next_random >> 16) % table_size];
1270 next_random = next_random
1271 * (unsigned long long) 25214903917
1272 + 11;
1273 }
1274 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1275 } else {
1276 target =
1277 table[(next_random >> 16) % table_size];
1278 }
1279 if (target == 0)
1280 target = next_random % (vocab_size - 1) + 1;
1281 if (target == word)
1282 continue;
1283 label = 0;
1284 }
1285 l2 = target * window_layer_size;
1286 f = 0;
1287 for (c = 0; c < window_layer_size; c++)
1288 f += neu1[c] * syn1nce_window[c + l2];
1289 if (f > MAX_EXP)
1290 g = (label - 1) * alpha;
1291 else if (f < -MAX_EXP)
1292 g = (label - 0) * alpha;
1293 else {
1294 f = exp(f);
1295 g =
1296 (label
1297 - f
1298 / (noise_distribution[target]
1299 * nce + f)) * alpha;
1300 }
1301 for (c = 0; c < window_layer_size; c++)
1302 neu1e[c] += g * syn1nce_window[c + l2];
1303 for (c = 0; c < window_layer_size; c++)
1304 syn1nce_window[c + l2] += g * neu1[c];
1305 if (cap == 1)
1306 for (c = 0; c < window_layer_size; c++)
1307 capParam(syn1nce_window, c + l2);
1308 }
1309 // hidden -> in
1310 for (a = 0; a < window * 2 + 1; a++)
1311 if (a != window) {
1312 c = sentence_position - window + a;
1313 if (c < 0)
1314 continue;
1315 if (c >= sentence_length)
1316 continue;
1317 last_word = sen[c];
1318 if (last_word == -1)
1319 continue;
1320 window_offset = a * layer1_size;
1321 if (a > window)
1322 window_offset -= layer1_size;
1323 for (c = 0; c < layer1_size; c++)
1324 syn0[c + last_word * layer1_size] += neu1e[c
1325 + window_offset];
1326 }
1327 }
1328 } else if (type == 3) { //train structured skip-gram
1329 for (a = 0; a < window * 2 + 1; a++)
1330 if (a != window) {
1331 c = sentence_position - window + a;
1332 if (c < 0)
1333 continue;
1334 if (c >= sentence_length)
1335 continue;
1336 last_word = sen[c];
1337 if (last_word < 0)
1338 continue;
1339 l1 = last_word * layer1_size;
1340 window_offset = a * layer1_size;
1341 if (a > window)
1342 window_offset -= layer1_size;
1343 for (c = 0; c < layer1_size; c++)
1344 neu1e[c] = 0;
1345 // HIERARCHICAL SOFTMAX
1346 if (hs)
1347 for (d = 0; d < vocab[word].codelen; d++) {
1348 f = 0;
1349 l2 = vocab[word].point[d] * window_layer_size;
1350 // Propagate hidden -> output
1351 for (c = 0; c < layer1_size; c++)
1352 f += syn0[c + l1]
1353 * syn1_window[c + l2 + window_offset];
1354 if (f <= -MAX_EXP)
1355 continue;
1356 else if (f >= MAX_EXP)
1357 continue;
1358 else
1359 f = expTable[(int) ((f + MAX_EXP)
1360 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1361 // 'g' is the gradient multiplied by the learning rate
1362 g = (1 - vocab[word].code[d] - f) * alpha;
1363 // Propagate errors output -> hidden
1364 for (c = 0; c < layer1_size; c++)
1365 neu1e[c] += g
1366 * syn1_window[c + l2 + window_offset];
1367 // Learn weights hidden -> output
// (the matrix updated here must be syn1_window: plain syn1 has no per-position
// columns, so indexing it with l2 + window_offset would run past its end)
1368 for (c = 0; c < layer1_size; c++)
1369 syn1_window[c + l2 + window_offset] += g
1370 * syn0[c + l1];
1371 if (cap == 1)
1372 for (c = 0; c < layer1_size; c++)
1373 capParam(syn1_window, c + l2 + window_offset);
1374 }
1375 // NEGATIVE SAMPLING
1376 if (negative > 0)
1377 for (d = 0; d < negative + 1; d++) {
1378 if (d == 0) {
1379 target = word;
1380 label = 1;
1381 } else {
1382 next_random = next_random
1383 * (unsigned long long) 25214903917 + 11;
1384 if (word_to_group != NULL
1385 && word_to_group[word] != -1) {
1386 target = word;
1387 while (target == word) {
1388 target =
1389 group_to_table[word_to_group[word]
1390 * table_size
1391 + (next_random >> 16)
1392 % table_size];
1393 next_random =
1394 next_random
1395 * (unsigned long long) 25214903917
1396 + 11;
1397 }
1398 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1399 } else {
1400 target = table[(next_random >> 16)
1401 % table_size];
1402 }
1403 if (target == 0)
1404 target = next_random % (vocab_size - 1) + 1;
1405 if (target == word)
1406 continue;
1407 label = 0;
1408 }
1409 l2 = target * window_layer_size;
1410 f = 0;
1411 for (c = 0; c < layer1_size; c++)
1412 f +=
1413 syn0[c + l1]
1414 * syn1neg_window[c + l2
1415 + window_offset];
1416 if (f > MAX_EXP)
1417 g = (label - 1) * alpha;
1418 else if (f < -MAX_EXP)
1419 g = (label - 0) * alpha;
1420 else
1421 g =
1422 (label
1423 - expTable[(int) ((f + MAX_EXP)
1424 * (EXP_TABLE_SIZE
1425 / MAX_EXP / 2))])
1426 * alpha;
1427 if(debug_mode > 2 && ((long long) id) == 0) {
1428 printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
1429 printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
1430 }
1431 for (c = 0; c < layer1_size; c++)
1432 neu1e[c] +=
1433 g
1434 * syn1neg_window[c + l2
1435 + window_offset];
1436 for (c = 0; c < layer1_size; c++)
1437 syn1neg_window[c + l2 + window_offset] += g
1438 * syn0[c + l1];
1439 if (cap == 1)
1440 for (c = 0; c < layer1_size; c++)
1441 capParam(syn1neg_window,
1442 c + l2 + window_offset);
1443 }
1444 // Noise Contrastive Estimation
1445 if (nce > 0)
1446 for (d = 0; d < nce + 1; d++) {
1447 if (d == 0) {
1448 target = word;
1449 label = 1;
1450 } else {
1451 next_random = next_random
1452 * (unsigned long long) 25214903917 + 11;
1453 if (word_to_group != NULL
1454 && word_to_group[word] != -1) {
1455 target = word;
1456 while (target == word) {
1457 target =
1458 group_to_table[word_to_group[word]
1459 * table_size
1460 + (next_random >> 16)
1461 % table_size];
1462 next_random =
1463 next_random
1464 * (unsigned long long) 25214903917
1465 + 11;
1466 }
1467 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1468 } else {
1469 target = table[(next_random >> 16)
1470 % table_size];
1471 }
1472 if (target == 0)
1473 target = next_random % (vocab_size - 1) + 1;
1474 if (target == word)
1475 continue;
1476 label = 0;
1477 }
1478 l2 = target * window_layer_size;
1479 f = 0;
1480 for (c = 0; c < layer1_size; c++)
1481 f +=
1482 syn0[c + l1]
1483 * syn1nce_window[c + l2
1484 + window_offset];
1485 if (f > MAX_EXP)
1486 g = (label - 1) * alpha;
1487 else if (f < -MAX_EXP)
1488 g = (label - 0) * alpha;
1489 else {
1490 f = exp(f);
1491 g = (label
1492 - f
1493 / (noise_distribution[target]
1494 * nce + f)) * alpha;
1495 }
1496 for (c = 0; c < layer1_size; c++)
1497 neu1e[c] +=
1498 g
1499 * syn1nce_window[c + l2
1500 + window_offset];
1501 for (c = 0; c < layer1_size; c++)
1502 syn1nce_window[c + l2 + window_offset] += g
1503 * syn0[c + l1];
1504 if (cap == 1)
1505 for (c = 0; c < layer1_size; c++)
1506 capParam(syn1nce_window,
1507 c + l2 + window_offset);
1508 }
1509 // Learn weights input -> hidden
1510 for (c = 0; c < layer1_size; c++) {
1511 syn0[c + l1] += neu1e[c];
1512 if (syn0[c + l1] > 50)
1513 syn0[c + l1] = 50;
1514 if (syn0[c + l1] < -50)
1515 syn0[c + l1] = -50;
1516 }
1517 }
1518 } else if (type == 4) { //training senna
1519 // in -> hidden
1520 cw = 0;
1521 for (a = 0; a < window * 2 + 1; a++)
1522 if (a != window) {
1523 c = sentence_position - window + a;
1524 if (c < 0)
1525 continue;
1526 if (c >= sentence_length)
1527 continue;
1528 last_word = sen[c];
1529 if (last_word == -1)
1530 continue;
1531 window_offset = a * layer1_size;
1532 if (a > window)
1533 window_offset -= layer1_size;
1534 for (c = 0; c < layer1_size; c++)
1535 neu1[c + window_offset] += syn0[c
1536 + last_word * layer1_size];
1537 cw++;
1538 }
1539 if (cw) {
1540 for (a = 0; a < window_hidden_size; a++) {
1541 c = a * window_layer_size;
1542 for (b = 0; b < window_layer_size; b++) {
1543 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1544 }
1545 }
1546 if (hs)
1547 for (d = 0; d < vocab[word].codelen; d++) {
1548 f = 0;
1549 l2 = vocab[word].point[d] * window_hidden_size;
1550 // Propagate hidden -> output
1551 for (c = 0; c < window_hidden_size; c++)
1552 f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1553 if (f <= -MAX_EXP)
1554 continue;
1555 else if (f >= MAX_EXP)
1556 continue;
1557 else
1558 f = expTable[(int) ((f + MAX_EXP)
1559 * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1560 // 'g' is the gradient multiplied by the learning rate
1561 g = (1 - vocab[word].code[d] - f) * alpha;
1562 // Propagate errors output -> hidden
1563 for (c = 0; c < window_hidden_size; c++)
1564 neu2e[c] += dHardTanh(neu2[c], g) * g
1565 * syn_hidden_word[c + l2];
1566 // Learn weights hidden -> output
1567 for (c = 0; c < window_hidden_size; c++)
1568 syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
1569 * neu2[c];
1570 }
1571 // NEGATIVE SAMPLING
1572 if (negative > 0)
1573 for (d = 0; d < negative + 1; d++) {
1574 if (d == 0) {
1575 target = word;
1576 label = 1;
1577 } else {
1578 next_random = next_random
1579 * (unsigned long long) 25214903917 + 11;
1580 if (word_to_group != NULL
1581 && word_to_group[word] != -1) {
1582 target = word;
1583 while (target == word) {
1584 target = group_to_table[word_to_group[word]
1585 * table_size
1586 + (next_random >> 16) % table_size];
1587 next_random = next_random
1588 * (unsigned long long) 25214903917
1589 + 11;
1590 }
1591 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1592 } else {
1593 target =
1594 table[(next_random >> 16) % table_size];
1595 }
1596 if (target == 0)
1597 target = next_random % (vocab_size - 1) + 1;
1598 if (target == word)
1599 continue;
1600 label = 0;
1601 }
1602 l2 = target * window_hidden_size;
1603 f = 0;
1604 for (c = 0; c < window_hidden_size; c++)
1605 f += hardTanh(neu2[c])
1606 * syn_hidden_word_neg[c + l2];
1607 if (f > MAX_EXP)
1608 g = (label - 1) * alpha / negative;
1609 else if (f < -MAX_EXP)
1610 g = (label - 0) * alpha / negative;
1611 else
1612 g = (label
1613 - expTable[(int) ((f + MAX_EXP)
1614 * (EXP_TABLE_SIZE / MAX_EXP / 2))])
1615 * alpha / negative;
1616 for (c = 0; c < window_hidden_size; c++)
1617 neu2e[c] += dHardTanh(neu2[c], g) * g
1618 * syn_hidden_word_neg[c + l2];
1619 for (c = 0; c < window_hidden_size; c++)
1620 syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
1621 * g * neu2[c];
1622 }
1623 for (a = 0; a < window_hidden_size; a++)
1624 for (b = 0; b < window_layer_size; b++)
1625 neu1e[b] += neu2e[a]
1626 * syn_window_hidden[a * window_layer_size + b];
1627 for (a = 0; a < window_hidden_size; a++)
1628 for (b = 0; b < window_layer_size; b++)
1629 syn_window_hidden[a * window_layer_size + b] += neu2e[a]
1630 * neu1[b];
1631 // hidden -> in
1632 for (a = 0; a < window * 2 + 1; a++)
1633 if (a != window) {
1634 c = sentence_position - window + a;
1635 if (c < 0)
1636 continue;
1637 if (c >= sentence_length)
1638 continue;
1639 last_word = sen[c];
1640 if (last_word == -1)
1641 continue;
1642 window_offset = a * layer1_size;
1643 if (a > window)
1644 window_offset -= layer1_size;
1645 for (c = 0; c < layer1_size; c++)
1646 syn0[c + last_word * layer1_size] += neu1e[c
1647 + window_offset];
1648 }
1649 }
1650 } else {
1651 printf("unknown type %i\n", type);
1652 exit(1);
1653 }
1654 sentence_position++;
1655 if (sentence_position >= sentence_length) {
1656 sentence_length = 0;
1657 continue;
1658 }
1659 }
1660 fclose(fi);
1661 free(neu1);
1662 free(neu1e);
1663 pthread_exit(NULL);
1664}
1665
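// Debug/inspection mode (-show-cc): for every word starting at vocabulary rank
// cc, the dot product of its input vector with every word's position-specific
// output vector (syn1neg_window) is pushed through the sigmoid table; the
// strongest collocate per window position is printed, target_sums accumulates
// the per-word responses across positions in a noisy-or fashion
// (s += (1 - s) * f), and the N = 10 highest single responses are listed with
// their relative positions. Requires -read-vocab and -read-net.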
1666void ShowCollocations() {
1667 long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
1668 real f, max_f, maxmax_f;
1669 real *target_sums, bestf[MAX_CC], worstbest;
1670 long besti[MAX_CC];
1671 int N = 10, bestp[MAX_CC];
1672 a = posix_memalign((void **) &target_sums, 128, vocab_size * sizeof(real));
1673
1674 for (d = cc; d < vocab_size; d++) {
1675 for (b = 0; b < vocab_size; b++)
1676 target_sums[b]=0;
1677 for (b = 0; b < N; b++)
1678 bestf[b]=-1;
1679 worstbest = -1;
1680
1681 maxmax_f = -1;
1682 maxmax_target = 0;
1683 for (a = window * 2; a >= 0; a--) { // only 2*window + 1 positions exist (a == window is the word itself)
1684 if (a != window) {
1685 max_f = -1;
1686 window_offset = a * layer1_size;
1687 if (a > window)
1688 window_offset -= layer1_size;
1689 for(target = 0; target < vocab_size; target ++) {
1690 if(target == d)
1691 continue;
1692 f = 0;
1693 for (c = 0; c < layer1_size; c++)
1694 f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
1695 if (f < -MAX_EXP)
1696 continue;
1697 else if (f > MAX_EXP)
1698 continue;
1699 else
1700 f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1701 if(f > max_f) {
1702 max_f = f;
1703 max_target = target;
1704 }
1705 target_sums[target] += (1-target_sums[target]) * f;
1706 if(f > worstbest) {
1707 for (b = 0; b < N; b++) {
1708 if (f > bestf[b]) {
1709 for (e = N - 1; e > b; e--) {
1710 bestf[e] = bestf[e - 1];
1711 besti[e] = besti[e - 1];
1712 bestp[e] = bestp[e - 1];
1713 }
1714 bestf[b] = f;
1715 besti[b] = target;
1716 bestp[b] = window-a;
1717 break;
1718 }
1719 }
1720 worstbest = bestf[N-1];
1721 }
1722 }
1723 printf("%s (%.2f) ", vocab[max_target].word, max_f);
1724 if(max_f > maxmax_f) {
1725 maxmax_f = max_f;
1726 maxmax_target = max_target;
1727 }
1728 } else {
1729 printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
1730 }
1731 }
1732 max_f = -1;
1733 for (b = 0; b < vocab_size; b++) {
1734 if(target_sums[b] > max_f) {
1735 max_f = target_sums[b];
1736 max_target = b;
1737 }
1738 }
1739 printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
1740 vocab[max_target].word, max_f,
1741 vocab[maxmax_target].word, maxmax_f);
1742 for(b=0; b<N && bestf[b]>-1; b++)
1743 printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
1744 printf("\n");
1745 }
1746}
1747
1748void TrainModel() {
1749 long a, b, c, d;
1750 FILE *fo;
1751 pthread_t *pt = (pthread_t *) malloc(num_threads * sizeof(pthread_t));
1752 printf("Starting training using file %s\n", train_file);
1753 starting_alpha = alpha;
1754 if (read_vocab_file[0] != 0)
1755 ReadVocab();
1756 else
1757 LearnVocabFromTrainFile();
1758 if (save_vocab_file[0] != 0)
1759 SaveVocab();
1760 if (output_file[0] == 0)
1761 return;
1762 InitNet();
1763 if(cc > 0)
1764 ShowCollocations();
1765 if (negative > 0 || nce > 0)
1766 InitUnigramTable();
1767 if (negative_classes_file[0] != 0)
1768 InitClassUnigramTable();
1769 start = clock();
1770 for (a = 0; a < num_threads; a++)
1771 pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
1772 for (a = 0; a < num_threads; a++)
1773 pthread_join(pt[a], NULL);
1774 fo = fopen(output_file, "wb");
1775 if (classes == 0) {
1776 // Save the word vectors
1777 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1778 for (a = 0; a < vocab_size; a++) {
1779 fprintf(fo, "%s ", vocab[a].word);
1780 if (binary)
1781 for (b = 0; b < layer1_size; b++)
1782 fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1783 else
1784 for (b = 0; b < layer1_size; b++)
1785 fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1786 fprintf(fo, "\n");
1787 }
1788 } else {
1789 // Run K-means on the word vectors
1790 int clcn = classes, iter = 10, closeid;
1791 int *centcn = (int *) malloc(classes * sizeof(int));
1792 int *cl = (int *) calloc(vocab_size, sizeof(int));
1793 real closev, x;
1794 real *cent = (real *) calloc(classes * layer1_size, sizeof(real));
1795 for (a = 0; a < vocab_size; a++)
1796 cl[a] = a % clcn;
1797 for (a = 0; a < iter; a++) {
1798 for (b = 0; b < clcn * layer1_size; b++)
1799 cent[b] = 0;
1800 for (b = 0; b < clcn; b++)
1801 centcn[b] = 1;
1802 for (c = 0; c < vocab_size; c++) {
1803 for (d = 0; d < layer1_size; d++)
1804 cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1805 centcn[cl[c]]++;
1806 }
1807 for (b = 0; b < clcn; b++) {
1808 closev = 0;
1809 for (c = 0; c < layer1_size; c++) {
1810 cent[layer1_size * b + c] /= centcn[b];
1811 closev += cent[layer1_size * b + c]
1812 * cent[layer1_size * b + c];
1813 }
1814 closev = sqrt(closev);
1815 for (c = 0; c < layer1_size; c++)
1816 cent[layer1_size * b + c] /= closev;
1817 }
1818 for (c = 0; c < vocab_size; c++) {
1819 closev = -10;
1820 closeid = 0;
1821 for (d = 0; d < clcn; d++) {
1822 x = 0;
1823 for (b = 0; b < layer1_size; b++)
1824 x += cent[layer1_size * d + b]
1825 * syn0[c * layer1_size + b];
1826 if (x > closev) {
1827 closev = x;
1828 closeid = d;
1829 }
1830 }
1831 cl[c] = closeid;
1832 }
1833 }
1834 // Save the K-means classes
1835 for (a = 0; a < vocab_size; a++)
1836 fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1837 free(centcn);
1838 free(cent);
1839 free(cl);
1840 }
1841 fclose(fo);
1842 if (save_net_file[0] != 0)
1843 SaveNet();
1844}
1845
1846int ArgPos(char *str, int argc, char **argv) {
1847 int a;
1848 for (a = 1; a < argc; a++)
1849 if (!strcmp(str, argv[a])) {
1850 if (a == argc - 1) {
1851 printf("Argument missing for %s\n", str);
1852 exit(1);
1853 }
1854 return a;
1855 }
1856 return -1;
1857}
1858
1859int main(int argc, char **argv) {
1860 int i;
1861 if (argc == 1) {
1862 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1863 printf("Options:\n");
1864 printf("Parameters for training:\n");
1865 printf("\t-train <file>\n");
1866 printf("\t\tUse text data from <file> to train the model\n");
1867 printf("\t-output <file>\n");
1868 printf(
1869 "\t\tUse <file> to save the resulting word vectors / word clusters\n");
1870 printf("\t-size <int>\n");
1871 printf("\t\tSet size of word vectors; default is 100\n");
1872 printf("\t-window <int>\n");
1873 printf("\t\tSet max skip length between words; default is 5\n");
1874 printf("\t-sample <float>\n");
1875 printf(
1876 "\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1877 printf(
1878 "\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1879 printf("\t-hs <int>\n");
1880 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1881 printf("\t-negative <int>\n");
1882 printf(
1883 "\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1884 printf("\t-negative-classes <file>\n");
1885 printf("\t\tNegative classes to sample from\n");
1886 printf("\t-nce <int>\n");
1887 printf(
1888 "\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1889 printf("\t-threads <int>\n");
1890 printf("\t\tUse <int> threads (default 12)\n");
1891 printf("\t-iter <int>\n");
1892 printf("\t\tRun more training iterations (default 5)\n");
1893 printf("\t-min-count <int>\n");
1894 printf(
1895 "\t\tThis will discard words that appear less than <int> times; default is 5\n");
1896 printf("\t-alpha <float>\n");
1897 printf(
1898 "\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1899 printf("\t-classes <int>\n");
1900 printf(
1901 "\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1902 printf("\t-debug <int>\n");
1903 printf(
1904 "\t\tSet the debug mode (default = 2 = more info during training)\n");
1905 printf("\t-binary <int>\n");
1906 printf(
1907 "\t\tSave the resulting vectors in binary mode; default is 0 (off)\n");
1908 printf("\t-save-vocab <file>\n");
1909 printf("\t\tThe vocabulary will be saved to <file>\n");
1910 printf("\t-read-vocab <file>\n");
1911 printf(
1912 "\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1913 printf("\t-read-net <file>\n");
1914 printf(
1915 "\t\tThe net parameters will be read from <file>, not initialized randomly\n");
1916 printf("\t-save-net <file>\n");
1917 printf("\t\tThe net parameters will be saved to <file>\n");
1918 printf("\t-show-cc <int>\n");
1919 printf("\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n");
1920 printf("\t-type <int>\n");
1921 printf(
1922 "\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type)\n");
1923 printf("\t-cap <int>\n");
1924 printf(
1925 "\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1926 printf("\nExamples:\n");
1927 printf(
1928 "./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1929 return 0;
1930 }
1931 output_file[0] = 0;
1932 save_vocab_file[0] = 0;
1933 read_vocab_file[0] = 0;
1934 save_net_file[0] = 0;
1935 read_net_file[0] = 0;
1936 negative_classes_file[0] = 0;
1937 if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
1938 layer1_size = atoi(argv[i + 1]);
1939 if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
1940 strcpy(train_file, argv[i + 1]);
1941 if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
1942 strcpy(save_vocab_file, argv[i + 1]);
1943 if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
1944 strcpy(read_vocab_file, argv[i + 1]);
1945 if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
1946 strcpy(save_net_file, argv[i + 1]);
1947 if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
1948 strcpy(read_net_file, argv[i + 1]);
1949 if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
1950 debug_mode = atoi(argv[i + 1]);
1951 if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
1952 binary = atoi(argv[i + 1]);
1953 if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
1954 cc = atoi(argv[i + 1]);
1955 if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
1956 type = atoi(argv[i + 1]);
1957 if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
1958 strcpy(output_file, argv[i + 1]);
1959 if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
1960 window = atoi(argv[i + 1]);
1961 if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
1962 sample = atof(argv[i + 1]);
1963 if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
1964 hs = atoi(argv[i + 1]);
1965 if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
1966 negative = atoi(argv[i + 1]);
1967 if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
1968 strcpy(negative_classes_file, argv[i + 1]);
1969 if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
1970 nce = atoi(argv[i + 1]);
1971 if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
1972 num_threads = atoi(argv[i + 1]);
1973 if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
1974 iter = atoi(argv[i + 1]);
1975 if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
1976 min_count = atoi(argv[i + 1]);
1977 if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
1978 classes = atoi(argv[i + 1]);
1979 if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
1980 cap = atoi(argv[i + 1]);
1981 if (type == 0 || type == 2 || type == 4)
1982 alpha = 0.05;
1983 if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
1984 alpha = atof(argv[i + 1]);
1985 vocab = (struct vocab_word *) calloc(vocab_max_size,
1986 sizeof(struct vocab_word));
1987 vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
1988 expTable = (real *) malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1989 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1990 expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1991 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1992 }
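// expTable[i] holds sigmoid(x) for x = (i / EXP_TABLE_SIZE * 2 - 1) * MAX_EXP,
// i.e. 1000 samples of 1 / (1 + exp(-x)) on [-6, 6]. During training the lookup
// index is (int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2)); values of f
// outside +-MAX_EXP are treated as 0 or 1.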
1993 SaveArgs(argc, argv);
1994 TrainModel();
1995 return 0;
1996}
1997