1// Copyright 2013 Google Inc. All Rights Reserved.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7// http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15#include <stdio.h>
16#include <stdlib.h>
17#include <string.h>
18#include <math.h>
19#include <pthread.h>
20#include "collocatordb.h"
21
22#define MAX_STRING 100
23#define EXP_TABLE_SIZE 1000
24#define MAX_EXP 6
25#define MAX_SENTENCE_LENGTH 1000
26#define MAX_CODE_LENGTH 40
27
28const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
29
30typedef float real; // Precision of float numbers
31
32struct vocab_word {
33 long long cn;
34 int *point;
35 char *word, *code, codelen;
36};
37
38char train_file[MAX_STRING], output_file[MAX_STRING];
39char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
40struct vocab_word *vocab;
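// type selects the training architecture: 0 = cbow, 1 = skip-gram, 2 = cwindow,
// 3 = structured skip-gram, 4 = senna-style, 5 = store positional bigrams
// (collocation counting); see the -type option in main().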
41int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5, num_threads = 12, min_reduce = 1;
42int *vocab_hash;
43long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
44long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0, classes = 0;
45real alpha = 0.025, starting_alpha, sample = 1e-3;
46real *syn0, *syn1, *syn1neg, *syn1nce, *expTable;
47clock_t start;
48
49real *syn1_window, *syn1neg_window, *syn1nce_window;
50int w_offset, window_layer_size;
51
52int window_hidden_size = 500;
53real *syn_window_hidden, *syn_hidden_word, *syn_hidden_word_neg, *syn_hidden_word_nce;
54
55int hs = 0, negative = 5;
56const int table_size = 1e8;
57int *table;
58
59//contrastive negative sampling
60char negative_classes_file[MAX_STRING];
61int *word_to_group;
62int *group_to_table; //group_size*table_size
63int class_number;
64
65//nce
66real* noise_distribution;
67int nce = 0;
68
69//param caps
70real CAP_VALUE = 50;
71int cap = 0;
72
73COLLOCATORS *cdb = NULL;
74
75void capParam(real* array, int index){
76 if(array[index] > CAP_VALUE)
77 array[index] = CAP_VALUE;
78 else if(array[index] < -CAP_VALUE)
79 array[index] = -CAP_VALUE;
80}
81
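// Hard tanh activation (clipped to [-1, 1]) and its gradient gate, used on the
// hidden layer of the senna-style architecture (type 4).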
82real hardTanh(real x){
83 if(x>=1){
84 return 1;
85 }
86 else if(x<=-1){
87 return -1;
88 }
89 else{
90 return x;
91 }
92}
93
94real dHardTanh(real x, real g){
95 if(x > 1 && g > 0){
96 return 0;
97 }
98 if(x < -1 && g < 0){
99 return 0;
100 }
101 return 1;
102}
103
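// Build the table used for negative sampling: each word gets a share of the table
// proportional to its count raised to the power 0.75, so frequent words are drawn
// more often, but not in direct proportion to their frequency. The same distribution
// is stored in noise_distribution for NCE.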
104void InitUnigramTable() {
105 int a, i;
106 long long train_words_pow = 0;
107 real d1, power = 0.75;
108 table = (int *)malloc(table_size * sizeof(int));
109 for (a = 0; a < vocab_size; a++) train_words_pow += pow(vocab[a].cn, power);
110 i = 0;
111 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
112 for (a = 0; a < table_size; a++) {
113 table[a] = i;
114 if (a / (real)table_size > d1) {
115 i++;
116 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
117 }
118 if (i >= vocab_size) i = vocab_size - 1;
119 }
120
121 noise_distribution = (real *)calloc(vocab_size, sizeof(real));
122 for (a = 0; a < vocab_size; a++) noise_distribution[a] = pow(vocab[a].cn, power)/(real)train_words_pow;
123}
124
125// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
126void ReadWord(char *word, FILE *fin) {
127 int a = 0, ch;
128 while (!feof(fin)) {
129 ch = fgetc(fin);
130 if (ch == 13) continue;
131 if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
132 if (a > 0) {
133 if (ch == '\n') ungetc(ch, fin);
134 break;
135 }
136 if (ch == '\n') {
137 strcpy(word, (char *)"</s>");
138 return;
139 } else continue;
140 }
141 word[a] = ch;
142 a++;
143 if (a >= MAX_STRING - 1) a--; // Truncate words that are too long
144 }
145 word[a] = 0;
146}
147
148// Returns hash value of a word
149int GetWordHash(char *word) {
150 unsigned long long a, hash = 0;
151 for (a = 0; a < strlen(word); a++) hash = hash * 257 + word[a];
152 hash = hash % vocab_hash_size;
153 return hash;
154}
155
156// Returns position of a word in the vocabulary; if the word is not found, returns -1
157int SearchVocab(char *word) {
158 unsigned int hash = GetWordHash(word);
159 while (1) {
160 if (vocab_hash[hash] == -1) return -1;
161 if (!strcmp(word, vocab[vocab_hash[hash]].word)) return vocab_hash[hash];
162 hash = (hash + 1) % vocab_hash_size;
163 }
164 return -1;
165}
166
167// Reads a word and returns its index in the vocabulary
168int ReadWordIndex(FILE *fin) {
169 char word[MAX_STRING];
170 ReadWord(word, fin);
171 if (feof(fin)) return -1;
172 return SearchVocab(word);
173}
174
175// Adds a word to the vocabulary
176int AddWordToVocab(char *word) {
177 unsigned int hash, length = strlen(word) + 1;
178 if (length > MAX_STRING) length = MAX_STRING;
179 vocab[vocab_size].word = (char *)calloc(length, sizeof(char));
180 strcpy(vocab[vocab_size].word, word);
181 vocab[vocab_size].cn = 0;
182 vocab_size++;
183 // Reallocate memory if needed
184 if (vocab_size + 2 >= vocab_max_size) {
185 vocab_max_size += 1000;
186 vocab = (struct vocab_word *)realloc(vocab, vocab_max_size * sizeof(struct vocab_word));
187 }
188 hash = GetWordHash(word);
189 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
190 vocab_hash[hash] = vocab_size - 1;
191 return vocab_size - 1;
192}
193
194// Used later for sorting by word counts
195int VocabCompare(const void *a, const void *b) {
196 return ((struct vocab_word *)b)->cn - ((struct vocab_word *)a)->cn;
197}
198
199// Sorts the vocabulary by frequency using word counts
200void SortVocab() {
201 int a, size;
202 unsigned int hash;
203 // Sort the vocabulary and keep </s> at the first position
204 qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
205 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
206 size = vocab_size;
207 train_words = 0;
208 for (a = 0; a < size; a++) {
209 // Words occurring less than min_count times will be discarded from the vocab
210 if ((vocab[a].cn < min_count) && (a != 0)) {
211 vocab_size--;
212 free(vocab[a].word);
213 } else {
214 // Hash will be re-computed, as it is no longer valid after sorting
215 hash=GetWordHash(vocab[a].word);
216 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
217 vocab_hash[hash] = a;
218 train_words += vocab[a].cn;
219 }
220 }
221 vocab = (struct vocab_word *)realloc(vocab, (vocab_size + 1) * sizeof(struct vocab_word));
222 // Allocate memory for the binary tree construction
223 for (a = 0; a < vocab_size; a++) {
224 vocab[a].code = (char *)calloc(MAX_CODE_LENGTH, sizeof(char));
225 vocab[a].point = (int *)calloc(MAX_CODE_LENGTH, sizeof(int));
226 }
227}
228
229// Reduces the vocabulary by removing infrequent tokens
230void ReduceVocab() {
231 int a, b = 0;
232 unsigned int hash;
233 for (a = 0; a < vocab_size; a++) if (vocab[a].cn > min_reduce) {
234 vocab[b].cn = vocab[a].cn;
235 vocab[b].word = vocab[a].word;
236 b++;
237 } else free(vocab[a].word);
238 vocab_size = b;
239 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
240 for (a = 0; a < vocab_size; a++) {
241 // Hash will be re-computed, as it is no longer valid
242 hash = GetWordHash(vocab[a].word);
243 while (vocab_hash[hash] != -1) hash = (hash + 1) % vocab_hash_size;
244 vocab_hash[hash] = a;
245 }
246 fflush(stdout);
247 min_reduce++;
248}
249
250// Create binary Huffman tree using the word counts
251// Frequent words will have short unique binary codes
252void CreateBinaryTree() {
253 long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
254 char code[MAX_CODE_LENGTH];
255 long long *count = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
256 long long *binary = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
257 long long *parent_node = (long long *)calloc(vocab_size * 2 + 1, sizeof(long long));
258 for (a = 0; a < vocab_size; a++) count[a] = vocab[a].cn;
259 for (a = vocab_size; a < vocab_size * 2; a++) count[a] = 1e15;
260 pos1 = vocab_size - 1;
261 pos2 = vocab_size;
262 // The following algorithm constructs the Huffman tree by adding one node at a time
263 for (a = 0; a < vocab_size - 1; a++) {
264 // First, find two smallest nodes 'min1, min2'
265 if (pos1 >= 0) {
266 if (count[pos1] < count[pos2]) {
267 min1i = pos1;
268 pos1--;
269 } else {
270 min1i = pos2;
271 pos2++;
272 }
273 } else {
274 min1i = pos2;
275 pos2++;
276 }
277 if (pos1 >= 0) {
278 if (count[pos1] < count[pos2]) {
279 min2i = pos1;
280 pos1--;
281 } else {
282 min2i = pos2;
283 pos2++;
284 }
285 } else {
286 min2i = pos2;
287 pos2++;
288 }
289 count[vocab_size + a] = count[min1i] + count[min2i];
290 parent_node[min1i] = vocab_size + a;
291 parent_node[min2i] = vocab_size + a;
292 binary[min2i] = 1;
293 }
294 // Now assign binary code to each vocabulary word
295 for (a = 0; a < vocab_size; a++) {
296 b = a;
297 i = 0;
298 while (1) {
299 code[i] = binary[b];
300 point[i] = b;
301 i++;
302 b = parent_node[b];
303 if (b == vocab_size * 2 - 2) break;
304 }
305 vocab[a].codelen = i;
306 vocab[a].point[0] = vocab_size - 2;
307 for (b = 0; b < i; b++) {
308 vocab[a].code[i - b - 1] = code[b];
309 vocab[a].point[i - b] = point[b] - vocab_size;
310 }
311 }
312 free(count);
313 free(binary);
314 free(parent_node);
315}
316
317void LearnVocabFromTrainFile() {
318 char word[MAX_STRING];
319 FILE *fin;
320 long long a, i;
321 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
322 fin = fopen(train_file, "rb");
323 if (fin == NULL) {
324 printf("ERROR: training data file not found!\n");
325 exit(1);
326 }
327 vocab_size = 0;
328 AddWordToVocab((char *)"</s>");
329 while (1) {
330 ReadWord(word, fin);
331 if (feof(fin)) break;
332 train_words++;
333 if ((debug_mode > 1) && (train_words % 100000 == 0)) {
334 printf("%lldK%c", train_words / 1000, 13);
335 fflush(stdout);
336 }
337 i = SearchVocab(word);
338 if (i == -1) {
339 a = AddWordToVocab(word);
340 vocab[a].cn = 1;
341 } else vocab[i].cn++;
342 if (vocab_size > vocab_hash_size * 0.7) ReduceVocab();
343 }
344 SortVocab();
345 if (debug_mode > 0) {
346 printf("Vocab size: %lld\n", vocab_size);
347 printf("Words in train file: %lld\n", train_words);
348 }
349 file_size = ftell(fin);
350 fclose(fin);
351}
352
353void SaveVocab() {
354 long long i;
355 FILE *fo = fopen(save_vocab_file, "wb");
356 for (i = 0; i < vocab_size; i++) fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
357 fclose(fo);
358}
359
360void ReadVocab() {
361 long long a, i = 0;
362 char c;
363 char word[MAX_STRING];
364 FILE *fin = fopen(read_vocab_file, "rb");
365 if (fin == NULL) {
366 printf("Vocabulary file not found\n");
367 exit(1);
368 }
369 for (a = 0; a < vocab_hash_size; a++) vocab_hash[a] = -1;
370 vocab_size = 0;
371 while (1) {
372 ReadWord(word, fin);
373 if (feof(fin)) break;
374 a = AddWordToVocab(word);
375 fscanf(fin, "%lld%c", &vocab[a].cn, &c);
376 i++;
377 }
378 SortVocab();
379 if (debug_mode > 0) {
380 printf("Vocab size: %lld\n", vocab_size);
381 printf("Words in train file: %lld\n", train_words);
382 }
383 fin = fopen(train_file, "rb");
384 if (fin == NULL) {
385 printf("ERROR: training data file not found!\n");
386 exit(1);
387 }
388 fseek(fin, 0, SEEK_END);
389 file_size = ftell(fin);
390 fclose(fin);
391}
392
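// Read word -> class assignments from negative_classes_file and build one unigram
// table per class, so that negative samples can be drawn from the same class as the
// current target word.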
393void InitClassUnigramTable() {
394 long long a,c;
395 printf("loading class unigrams \n");
396 FILE *fin = fopen(negative_classes_file, "rb");
397 if (fin == NULL) {
398 printf("ERROR: class file not found!\n");
399 exit(1);
400 }
401 word_to_group = (int *)malloc(vocab_size * sizeof(int));
402 for(a = 0; a < vocab_size; a++) word_to_group[a] = -1;
403 char class[MAX_STRING];
404 char prev_class[MAX_STRING];
405 prev_class[0] = 0;
406 char word[MAX_STRING];
407 class_number = -1;
408 while (1) {
409 if (feof(fin)) break;
410 ReadWord(class, fin);
411 ReadWord(word, fin);
412 int word_index = SearchVocab(word);
413 if (word_index != -1){
414 if(strcmp(class, prev_class) != 0){
415 class_number++;
416 strcpy(prev_class, class);
417 }
418 word_to_group[word_index] = class_number;
419 }
420 ReadWord(word, fin);
421 }
422 class_number++;
423 fclose(fin);
424
425 group_to_table = (int *)malloc(table_size * class_number * sizeof(int));
426 long long train_words_pow = 0;
427 real d1, power = 0.75;
428
429 for(c = 0; c < class_number; c++){
430 long long offset = c * table_size;
431 train_words_pow = 0;
432 for (a = 0; a < vocab_size; a++) if(word_to_group[a] == c) train_words_pow += pow(vocab[a].cn, power);
433 int i = 0;
434 while(word_to_group[i]!=c && i < vocab_size) i++;
435 d1 = pow(vocab[i].cn, power) / (real)train_words_pow;
436 for (a = 0; a < table_size; a++) {
437 //printf("index %lld , word %d\n", a, i);
438 group_to_table[offset + a] = i;
439 if (a / (real)table_size > d1) {
440 i++;
441 while(word_to_group[i]!=c && i < vocab_size) i++;
442 d1 += pow(vocab[i].cn, power) / (real)train_words_pow;
443 }
444 if (i >= vocab_size) while(word_to_group[i]!=c && i >= 0) i--;
445 }
446 }
447}
448
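// Allocate the network: the input word vectors (syn0) are initialized to small random
// values, while all output weight matrices (hs, negative-sampling and NCE variants)
// start at zero.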
449void InitNet() {
450 long long a, b;
451 unsigned long long next_random = 1;
452 window_layer_size = layer1_size*window*2;
453 a = posix_memalign((void **)&syn0, 128, (long long)vocab_size * layer1_size * sizeof(real));
454 if (syn0 == NULL) {printf("Memory allocation failed\n"); exit(1);}
455
456 if (hs) {
457 a = posix_memalign((void **)&syn1, 128, (long long)vocab_size * layer1_size * sizeof(real));
458 if (syn1 == NULL) {printf("Memory allocation failed\n"); exit(1);}
459 a = posix_memalign((void **)&syn1_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
460 if (syn1_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
461 a = posix_memalign((void **)&syn_hidden_word, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
462 if (syn_hidden_word == NULL) {printf("Memory allocation failed\n"); exit(1);}
463
464 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
465 syn1[a * layer1_size + b] = 0;
466 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
467 syn1_window[a * window_layer_size + b] = 0;
468 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
469 syn_hidden_word[a * window_hidden_size + b] = 0;
470 }
471 if (negative>0) {
472 a = posix_memalign((void **)&syn1neg, 128, (long long)vocab_size * layer1_size * sizeof(real));
473 if (syn1neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
474 a = posix_memalign((void **)&syn1neg_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
475 if (syn1neg_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
476 a = posix_memalign((void **)&syn_hidden_word_neg, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
477 if (syn_hidden_word_neg == NULL) {printf("Memory allocation failed\n"); exit(1);}
478
479 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
480 syn1neg[a * layer1_size + b] = 0;
481 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
482 syn1neg_window[a * window_layer_size + b] = 0;
483 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
484 syn_hidden_word_neg[a * window_hidden_size + b] = 0;
485 }
486 if (nce>0) {
487 a = posix_memalign((void **)&syn1nce, 128, (long long)vocab_size * layer1_size * sizeof(real));
488 if (syn1nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
489 a = posix_memalign((void **)&syn1nce_window, 128, (long long)vocab_size * window_layer_size * sizeof(real));
490 if (syn1nce_window == NULL) {printf("Memory allocation failed\n"); exit(1);}
491 a = posix_memalign((void **)&syn_hidden_word_nce, 128, (long long)vocab_size * window_hidden_size * sizeof(real));
492 if (syn_hidden_word_nce == NULL) {printf("Memory allocation failed\n"); exit(1);}
493
494 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++)
495 syn1nce[a * layer1_size + b] = 0;
496 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_layer_size; b++)
497 syn1nce_window[a * window_layer_size + b] = 0;
498 for (a = 0; a < vocab_size; a++) for (b = 0; b < window_hidden_size; b++)
499 syn_hidden_word_nce[a * window_hidden_size + b] = 0;
500 }
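// Randomly initialize the input word vectors in (-0.5, 0.5) / layer1_size using a
// simple linear congruential generator (the same constants are reused for sampling
// during training).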
501 for (a = 0; a < vocab_size; a++) for (b = 0; b < layer1_size; b++) {
502 next_random = next_random * (unsigned long long)25214903917 + 11;
503 syn0[a * layer1_size + b] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / layer1_size;
504 }
505
506 a = posix_memalign((void **)&syn_window_hidden, 128, window_hidden_size * window_layer_size * sizeof(real));
507 if (syn_window_hidden == NULL) {printf("Memory allocation failed\n"); exit(1);}
508 for (a = 0; a < window_hidden_size * window_layer_size; a++){
509 next_random = next_random * (unsigned long long)25214903917 + 11;
510 syn_window_hidden[a] = (((next_random & 0xFFFF) / (real)65536) - 0.5) / (window_hidden_size*window_layer_size);
511 }
512
513 CreateBinaryTree();
514}
515
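// One training thread: each thread starts reading at its own offset into the training
// file, processes roughly train_words / num_threads words per iteration, and decays the
// shared learning rate alpha according to the global progress counter word_count_actual.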
516void *TrainModelThread(void *id) {
517 long long a, b, d, cw, word, last_word, sentence_length = 0, sentence_position = 0;
518 long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
519 long long l1, l2, c, target, label, local_iter = iter;
520 unsigned long long next_random = (long long)id;
521 real f, g;
522 clock_t now;
523 int input_len_1 = layer1_size;
524 int window_offset = -1;
525 if(type == 2 || type == 4){
526 input_len_1=window_layer_size;
527 }
528 real *neu1 = (real *)calloc(input_len_1, sizeof(real));
529 real *neu1e = (real *)calloc(input_len_1, sizeof(real));
530
531 int input_len_2 = 0;
532 if(type == 4){
533 input_len_2 = window_hidden_size;
534 }
535 real *neu2 = (real *)calloc(input_len_2, sizeof(real));
536 real *neu2e = (real *)calloc(input_len_2, sizeof(real));
537
538 FILE *fi = fopen(train_file, "rb");
539 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
540 while (1) {
541 if (word_count - last_word_count > 10000) {
542 word_count_actual += word_count - last_word_count;
543 last_word_count = word_count;
544 if ((debug_mode > 1)) {
545 now=clock();
546 printf("%cAlpha: %f Progress: %.2f%% Words/thread/sec: %.2fk ", 13, alpha,
547 word_count_actual / (real)(iter * train_words + 1) * 100,
548 word_count_actual / ((real)(now - start + 1) / (real)CLOCKS_PER_SEC * 1000));
549 fflush(stdout);
550 }
551 alpha = starting_alpha * (1 - word_count_actual / (real)(iter * train_words + 1));
552 if (alpha < starting_alpha * 0.0001) alpha = starting_alpha * 0.0001;
553 }
554 if (sentence_length == 0) {
555 while (1) {
556 word = ReadWordIndex(fi);
557 if (feof(fi)) break;
558 if (word == -1) continue;
559 word_count++;
560 if (word == 0) break;
561 // The subsampling randomly discards frequent words while keeping the ranking the same
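// A word is kept with probability (sqrt(f/t) + 1) * t/f, where f = cn / train_words and t = sample (capped at 1).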
562 if (sample > 0) {
563 real ran = (sqrt(vocab[word].cn / (sample * train_words)) + 1) * (sample * train_words) / vocab[word].cn;
564 next_random = next_random * (unsigned long long)25214903917 + 11;
565 if (ran < (next_random & 0xFFFF) / (real)65536) continue;
566 }
567 sen[sentence_length] = word;
568 sentence_length++;
569 if (sentence_length >= MAX_SENTENCE_LENGTH) break;
570 }
571 sentence_position = 0;
572 }
573 if (feof(fi) || (word_count > train_words / num_threads)) {
574 word_count_actual += word_count - last_word_count;
575 local_iter--;
576 if (local_iter == 0) break;
577 word_count = 0;
578 last_word_count = 0;
579 sentence_length = 0;
580 fseek(fi, file_size / (long long)num_threads * (long long)id, SEEK_SET);
581 continue;
582 }
583 word = sen[sentence_position];
584 if (word == -1) continue;
585 for (c = 0; c < input_len_1; c++) neu1[c] = 0;
586 for (c = 0; c < input_len_1; c++) neu1e[c] = 0;
587 for (c = 0; c < input_len_2; c++) neu2[c] = 0;
588 for (c = 0; c < input_len_2; c++) neu2e[c] = 0;
589 next_random = next_random * (unsigned long long)25214903917 + 11;
590 b = next_random % window;
591 if (type == 0) { //train the cbow architecture
592 // in -> hidden
593 cw = 0;
594 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
595 c = sentence_position - window + a;
596 if (c < 0) continue;
597 if (c >= sentence_length) continue;
598 last_word = sen[c];
599 if (last_word == -1) continue;
600 for (c = 0; c < layer1_size; c++) neu1[c] += syn0[c + last_word * layer1_size];
601 cw++;
602 }
603 if (cw) {
604 for (c = 0; c < layer1_size; c++) neu1[c] /= cw;
605 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
606 f = 0;
607 l2 = vocab[word].point[d] * layer1_size;
608 // Propagate hidden -> output
609 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1[c + l2];
610 if (f <= -MAX_EXP) continue;
611 else if (f >= MAX_EXP) continue;
612 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
613 // 'g' is the gradient multiplied by the learning rate
614 g = (1 - vocab[word].code[d] - f) * alpha;
615 // Propagate errors output -> hidden
616 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
617 // Learn weights hidden -> output
618 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * neu1[c];
619 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
620 }
621 // NEGATIVE SAMPLING
622 if (negative > 0) for (d = 0; d < negative + 1; d++) {
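// d == 0 is the positive example (the target word itself); the remaining draws are
// negatives, taken from the class-restricted table when negative classes are loaded
// and from the global unigram table otherwise, never sampling the target word itself.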
623 if (d == 0) {
624 target = word;
625 label = 1;
626 } else {
627 next_random = next_random * (unsigned long long)25214903917 + 11;
628 if(word_to_group != NULL && word_to_group[word] != -1){
629 target = word;
630 while(target == word) {
631 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
632 next_random = next_random * (unsigned long long)25214903917 + 11;
633 }
634 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
635 }
636 else{
637 target = table[(next_random >> 16) % table_size];
638 }
639 if (target == 0) target = next_random % (vocab_size - 1) + 1;
640 if (target == word) continue;
641 label = 0;
642 }
643 l2 = target * layer1_size;
644 f = 0;
645 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1neg[c + l2];
646 if (f > MAX_EXP) g = (label - 1) * alpha;
647 else if (f < -MAX_EXP) g = (label - 0) * alpha;
648 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
649 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
650 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * neu1[c];
651 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
652 }
653 // Noise Contrastive Estimation
654 if (nce > 0) for (d = 0; d < nce + 1; d++) {
655 if (d == 0) {
656 target = word;
657 label = 1;
658 } else {
659 next_random = next_random * (unsigned long long)25214903917 + 11;
660 if(word_to_group != NULL && word_to_group[word] != -1){
661 target = word;
662 while(target == word) {
663 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
664 next_random = next_random * (unsigned long long)25214903917 + 11;
665 }
666 }
667 else{
668 target = table[(next_random >> 16) % table_size];
669 }
670 if (target == 0) target = next_random % (vocab_size - 1) + 1;
671 if (target == word) continue;
672 label = 0;
673 }
674 l2 = target * layer1_size;
675 f = 0;
676
677 for (c = 0; c < layer1_size; c++) f += neu1[c] * syn1nce[c + l2];
678 if (f > MAX_EXP) g = (label - 1) * alpha;
679 else if (f < -MAX_EXP) g = (label - 0) * alpha;
680 else {
681 f = exp(f);
682 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
683 }
684 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
685 for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * neu1[c];
686 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce,c + l2);
687 }
688 // hidden -> in
689 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
690 c = sentence_position - window + a;
691 if (c < 0) continue;
692 if (c >= sentence_length) continue;
693 last_word = sen[c];
694 if (last_word == -1) continue;
695 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c];
696 }
697 }
698 } else if(type==1) { //train skip-gram
699 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
700 c = sentence_position - window + a;
701 if (c < 0) continue;
702 if (c >= sentence_length) continue;
703 last_word = sen[c];
704 if (last_word == -1) continue;
705 l1 = last_word * layer1_size;
706 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
707 // HIERARCHICAL SOFTMAX
708 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
709 f = 0;
710 l2 = vocab[word].point[d] * layer1_size;
711 // Propagate hidden -> output
712 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1[c + l2];
713 if (f <= -MAX_EXP) continue;
714 else if (f >= MAX_EXP) continue;
715 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
716 // 'g' is the gradient multiplied by the learning rate
717 g = (1 - vocab[word].code[d] - f) * alpha;
718 // Propagate errors output -> hidden
719 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1[c + l2];
720 // Learn weights hidden -> output
721 for (c = 0; c < layer1_size; c++) syn1[c + l2] += g * syn0[c + l1];
722 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1, c + l2);
723 }
724 // NEGATIVE SAMPLING
725 if (negative > 0) for (d = 0; d < negative + 1; d++) {
726 if (d == 0) {
727 target = word;
728 label = 1;
729 } else {
730 next_random = next_random * (unsigned long long)25214903917 + 11;
731 if(word_to_group != NULL && word_to_group[word] != -1){
732 target = word;
733 while(target == word) {
734 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
735 next_random = next_random * (unsigned long long)25214903917 + 11;
736 }
737 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
738 }
739 else{
740 target = table[(next_random >> 16) % table_size];
741 }
742 if (target == 0) target = next_random % (vocab_size - 1) + 1;
743 if (target == word) continue;
744 label = 0;
745 }
746 l2 = target * layer1_size;
747 f = 0;
748 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg[c + l2];
749 if (f > MAX_EXP) g = (label - 1) * alpha;
750 else if (f < -MAX_EXP) g = (label - 0) * alpha;
751 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
752 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg[c + l2];
753 for (c = 0; c < layer1_size; c++) syn1neg[c + l2] += g * syn0[c + l1];
754 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg, c + l2);
755 }
756 //Noise Contrastive Estimation
757 if (nce > 0) for (d = 0; d < nce + 1; d++) {
758 if (d == 0) {
759 target = word;
760 label = 1;
761 } else {
762 next_random = next_random * (unsigned long long)25214903917 + 11;
763 if(word_to_group != NULL && word_to_group[word] != -1){
764 target = word;
765 while(target == word) {
766 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
767 next_random = next_random * (unsigned long long)25214903917 + 11;
768 }
769 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
770 }
771 else{
772 target = table[(next_random >> 16) % table_size];
773 }
774 if (target == 0) target = next_random % (vocab_size - 1) + 1;
775 if (target == word) continue;
776 label = 0;
777 }
778 l2 = target * layer1_size;
779 f = 0;
780 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce[c + l2];
781 if (f > MAX_EXP) g = (label - 1) * alpha;
782 else if (f < -MAX_EXP) g = (label - 0) * alpha;
783 else {
784 f = exp(f);
785 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
786 }
787 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce[c + l2];
788 for (c = 0; c < layer1_size; c++) syn1nce[c + l2] += g * syn0[c + l1];
789 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce, c + l2);
790 }
791 // Learn weights input -> hidden
792 for (c = 0; c < layer1_size; c++) syn0[c + l1] += neu1e[c];
793 }
794 }
795 else if(type == 2){ //train the cwindow architecture
796 // in -> hidden
797 cw = 0;
798 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
799 c = sentence_position - window + a;
800 if (c < 0) continue;
801 if (c >= sentence_length) continue;
802 last_word = sen[c];
803 if (last_word == -1) continue;
804 window_offset = a*layer1_size;
805 if (a > window) window_offset-=layer1_size;
806 for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
807 cw++;
808 }
809 if (cw) {
810 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
811 f = 0;
812 l2 = vocab[word].point[d] * window_layer_size;
813 // Propagate hidden -> output
814 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1_window[c + l2];
815 if (f <= -MAX_EXP) continue;
816 else if (f >= MAX_EXP) continue;
817 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
818 // 'g' is the gradient multiplied by the learning rate
819 g = (1 - vocab[word].code[d] - f) * alpha;
820 // Propagate errors output -> hidden
821 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1_window[c + l2];
822 // Learn weights hidden -> output
823 for (c = 0; c < window_layer_size; c++) syn1_window[c + l2] += g * neu1[c];
824 if (cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1_window, c + l2);
825 }
826 // NEGATIVE SAMPLING
827 if (negative > 0) for (d = 0; d < negative + 1; d++) {
828 if (d == 0) {
829 target = word;
830 label = 1;
831 } else {
832 next_random = next_random * (unsigned long long)25214903917 + 11;
833 if(word_to_group != NULL && word_to_group[word] != -1){
834 target = word;
835 while(target == word) {
836 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
837 next_random = next_random * (unsigned long long)25214903917 + 11;
838 }
839 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
840 }
841 else{
842 target = table[(next_random >> 16) % table_size];
843 }
844 if (target == 0) target = next_random % (vocab_size - 1) + 1;
845 if (target == word) continue;
846 label = 0;
847 }
848 l2 = target * window_layer_size;
849 f = 0;
850 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1neg_window[c + l2];
851 if (f > MAX_EXP) g = (label - 1) * alpha;
852 else if (f < -MAX_EXP) g = (label - 0) * alpha;
853 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
854 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1neg_window[c + l2];
855 for (c = 0; c < window_layer_size; c++) syn1neg_window[c + l2] += g * neu1[c];
856 if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1neg_window, c + l2);
857 }
858 // Noise Contrastive Estimation
859 if (nce > 0) for (d = 0; d < nce + 1; d++) {
860 if (d == 0) {
861 target = word;
862 label = 1;
863 } else {
864 next_random = next_random * (unsigned long long)25214903917 + 11;
865 if(word_to_group != NULL && word_to_group[word] != -1){
866 target = word;
867 while(target == word) {
868 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
869 next_random = next_random * (unsigned long long)25214903917 + 11;
870 }
871 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
872 }
873 else{
874 target = table[(next_random >> 16) % table_size];
875 }
876 if (target == 0) target = next_random % (vocab_size - 1) + 1;
877 if (target == word) continue;
878 label = 0;
879 }
880 l2 = target * window_layer_size;
881 f = 0;
882 for (c = 0; c < window_layer_size; c++) f += neu1[c] * syn1nce_window[c + l2];
883 if (f > MAX_EXP) g = (label - 1) * alpha;
884 else if (f < -MAX_EXP) g = (label - 0) * alpha;
885 else {
886 f = exp(f);
887 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
888 }
889 for (c = 0; c < window_layer_size; c++) neu1e[c] += g * syn1nce_window[c + l2];
890 for (c = 0; c < window_layer_size; c++) syn1nce_window[c + l2] += g * neu1[c];
891 if(cap == 1) for (c = 0; c < window_layer_size; c++) capParam(syn1nce_window, c + l2);
892 }
893 // hidden -> in
894 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
895 c = sentence_position - window + a;
896 if (c < 0) continue;
897 if (c >= sentence_length) continue;
898 last_word = sen[c];
899 if (last_word == -1) continue;
900 window_offset = a * layer1_size;
901 if(a > window) window_offset -= layer1_size;
902 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
903 }
904 }
905 }
906 else if (type == 3){ //train structured skip-gram
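// In the structured variant every relative context position owns its own layer1_size
// block inside the window_layer_size output vectors; window_offset selects that block
// (positions after the target are shifted down by one because a == window is skipped).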
907 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
908 c = sentence_position - window + a;
909 if (c < 0) continue;
910 if (c >= sentence_length) continue;
911 last_word = sen[c];
912 if (last_word == -1) continue;
913 l1 = last_word * layer1_size;
914 window_offset = a * layer1_size;
915 if(a > window) window_offset -= layer1_size;
916 for (c = 0; c < layer1_size; c++) neu1e[c] = 0;
917 // HIERARCHICAL SOFTMAX
918 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
919 f = 0;
920 l2 = vocab[word].point[d] * window_layer_size;
921 // Propagate hidden -> output
922 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1_window[c + l2 + window_offset];
923 if (f <= -MAX_EXP) continue;
924 else if (f >= MAX_EXP) continue;
925 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
926 // 'g' is the gradient multiplied by the learning rate
927 g = (1 - vocab[word].code[d] - f) * alpha;
928 // Propagate errors output -> hidden
929 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1_window[c + l2 + window_offset];
930 // Learn weights hidden -> output
931 for (c = 0; c < layer1_size; c++) syn1_window[c + l2 + window_offset] += g * syn0[c + l1];
932 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1_window, c + l2 + window_offset);
933 }
934 // NEGATIVE SAMPLING
935 if (negative > 0) for (d = 0; d < negative + 1; d++) {
936 if (d == 0) {
937 target = word;
938 label = 1;
939 } else {
940 next_random = next_random * (unsigned long long)25214903917 + 11;
941 if(word_to_group != NULL && word_to_group[word] != -1){
942 target = word;
943 while(target == word) {
944 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
945 next_random = next_random * (unsigned long long)25214903917 + 11;
946 }
947 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
948 }
949 else{
950 target = table[(next_random >> 16) % table_size];
951 }
952 if (target == 0) target = next_random % (vocab_size - 1) + 1;
953 if (target == word) continue;
954 label = 0;
955 }
956 l2 = target * window_layer_size;
957 f = 0;
958 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1neg_window[c + l2 + window_offset];
959 if (f > MAX_EXP) g = (label - 1) * alpha;
960 else if (f < -MAX_EXP) g = (label - 0) * alpha;
961 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha;
962 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1neg_window[c + l2 + window_offset];
963 for (c = 0; c < layer1_size; c++) syn1neg_window[c + l2 + window_offset] += g * syn0[c + l1];
964 if(cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1neg_window, c + l2 + window_offset);
965 }
966 // Noise Contrastive Estimation
967 if (nce > 0) for (d = 0; d < nce + 1; d++) {
968 if (d == 0) {
969 target = word;
970 label = 1;
971 } else {
972 next_random = next_random * (unsigned long long)25214903917 + 11;
973 if(word_to_group != NULL && word_to_group[word] != -1){
974 target = word;
975 while(target == word) {
976 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
977 next_random = next_random * (unsigned long long)25214903917 + 11;
978 }
979 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
980 }
981 else{
982 target = table[(next_random >> 16) % table_size];
983 }
984 if (target == 0) target = next_random % (vocab_size - 1) + 1;
985 if (target == word) continue;
986 label = 0;
987 }
988 l2 = target * window_layer_size;
989 f = 0;
990 for (c = 0; c < layer1_size; c++) f += syn0[c + l1] * syn1nce_window[c + l2 + window_offset];
991 if (f > MAX_EXP) g = (label - 1) * alpha;
992 else if (f < -MAX_EXP) g = (label - 0) * alpha;
993 else {
994 f = exp(f);
995 g = (label - f/(noise_distribution[target]*nce + f)) * alpha;
996 }
997 for (c = 0; c < layer1_size; c++) neu1e[c] += g * syn1nce_window[c + l2 + window_offset];
998 for (c = 0; c < layer1_size; c++) syn1nce_window[c + l2 + window_offset] += g * syn0[c + l1];
999 if (cap == 1) for (c = 0; c < layer1_size; c++) capParam(syn1nce_window, c + l2 + window_offset);
1000 }
1001 // Learn weights input -> hidden
1002 for (c = 0; c < layer1_size; c++) {syn0[c + l1] += neu1e[c]; if(syn0[c + l1] > 50) syn0[c + l1] = 50; if(syn0[c + l1] < -50) syn0[c + l1] = -50;}
1003 }
1004 }
1005 else if(type == 4){ //training senna
1006 // in -> hidden
1007 cw = 0;
1008 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1009 c = sentence_position - window + a;
1010 if (c < 0) continue;
1011 if (c >= sentence_length) continue;
1012 last_word = sen[c];
1013 if (last_word == -1) continue;
1014 window_offset = a*layer1_size;
1015 if (a > window) window_offset-=layer1_size;
1016 for (c = 0; c < layer1_size; c++) neu1[c+window_offset] += syn0[c + last_word * layer1_size];
1017 cw++;
1018 }
1019 if (cw) {
1020 for (a = 0; a < window_hidden_size; a++){
1021 c = a*window_layer_size;
1022 for(b = 0; b < window_layer_size; b++){
1023 neu2[a] += syn_window_hidden[c + b] * neu1[b];
1024 }
1025 }
1026 if (hs) for (d = 0; d < vocab[word].codelen; d++) {
1027 f = 0;
1028 l2 = vocab[word].point[d] * window_hidden_size;
1029 // Propagate hidden -> output
1030 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
1031 if (f <= -MAX_EXP) continue;
1032 else if (f >= MAX_EXP) continue;
1033 else f = expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
1034 // 'g' is the gradient multiplied by the learning rate
1035 g = (1 - vocab[word].code[d] - f) * alpha;
1036 // Propagate errors output -> hidden
1037 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word[c + l2];
1038 // Learn weights hidden -> output
1039 for (c = 0; c < window_hidden_size; c++) syn_hidden_word[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1040 }
1041 // NEGATIVE SAMPLING
1042 if (negative > 0) for (d = 0; d < negative + 1; d++) {
1043 if (d == 0) {
1044 target = word;
1045 label = 1;
1046 } else {
1047 next_random = next_random * (unsigned long long)25214903917 + 11;
1048 if(word_to_group != NULL && word_to_group[word] != -1){
1049 target = word;
1050 while(target == word) {
1051 target = group_to_table[word_to_group[word]*table_size + (next_random >> 16) % table_size];
1052 next_random = next_random * (unsigned long long)25214903917 + 11;
1053 }
1054 //printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
1055 }
1056 else{
1057 target = table[(next_random >> 16) % table_size];
1058 }
1059 if (target == 0) target = next_random % (vocab_size - 1) + 1;
1060 if (target == word) continue;
1061 label = 0;
1062 }
1063 l2 = target * window_hidden_size;
1064 f = 0;
1065 for (c = 0; c < window_hidden_size; c++) f += hardTanh(neu2[c]) * syn_hidden_word_neg[c + l2];
1066 if (f > MAX_EXP) g = (label - 1) * alpha / negative;
1067 else if (f < -MAX_EXP) g = (label - 0) * alpha / negative;
1068 else g = (label - expTable[(int)((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))]) * alpha / negative;
1069 for (c = 0; c < window_hidden_size; c++) neu2e[c] += dHardTanh(neu2[c],g) * g * syn_hidden_word_neg[c + l2];
1070 for (c = 0; c < window_hidden_size; c++) syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c],g) * g * neu2[c];
1071 }
1072 for (a = 0; a < window_hidden_size; a++)
1073 for(b = 0; b < window_layer_size; b++)
1074 neu1e[b] += neu2e[a] * syn_window_hidden[a*window_layer_size + b];
1075 for (a = 0; a < window_hidden_size; a++)
1076 for(b = 0; b < window_layer_size; b++)
1077 syn_window_hidden[a*window_layer_size + b] += neu2e[a] * neu1[b];
1078 // hidden -> in
1079 for (a = 0; a < window * 2 + 1; a++) if (a != window) {
1080 c = sentence_position - window + a;
1081 if (c < 0) continue;
1082 if (c >= sentence_length) continue;
1083 last_word = sen[c];
1084 if (last_word == -1) continue;
1085 window_offset = a * layer1_size;
1086 if(a > window) window_offset -= layer1_size;
1087 for (c = 0; c < layer1_size; c++) syn0[c + last_word * layer1_size] += neu1e[c + window_offset];
1088 }
1089 }
1090 } else if(type == 5) {
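// type 5: no training; for every context word, record its co-occurrence with the
// target word at the given relative position. In this version the pair is only
// printed; the collocator database handle cdb is opened in main() when -type 5 is set.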
1091 for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
1092 c = sentence_position - window + a;
1093 if (c < 0) continue;
1094 if (c >= sentence_length) continue;
1095 last_word = sen[c];
1096 if (last_word == -1) continue;
1097 printf("storing %s %s - %d\n", vocab[word].word, vocab[last_word].word, a - window);
1098 cw++;
1099 }
1100 }
1101 else{
1102 printf("unknown type %i", type);
1103 exit(0);
1104 }
1105 sentence_position++;
1106 if (sentence_position >= sentence_length) {
1107 sentence_length = 0;
1108 continue;
1109 }
1110 }
1111 fclose(fi);
1112 free(neu1);
1113 free(neu1e);
1114 pthread_exit(NULL);
1115}
1116
1117void TrainModel() {
1118 long a, b, c, d;
1119 FILE *fo;
1120 pthread_t *pt = (pthread_t *)malloc(num_threads * sizeof(pthread_t));
1121 printf("Starting training using file %s\n", train_file);
1122 starting_alpha = alpha;
1123 if (read_vocab_file[0] != 0) ReadVocab(); else LearnVocabFromTrainFile();
1124 if (save_vocab_file[0] != 0) SaveVocab();
1125 if (output_file[0] == 0) return;
1126 InitNet();
1127 if (negative > 0 || nce > 0) InitUnigramTable();
1128 if (negative_classes_file[0] != 0) InitClassUnigramTable();
1129 start = clock();
1130 for (a = 0; a < num_threads; a++) pthread_create(&pt[a], NULL, TrainModelThread, (void *)a);
1131 for (a = 0; a < num_threads; a++) pthread_join(pt[a], NULL);
1132 fo = fopen(output_file, "wb");
1133 if (classes == 0) {
1134 // Save the word vectors
1135 fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
1136 for (a = 0; a < vocab_size; a++) {
1137 fprintf(fo, "%s ", vocab[a].word);
1138 if (binary) for (b = 0; b < layer1_size; b++) fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
1139 else for (b = 0; b < layer1_size; b++) fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
1140 fprintf(fo, "\n");
1141 }
1142 } else {
1143 // Run K-means on the word vectors
1144 int clcn = classes, iter = 10, closeid;
1145 int *centcn = (int *)malloc(classes * sizeof(int));
1146 int *cl = (int *)calloc(vocab_size, sizeof(int));
1147 real closev, x;
1148 real *cent = (real *)calloc(classes * layer1_size, sizeof(real));
1149 for (a = 0; a < vocab_size; a++) cl[a] = a % clcn;
1150 for (a = 0; a < iter; a++) {
1151 for (b = 0; b < clcn * layer1_size; b++) cent[b] = 0;
1152 for (b = 0; b < clcn; b++) centcn[b] = 1;
1153 for (c = 0; c < vocab_size; c++) {
1154 for (d = 0; d < layer1_size; d++) cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
1155 centcn[cl[c]]++;
1156 }
1157 for (b = 0; b < clcn; b++) {
1158 closev = 0;
1159 for (c = 0; c < layer1_size; c++) {
1160 cent[layer1_size * b + c] /= centcn[b];
1161 closev += cent[layer1_size * b + c] * cent[layer1_size * b + c];
1162 }
1163 closev = sqrt(closev);
1164 for (c = 0; c < layer1_size; c++) cent[layer1_size * b + c] /= closev;
1165 }
1166 for (c = 0; c < vocab_size; c++) {
1167 closev = -10;
1168 closeid = 0;
1169 for (d = 0; d < clcn; d++) {
1170 x = 0;
1171 for (b = 0; b < layer1_size; b++) x += cent[layer1_size * d + b] * syn0[c * layer1_size + b];
1172 if (x > closev) {
1173 closev = x;
1174 closeid = d;
1175 }
1176 }
1177 cl[c] = closeid;
1178 }
1179 }
1180 // Save the K-means classes
1181 for (a = 0; a < vocab_size; a++) fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
1182 free(centcn);
1183 free(cent);
1184 free(cl);
1185 }
1186 fclose(fo);
1187}
1188
1189int ArgPos(char *str, int argc, char **argv) {
1190 int a;
1191 for (a = 1; a < argc; a++) if (!strcmp(str, argv[a])) {
1192 if (a == argc - 1) {
1193 printf("Argument missing for %s\n", str);
1194 exit(1);
1195 }
1196 return a;
1197 }
1198 return -1;
1199}
1200
1201int main(int argc, char **argv) {
1202 int i;
1203 if (argc == 1) {
1204 printf("WORD VECTOR estimation toolkit v 0.1c\n\n");
1205 printf("Options:\n");
1206 printf("Parameters for training:\n");
1207 printf("\t-train <file>\n");
1208 printf("\t\tUse text data from <file> to train the model\n");
1209 printf("\t-output <file>\n");
1210 printf("\t\tUse <file> to save the resulting word vectors / word clusters\n");
1211 printf("\t-size <int>\n");
1212 printf("\t\tSet size of word vectors; default is 100\n");
1213 printf("\t-window <int>\n");
1214 printf("\t\tSet max skip length between words; default is 5\n");
1215 printf("\t-sample <float>\n");
1216 printf("\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n");
1217 printf("\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n");
1218 printf("\t-hs <int>\n");
1219 printf("\t\tUse Hierarchical Softmax; default is 0 (not used)\n");
1220 printf("\t-negative <int>\n");
1221 printf("\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n");
1222 printf("\t-negative-classes <file>\n");
1223 printf("\t\tNegative classes to sample from\n");
1224 printf("\t-nce <int>\n");
1225 printf("\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n");
1226 printf("\t-threads <int>\n");
1227 printf("\t\tUse <int> threads (default 12)\n");
1228 printf("\t-iter <int>\n");
1229 printf("\t\tRun more training iterations (default 5)\n");
1230 printf("\t-min-count <int>\n");
1231 printf("\t\tThis will discard words that appear less than <int> times; default is 5\n");
1232 printf("\t-alpha <float>\n");
1233 printf("\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n");
1234 printf("\t-classes <int>\n");
1235 printf("\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n");
1236 printf("\t-debug <int>\n");
1237 printf("\t\tSet the debug mode (default = 2 = more info during training)\n");
1238 printf("\t-binary <int>\n");
1239 printf("\t\tSave the resulting vectors in binary moded; default is 0 (off)\n");
1240 printf("\t-save-vocab <file>\n");
1241 printf("\t\tThe vocabulary will be saved to <file>\n");
1242 printf("\t-read-vocab <file>\n");
1243 printf("\t\tThe vocabulary will be read from <file>, not constructed from the training data\n");
1244 printf("\t-type <int>\n");
1245 printf("\t\tType of embeddings (0 for cbow, 1 for skip-gram, 2 for cwindow, 3 for structured skip-gram, 4 for senna type, 5 to store positional bigrams)\n");
1246 printf("\t-cap <int>\n");
1247 printf("\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n");
1248 printf("\nExamples:\n");
1249 printf("./word2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n");
1250 return 0;
1251 }
1252 output_file[0] = 0;
1253 save_vocab_file[0] = 0;
1254 read_vocab_file[0] = 0;
1255 negative_classes_file[0] = 0;
1256 if ((i = ArgPos((char *)"-size", argc, argv)) > 0) layer1_size = atoi(argv[i + 1]);
1257 if ((i = ArgPos((char *)"-train", argc, argv)) > 0) strcpy(train_file, argv[i + 1]);
1258 if ((i = ArgPos((char *)"-save-vocab", argc, argv)) > 0) strcpy(save_vocab_file, argv[i + 1]);
1259 if ((i = ArgPos((char *)"-read-vocab", argc, argv)) > 0) strcpy(read_vocab_file, argv[i + 1]);
1260 if ((i = ArgPos((char *)"-debug", argc, argv)) > 0) debug_mode = atoi(argv[i + 1]);
1261 if ((i = ArgPos((char *)"-binary", argc, argv)) > 0) binary = atoi(argv[i + 1]);
1262 if ((i = ArgPos((char *)"-type", argc, argv)) > 0) type = atoi(argv[i + 1]);
1263 if ((i = ArgPos((char *)"-output", argc, argv)) > 0) strcpy(output_file, argv[i + 1]);
1264 if ((i = ArgPos((char *)"-window", argc, argv)) > 0) window = atoi(argv[i + 1]);
1265 if ((i = ArgPos((char *)"-sample", argc, argv)) > 0) sample = atof(argv[i + 1]);
1266 if ((i = ArgPos((char *)"-hs", argc, argv)) > 0) hs = atoi(argv[i + 1]);
1267 if ((i = ArgPos((char *)"-negative", argc, argv)) > 0) negative = atoi(argv[i + 1]);
1268 if ((i = ArgPos((char *)"-negative-classes", argc, argv)) > 0) strcpy(negative_classes_file, argv[i + 1]);
1269 if ((i = ArgPos((char *)"-nce", argc, argv)) > 0) nce = atoi(argv[i + 1]);
1270 if ((i = ArgPos((char *)"-threads", argc, argv)) > 0) num_threads = atoi(argv[i + 1]);
1271 if ((i = ArgPos((char *)"-iter", argc, argv)) > 0) iter = atoi(argv[i + 1]);
1272 if ((i = ArgPos((char *)"-min-count", argc, argv)) > 0) min_count = atoi(argv[i + 1]);
1273 if ((i = ArgPos((char *)"-classes", argc, argv)) > 0) classes = atoi(argv[i + 1]);
1274 if ((i = ArgPos((char *)"-cap", argc, argv)) > 0) cap = atoi(argv[i + 1]);
1275 if (type==0 || type==2 || type==4) alpha = 0.05;
1276 if (type==5) {
1277 sample = 0;
1278 cdb = open_collocators(output_file);
1279 }
1280 if ((i = ArgPos((char *)"-alpha", argc, argv)) > 0) alpha = atof(argv[i + 1]);
1281 vocab = (struct vocab_word *)calloc(vocab_max_size, sizeof(struct vocab_word));
1282 vocab_hash = (int *)calloc(vocab_hash_size, sizeof(int));
1283 expTable = (real *)malloc((EXP_TABLE_SIZE + 1) * sizeof(real));
1284 for (i = 0; i < EXP_TABLE_SIZE; i++) {
1285 expTable[i] = exp((i / (real)EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
1286 expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
1287 }
1288 TrainModel();
1289 return 0;
1290}
1291