Blame - src/dereko2vec.c - ids-kl/dereko2vec

blob: 9578f1b533a20c99be326fe9f53c0b253099baec [file] [log] [blame]

Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1	// Copyright 2013 Google Inc. All Rights Reserved.
				2	//
				3	// Licensed under the Apache License, Version 2.0 (the "License");
				4	// you may not use this file except in compliance with the License.
				5	// You may obtain a copy of the License at
				6	//
				7	// http://www.apache.org/licenses/LICENSE-2.0
				8	//
				9	// Unless required by applicable law or agreed to in writing, software
				10	// distributed under the License is distributed on an "AS IS" BASIS,
				11	// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
				12	// See the License for the specific language governing permissions and
				13	// limitations under the License.
				14
Marc Kupietz	e23c540	2016-07-14 11:10:09 +0200	[diff] [blame]	15	#include <locale.h>
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	16	#include <stdio.h>
				17	#include <stdlib.h>
				18	#include <string.h>
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	19	#include <unistd.h>
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	20	#include <math.h>
				21	#include <pthread.h>
Marc Kupietz	613edbf	2018-01-11 21:38:03 +0100	[diff] [blame]	22	#include <collocatordb.h>
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	23
				24	#define MAX_STRING 100
				25	#define EXP_TABLE_SIZE 1000
				26	#define MAX_EXP 6
				27	#define MAX_SENTENCE_LENGTH 1000
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	28	#define MAX_CC 100
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	29	#define MAX_CODE_LENGTH 40
Marc Kupietz	178a3c9	2023-12-22 15:12:27 +0100	[diff] [blame]	30	#define MAX_METADATA_CATEGORIES 4
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	31
Marc Kupietz	178a3c9	2023-12-22 15:12:27 +0100	[diff] [blame]	32	#define METADATA_MARKER ' '
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	33	const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
				34
				35	typedef float real; // Precision of float numbers
				36
				37	struct vocab_word {
				38	long long cn;
				39	int *point;
				40	char word, code, codelen;
				41	};
				42
				43	char train_file[MAX_STRING], output_file[MAX_STRING];
				44	char save_vocab_file[MAX_STRING], read_vocab_file[MAX_STRING];
				45	char save_net_file[MAX_STRING], read_net_file[MAX_STRING];
Marc Kupietz	e423f73	2017-12-22 17:57:03 +0100	[diff] [blame]	46	char magic_stop_file[MAX_STRING];
				47
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	48	struct vocab_word *vocab;
				49	int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
Marc Kupietz	879333c	2023-12-20 11:41:09 +0100	[diff] [blame]	50	num_threads = 12, min_reduce = 1, metadata_categories = 0, expected_metadata_categories = 0;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	51	int *vocab_hash;
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	52	long long *threadPos;
				53	int *threadIters;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	54	long long vocab_max_size = 1000, vocab_size = 0, layer1_size = 100;
				55	long long train_words = 0, word_count_actual = 0, iter = 5, file_size = 0,
				56	classes = 0;
				57	real alpha = 0.025, starting_alpha, sample = 1e-3;
				58	real syn0, syn1, syn1neg, syn1nce, *expTable;
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	59	real avgWordLength=0;
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame]	60	clock_t start, start_clock;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	61
				62	real syn1_window, syn1neg_window, *syn1nce_window;
				63	int w_offset, window_layer_size;
				64
				65	int window_hidden_size = 500;
				66	real syn_window_hidden, syn_hidden_word, *syn_hidden_word_neg,
				67	*syn_hidden_word_nce;
				68
				69	int hs = 0, negative = 5;
				70	const int table_size = 1e8;
				71	int *table;
				72
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	73	long cc = 0;
				74
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	75	//constrastive negative sampling
				76	char negative_classes_file[MAX_STRING];
				77	int *word_to_group;
				78	int group_to_table; //group_sizetable_size
				79	int class_number;
				80
				81	//nce
				82	real* noise_distribution;
				83	int nce = 0;
				84
				85	//param caps
				86	real CAP_VALUE = 50;
				87	int cap = 0;
				88
Marc Kupietz	613edbf	2018-01-11 21:38:03 +0100	[diff] [blame]	89	COLLOCATORDB *cdb = NULL;
				90
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	91	void capParam(real* array, int index) {
				92	if (array[index] > CAP_VALUE)
				93	array[index] = CAP_VALUE;
				94	else if (array[index] < -CAP_VALUE)
				95	array[index] = -CAP_VALUE;
				96	}
				97
				98	real hardTanh(real x) {
				99	if (x >= 1) {
				100	return 1;
				101	} else if (x <= -1) {
				102	return -1;
				103	} else {
				104	return x;
				105	}
				106	}
				107
				108	real dHardTanh(real x, real g) {
				109	if (x > 1 && g > 0) {
				110	return 0;
				111	}
				112	if (x < -1 && g < 0) {
				113	return 0;
				114	}
				115	return 1;
				116	}
				117
				118	void InitUnigramTable() {
				119	int a, i;
				120	long long train_words_pow = 0;
				121	real d1, power = 0.75;
				122	table = (int ) malloc(table_size sizeof(int));
				123	for (a = 0; a < vocab_size; a++)
				124	train_words_pow += pow(vocab[a].cn, power);
				125	i = 0;
				126	d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
				127	for (a = 0; a < table_size; a++) {
				128	table[a] = i;
				129	if (a / (real) table_size > d1) {
				130	i++;
				131	d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
				132	}
				133	if (i >= vocab_size)
				134	i = vocab_size - 1;
				135	}
				136
				137	noise_distribution = (real *) calloc(vocab_size, sizeof(real));
				138	for (a = 0; a < vocab_size; a++)
				139	noise_distribution[a] = pow(vocab[a].cn, power)
				140	/ (real) train_words_pow;
				141	}
				142
				143	// Reads a single word from a file, assuming space + tab + EOL to be word boundaries
				144	void ReadWord(char word, FILE fin) {
				145	int a = 0, ch;
				146	while (!feof(fin)) {
				147	ch = fgetc(fin);
				148	if (ch == 13)
				149	continue;
				150	if ((ch == ' ') \|\| (ch == '\t') \|\| (ch == '\n')) {
Marc Kupietz	879333c	2023-12-20 11:41:09 +0100	[diff] [blame]	151	if (ch == '\t' && expected_metadata_categories > 0) {
Marc Kupietz	178a3c9	2023-12-22 15:12:27 +0100	[diff] [blame]	152	word[a] = 0;
				153	a = 0;
				154	expected_metadata_categories--;
Marc Kupietz	c564b1f	2023-12-22 15:38:29 +0100	[diff] [blame^]	155	if (debug_mode > 3)
Marc Kupietz	178a3c9	2023-12-22 15:12:27 +0100	[diff] [blame]	156	printf("Metadata: %s\n", word);
				157	strcpy(word + 1, word);
				158	*word = METADATA_MARKER;
				159	return;
Marc Kupietz	879333c	2023-12-20 11:41:09 +0100	[diff] [blame]	160	} else {
				161	if (a > 0) {
				162	if (ch == '\n') {
				163	expected_metadata_categories = metadata_categories;
				164	ungetc(ch, fin);
				165	}
				166	break;
				167	}
				168	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	169	if (ch == '\n') {
				170	strcpy(word, (char *) "</s>");
Marc Kupietz	879333c	2023-12-20 11:41:09 +0100	[diff] [blame]	171	expected_metadata_categories = metadata_categories;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	172	return;
				173	} else
				174	continue;
				175	}
				176	word[a] = ch;
				177	a++;
				178	if (a >= MAX_STRING - 1)
				179	a--; // Truncate too long words
				180	}
				181	word[a] = 0;
				182	}
				183
				184	// Returns hash value of a word
				185	int GetWordHash(char *word) {
				186	unsigned long long a, hash = 0;
				187	for (a = 0; a < strlen(word); a++)
				188	hash = hash * 257 + word[a];
				189	hash = hash % vocab_hash_size;
				190	return hash;
				191	}
				192
				193	// Returns position of a word in the vocabulary; if the word is not found, returns -1
				194	int SearchVocab(char *word) {
				195	unsigned int hash = GetWordHash(word);
				196	while (1) {
				197	if (vocab_hash[hash] == -1)
				198	return -1;
				199	if (!strcmp(word, vocab[vocab_hash[hash]].word))
				200	return vocab_hash[hash];
				201	hash = (hash + 1) % vocab_hash_size;
				202	}
				203	return -1;
				204	}
				205
				206	// Reads a word and returns its index in the vocabulary
Marc Kupietz	c564b1f	2023-12-22 15:38:29 +0100	[diff] [blame^]	207	int ReadWordIndex(FILE fin, int is_metadata) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	208	char word[MAX_STRING];
				209	ReadWord(word, fin);
				210	if (feof(fin))
				211	return -1;
Marc Kupietz	c564b1f	2023-12-22 15:38:29 +0100	[diff] [blame^]	212	if (word[0] == METADATA_MARKER) {
				213	*is_metadata = 1;
				214	} else {
				215	*is_metadata = 0;
				216	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	217	return SearchVocab(word);
				218	}
				219
				220	// Adds a word to the vocabulary
				221	int AddWordToVocab(char *word) {
				222	unsigned int hash, length = strlen(word) + 1;
				223	if (length > MAX_STRING)
				224	length = MAX_STRING;
				225	vocab[vocab_size].word = (char *) calloc(length, sizeof(char));
				226	strcpy(vocab[vocab_size].word, word);
				227	vocab[vocab_size].cn = 0;
				228	vocab_size++;
				229	// Reallocate memory if needed
				230	if (vocab_size + 2 >= vocab_max_size) {
				231	vocab_max_size += 1000;
				232	vocab = (struct vocab_word *) realloc(vocab,
				233	vocab_max_size * sizeof(struct vocab_word));
				234	}
				235	hash = GetWordHash(word);
				236	while (vocab_hash[hash] != -1)
				237	hash = (hash + 1) % vocab_hash_size;
				238	vocab_hash[hash] = vocab_size - 1;
				239	return vocab_size - 1;
				240	}
				241
				242	// Used later for sorting by word counts
				243	int VocabCompare(const void a, const void b) {
				244	return ((struct vocab_word ) b)->cn - ((struct vocab_word ) a)->cn;
				245	}
				246
				247	// Sorts the vocabulary by frequency using word counts
				248	void SortVocab() {
				249	int a, size;
				250	unsigned int hash;
				251	// Sort the vocabulary and keep </s> at the first position
				252	qsort(&vocab[1], vocab_size - 1, sizeof(struct vocab_word), VocabCompare);
				253	for (a = 0; a < vocab_hash_size; a++)
				254	vocab_hash[a] = -1;
				255	size = vocab_size;
				256	train_words = 0;
				257	for (a = 0; a < size; a++) {
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	258	avgWordLength += vocab[a].cn * (strlen(vocab[a].word) + 1);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	259	// Words occuring less than min_count times will be discarded from the vocab
				260	if ((vocab[a].cn < min_count) && (a != 0)) {
				261	vocab_size--;
				262	free(vocab[a].word);
				263	} else {
				264	// Hash will be re-computed, as after the sorting it is not actual
				265	hash = GetWordHash(vocab[a].word);
				266	while (vocab_hash[hash] != -1)
				267	hash = (hash + 1) % vocab_hash_size;
				268	vocab_hash[hash] = a;
				269	train_words += vocab[a].cn;
				270	}
				271	}
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	272	avgWordLength /= train_words;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	273	vocab = (struct vocab_word *) realloc(vocab,
				274	(vocab_size + 1) * sizeof(struct vocab_word));
				275	// Allocate memory for the binary tree construction
				276	for (a = 0; a < vocab_size; a++) {
				277	vocab[a].code = (char *) calloc(MAX_CODE_LENGTH, sizeof(char));
				278	vocab[a].point = (int *) calloc(MAX_CODE_LENGTH, sizeof(int));
				279	}
				280	}
				281
				282	// Reduces the vocabulary by removing infrequent tokens
				283	void ReduceVocab() {
				284	int a, b = 0;
				285	unsigned int hash;
				286	for (a = 0; a < vocab_size; a++)
				287	if (vocab[a].cn > min_reduce) {
				288	vocab[b].cn = vocab[a].cn;
				289	vocab[b].word = vocab[a].word;
				290	b++;
				291	} else
				292	free(vocab[a].word);
				293	vocab_size = b;
				294	for (a = 0; a < vocab_hash_size; a++)
				295	vocab_hash[a] = -1;
				296	for (a = 0; a < vocab_size; a++) {
				297	// Hash will be re-computed, as it is not actual
				298	hash = GetWordHash(vocab[a].word);
				299	while (vocab_hash[hash] != -1)
				300	hash = (hash + 1) % vocab_hash_size;
				301	vocab_hash[hash] = a;
				302	}
				303	fflush(stdout);
				304	min_reduce++;
				305	}
				306
				307	// Create binary Huffman tree using the word counts
				308	// Frequent words will have short uniqe binary codes
				309	void CreateBinaryTree() {
				310	long long a, b, i, min1i, min2i, pos1, pos2, point[MAX_CODE_LENGTH];
				311	char code[MAX_CODE_LENGTH];
				312	long long count = (long long ) calloc(vocab_size * 2 + 1,
				313	sizeof(long long));
				314	long long binary = (long long ) calloc(vocab_size * 2 + 1,
				315	sizeof(long long));
				316	long long parent_node = (long long ) calloc(vocab_size * 2 + 1,
				317	sizeof(long long));
				318	for (a = 0; a < vocab_size; a++)
				319	count[a] = vocab[a].cn;
				320	for (a = vocab_size; a < vocab_size * 2; a++)
				321	count[a] = 1e15;
				322	pos1 = vocab_size - 1;
				323	pos2 = vocab_size;
				324	// Following algorithm constructs the Huffman tree by adding one node at a time
				325	for (a = 0; a < vocab_size - 1; a++) {
				326	// First, find two smallest nodes 'min1, min2'
				327	if (pos1 >= 0) {
				328	if (count[pos1] < count[pos2]) {
				329	min1i = pos1;
				330	pos1--;
				331	} else {
				332	min1i = pos2;
				333	pos2++;
				334	}
				335	} else {
				336	min1i = pos2;
				337	pos2++;
				338	}
				339	if (pos1 >= 0) {
				340	if (count[pos1] < count[pos2]) {
				341	min2i = pos1;
				342	pos1--;
				343	} else {
				344	min2i = pos2;
				345	pos2++;
				346	}
				347	} else {
				348	min2i = pos2;
				349	pos2++;
				350	}
				351	count[vocab_size + a] = count[min1i] + count[min2i];
				352	parent_node[min1i] = vocab_size + a;
				353	parent_node[min2i] = vocab_size + a;
				354	binary[min2i] = 1;
				355	}
				356	// Now assign binary code to each vocabulary word
				357	for (a = 0; a < vocab_size; a++) {
				358	b = a;
				359	i = 0;
				360	while (1) {
				361	code[i] = binary[b];
				362	point[i] = b;
				363	i++;
				364	b = parent_node[b];
				365	if (b == vocab_size * 2 - 2)
				366	break;
				367	}
				368	vocab[a].codelen = i;
				369	vocab[a].point[0] = vocab_size - 2;
				370	for (b = 0; b < i; b++) {
				371	vocab[a].code[i - b - 1] = code[b];
				372	vocab[a].point[i - b] = point[b] - vocab_size;
				373	}
				374	}
				375	free(count);
				376	free(binary);
				377	free(parent_node);
				378	}
				379
				380	void LearnVocabFromTrainFile() {
				381	char word[MAX_STRING];
				382	FILE *fin;
				383	long long a, i;
				384	for (a = 0; a < vocab_hash_size; a++)
				385	vocab_hash[a] = -1;
				386	fin = fopen(train_file, "rb");
				387	if (fin == NULL) {
				388	printf("ERROR: training data file not found!\n");
				389	exit(1);
				390	}
				391	vocab_size = 0;
				392	AddWordToVocab((char *) "</s>");
Marc Kupietz	879333c	2023-12-20 11:41:09 +0100	[diff] [blame]	393	for (int j=0; j < metadata_categories; j++) {
				394	ReadWord(word, fin);
				395	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	396	while (1) {
				397	ReadWord(word, fin);
				398	if (feof(fin))
				399	break;
				400	train_words++;
				401	if ((debug_mode > 1) && (train_words % 100000 == 0)) {
				402	printf("%lldK%c", train_words / 1000, 13);
				403	fflush(stdout);
				404	}
				405	i = SearchVocab(word);
				406	if (i == -1) {
				407	a = AddWordToVocab(word);
				408	vocab[a].cn = 1;
				409	} else
				410	vocab[i].cn++;
				411	if (vocab_size > vocab_hash_size * 0.7)
				412	ReduceVocab();
				413	}
				414	SortVocab();
				415	if (debug_mode > 0) {
Marc Kupietz	e23c540	2016-07-14 11:10:09 +0200	[diff] [blame]	416	printf("Vocab size: %'lld\n", vocab_size);
				417	printf("Words in train file: %'lld\n", train_words);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	418	}
				419	file_size = ftell(fin);
				420	fclose(fin);
				421	}
				422
				423	void SaveVocab() {
				424	long long i;
				425	FILE *fo = fopen(save_vocab_file, "wb");
				426	for (i = 0; i < vocab_size; i++)
				427	fprintf(fo, "%s %lld\n", vocab[i].word, vocab[i].cn);
				428	fclose(fo);
				429	}
				430
				431	void ReadVocab() {
				432	long long a, i = 0;
				433	char c;
				434	char word[MAX_STRING];
				435	FILE *fin = fopen(read_vocab_file, "rb");
				436	if (fin == NULL) {
				437	printf("Vocabulary file not found\n");
				438	exit(1);
				439	}
				440	for (a = 0; a < vocab_hash_size; a++)
				441	vocab_hash[a] = -1;
				442	vocab_size = 0;
				443	while (1) {
				444	ReadWord(word, fin);
				445	if (feof(fin))
				446	break;
				447	a = AddWordToVocab(word);
				448	fscanf(fin, "%lld%c", &vocab[a].cn, &c);
				449	i++;
				450	}
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	451	fclose(fin);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	452	fin = fopen(train_file, "rb");
				453	if (fin == NULL) {
				454	printf("ERROR: training data file not found!\n");
				455	exit(1);
				456	}
				457	fseek(fin, 0, SEEK_END);
				458	file_size = ftell(fin);
				459	fclose(fin);
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	460	SortVocab();
				461	if (debug_mode > 0) {
Marc Kupietz	e23c540	2016-07-14 11:10:09 +0200	[diff] [blame]	462	printf("Vocab size: %'lld\n", vocab_size);
				463	printf("Words in vocab's train file: %'lld\n", train_words);
				464	printf("Avg. word length in vocab's train file: %.2f\n", avgWordLength);
Marc Kupietz	c2731b2	2016-07-14 08:56:14 +0200	[diff] [blame]	465	}
Marc Kupietz	e23c540	2016-07-14 11:10:09 +0200	[diff] [blame]	466	train_words = file_size / avgWordLength;
				467	if(debug_mode > 0)
				468	printf("Estimated words in train file: %'lld\n", train_words);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	469	}
				470
				471	void InitClassUnigramTable() {
				472	long long a, c;
				473	printf("loading class unigrams \n");
				474	FILE *fin = fopen(negative_classes_file, "rb");
				475	if (fin == NULL) {
				476	printf("ERROR: class file not found!\n");
				477	exit(1);
				478	}
				479	word_to_group = (int ) malloc(vocab_size sizeof(int));
				480	for (a = 0; a < vocab_size; a++)
				481	word_to_group[a] = -1;
				482	char class[MAX_STRING];
				483	char prev_class[MAX_STRING];
				484	prev_class[0] = 0;
				485	char word[MAX_STRING];
				486	class_number = -1;
				487	while (1) {
				488	if (feof(fin))
				489	break;
				490	ReadWord(class, fin);
				491	ReadWord(word, fin);
				492	int word_index = SearchVocab(word);
				493	if (word_index != -1) {
				494	if (strcmp(class, prev_class) != 0) {
				495	class_number++;
				496	strcpy(prev_class, class);
				497	}
				498	word_to_group[word_index] = class_number;
				499	}
				500	ReadWord(word, fin);
				501	}
				502	class_number++;
				503	fclose(fin);
				504
				505	group_to_table = (int ) malloc(table_size class_number * sizeof(int));
				506	long long train_words_pow = 0;
				507	real d1, power = 0.75;
				508
				509	for (c = 0; c < class_number; c++) {
				510	long long offset = c * table_size;
				511	train_words_pow = 0;
				512	for (a = 0; a < vocab_size; a++)
				513	if (word_to_group[a] == c)
				514	train_words_pow += pow(vocab[a].cn, power);
				515	int i = 0;
				516	while (word_to_group[i] != c && i < vocab_size)
				517	i++;
				518	d1 = pow(vocab[i].cn, power) / (real) train_words_pow;
				519	for (a = 0; a < table_size; a++) {
				520	//printf("index %lld , word %d\n", a, i);
				521	group_to_table[offset + a] = i;
				522	if (a / (real) table_size > d1) {
				523	i++;
				524	while (word_to_group[i] != c && i < vocab_size)
				525	i++;
				526	d1 += pow(vocab[i].cn, power) / (real) train_words_pow;
				527	}
				528	if (i >= vocab_size)
				529	while (word_to_group[i] != c && i >= 0)
				530	i--;
				531	}
				532	}
				533	}
				534
Marc Kupietz	61485ad	2023-12-22 16:16:59 +0100	[diff] [blame]	535	void SaveArgs(unsigned int argc, char **argv) {
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	536	unsigned int i;
Marc Kupietz	4413674	2017-12-22 17:52:56 +0100	[diff] [blame]	537	char args_file[MAX_STRING];
				538	strcpy(args_file, output_file);
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	539	strcat(args_file, ".args");
				540	FILE *fargs = fopen(args_file, "w");
				541	if (fargs == NULL) {
				542	printf("Cannot save args to %s.\n", args_file);
				543	return;
				544	}
				545
Marc Kupietz	4413674	2017-12-22 17:52:56 +0100	[diff] [blame]	546	for(i=1; i<argc; i++)
				547	fprintf(fargs, "%s ", argv[i]);
				548
				549	fprintf(fargs, "\n");
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	550	fclose(fargs);
Marc Kupietz	4413674	2017-12-22 17:52:56 +0100	[diff] [blame]	551
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	552	return;
				553	}
				554
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	555	void SaveNet() {
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	556	if(type != 3 \|\| negative <= 0) {
				557	fprintf(stderr, "save-net only supported for type 3 with negative sampling\n");
				558	return;
				559	}
				560
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	561	FILE *fnet = fopen(save_net_file, "wb");
				562	if (fnet == NULL) {
				563	printf("Net parameter file not found\n");
				564	exit(1);
				565	}
Marc Kupietz	c697933	2016-03-16 15:29:07 +0100	[diff] [blame]	566	fwrite(syn0, sizeof(real), vocab_size * layer1_size, fnet);
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	567	fwrite(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	568	fclose(fnet);
				569	}
				570
				571	void InitNet() {
				572	long long a, b;
				573	unsigned long long next_random = 1;
Marc Kupietz	57c0df1	2016-03-18 12:48:00 +0100	[diff] [blame]	574	long long read;
				575
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	576	window_layer_size = layer1_size * window * 2;
				577	a = posix_memalign((void **) &syn0, 128,
				578	(long long) vocab_size * layer1_size * sizeof(real));
				579	if (syn0 == NULL) {
				580	printf("Memory allocation failed\n");
				581	exit(1);
				582	}
				583
				584	if (hs) {
				585	a = posix_memalign((void **) &syn1, 128,
				586	(long long) vocab_size * layer1_size * sizeof(real));
				587	if (syn1 == NULL) {
				588	printf("Memory allocation failed\n");
				589	exit(1);
				590	}
				591	a = posix_memalign((void **) &syn1_window, 128,
				592	(long long) vocab_size * window_layer_size * sizeof(real));
				593	if (syn1_window == NULL) {
				594	printf("Memory allocation failed\n");
				595	exit(1);
				596	}
				597	a = posix_memalign((void **) &syn_hidden_word, 128,
				598	(long long) vocab_size * window_hidden_size * sizeof(real));
				599	if (syn_hidden_word == NULL) {
				600	printf("Memory allocation failed\n");
				601	exit(1);
				602	}
				603
				604	for (a = 0; a < vocab_size; a++)
				605	for (b = 0; b < layer1_size; b++)
				606	syn1[a * layer1_size + b] = 0;
				607	for (a = 0; a < vocab_size; a++)
				608	for (b = 0; b < window_layer_size; b++)
				609	syn1_window[a * window_layer_size + b] = 0;
				610	for (a = 0; a < vocab_size; a++)
				611	for (b = 0; b < window_hidden_size; b++)
				612	syn_hidden_word[a * window_hidden_size + b] = 0;
				613	}
				614	if (negative > 0) {
Marc Kupietz	1006a27	2016-03-16 15:50:20 +0100	[diff] [blame]	615	if(type == 0) {
				616	a = posix_memalign((void **) &syn1neg, 128,
				617	(long long) vocab_size * layer1_size * sizeof(real));
				618	if (syn1neg == NULL) {
				619	printf("Memory allocation failed\n");
				620	exit(1);
				621	}
				622	for (a = 0; a < vocab_size; a++)
				623	for (b = 0; b < layer1_size; b++)
				624	syn1neg[a * layer1_size + b] = 0;
				625	} else if (type == 3) {
				626	a = posix_memalign((void **) &syn1neg_window, 128,
				627	(long long) vocab_size * window_layer_size * sizeof(real));
				628	if (syn1neg_window == NULL) {
				629	printf("Memory allocation failed\n");
				630	exit(1);
				631	}
				632	for (a = 0; a < vocab_size; a++)
				633	for (b = 0; b < window_layer_size; b++)
				634	syn1neg_window[a * window_layer_size + b] = 0;
				635	} else if (type == 4) {
				636	a = posix_memalign((void **) &syn_hidden_word_neg, 128,
				637	(long long) vocab_size * window_hidden_size * sizeof(real));
				638	if (syn_hidden_word_neg == NULL) {
				639	printf("Memory allocation failed\n");
				640	exit(1);
				641	}
				642	for (a = 0; a < vocab_size; a++)
				643	for (b = 0; b < window_hidden_size; b++)
				644	syn_hidden_word_neg[a * window_hidden_size + b] = 0;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	645	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	646	}
				647	if (nce > 0) {
				648	a = posix_memalign((void **) &syn1nce, 128,
				649	(long long) vocab_size * layer1_size * sizeof(real));
				650	if (syn1nce == NULL) {
				651	printf("Memory allocation failed\n");
				652	exit(1);
				653	}
				654	a = posix_memalign((void **) &syn1nce_window, 128,
				655	(long long) vocab_size * window_layer_size * sizeof(real));
				656	if (syn1nce_window == NULL) {
				657	printf("Memory allocation failed\n");
				658	exit(1);
				659	}
				660	a = posix_memalign((void **) &syn_hidden_word_nce, 128,
				661	(long long) vocab_size * window_hidden_size * sizeof(real));
				662	if (syn_hidden_word_nce == NULL) {
				663	printf("Memory allocation failed\n");
				664	exit(1);
				665	}
				666
				667	for (a = 0; a < vocab_size; a++)
				668	for (b = 0; b < layer1_size; b++)
				669	syn1nce[a * layer1_size + b] = 0;
				670	for (a = 0; a < vocab_size; a++)
				671	for (b = 0; b < window_layer_size; b++)
				672	syn1nce_window[a * window_layer_size + b] = 0;
				673	for (a = 0; a < vocab_size; a++)
				674	for (b = 0; b < window_hidden_size; b++)
				675	syn_hidden_word_nce[a * window_hidden_size + b] = 0;
				676	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	677
Marc Kupietz	1006a27	2016-03-16 15:50:20 +0100	[diff] [blame]	678	if(type == 4) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	679	a = posix_memalign((void **) &syn_window_hidden, 128,
				680	window_hidden_size * window_layer_size * sizeof(real));
				681	if (syn_window_hidden == NULL) {
				682	printf("Memory allocation failed\n");
				683	exit(1);
				684	}
				685	for (a = 0; a < window_hidden_size * window_layer_size; a++) {
				686	next_random = next_random * (unsigned long long) 25214903917 + 11;
				687	syn_window_hidden[a] = (((next_random & 0xFFFF) / (real) 65536)
				688	- 0.5) / (window_hidden_size * window_layer_size);
				689	}
				690	}
Marc Kupietz	1006a27	2016-03-16 15:50:20 +0100	[diff] [blame]	691
				692	if (read_net_file[0] == 0) {
				693	for (a = 0; a < vocab_size; a++)
				694	for (b = 0; b < layer1_size; b++) {
				695	next_random = next_random * (unsigned long long) 25214903917
				696	+ 11;
				697	syn0[a * layer1_size + b] = (((next_random & 0xFFFF)
				698	/ (real) 65536) - 0.5) / layer1_size;
				699	}
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	700	} else if(type == 3 && negative > 0) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	701	FILE *fnet = fopen(read_net_file, "rb");
				702	if (fnet == NULL) {
				703	printf("Net parameter file not found\n");
				704	exit(1);
				705	}
Marc Kupietz	57c0df1	2016-03-18 12:48:00 +0100	[diff] [blame]	706	printf("vocab-size: %lld, layer1_size: %lld, window_layer_size %d\n", vocab_size, layer1_size, window_layer_size);
				707	read = fread(syn0, sizeof(real), vocab_size * layer1_size, fnet);
				708	if(read != vocab_size * layer1_size) {
				709	fprintf(stderr, "read-net failed %lld\n", read);
				710	exit(-1);
				711	}
				712	read = fread(syn1neg_window, sizeof(real), vocab_size * window_layer_size, fnet);
				713	if(read != (long long) vocab_size * window_layer_size) {
				714	fprintf(stderr, "read-net failed, read %lld, expected: %lld\n", read ,
				715	(long long) sizeof(real) * vocab_size * window_layer_size);
				716	exit(-1);
				717	}
				718	fgetc(fnet);
				719	if(!feof(fnet)) {
				720	fprintf(stderr, "Remaining bytes in net-file after read-net. File position: %ld\n", ftell(fnet));
				721	exit(-1);
				722	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	723	fclose(fnet);
Marc Kupietz	313fcc5	2016-03-16 16:43:37 +0100	[diff] [blame]	724	} else {
				725	fprintf(stderr, "read-net only supported for type 3 with negative sampling\n");
				726	exit(-1);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	727	}
				728
				729	CreateBinaryTree();
				730	}
				731
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	732	char currentDateTime(char buf, real offset) {
				733	time_t t;
				734	time(&t);
				735	t += (long) offset;
				736	struct tm tstruct;
				737	tstruct = *localtime(&t);
				738	strftime(buf, 80, "%c", &tstruct);
				739	return buf;
				740	}
				741
				742	void MonitorThread(void id) {
				743	char *timebuf = malloc(80);;
				744	int i, n=num_threads;
				745	long long sum;
				746	sleep(1);
				747	while(n > 0) {
				748	sleep(1);
				749	sum = n = 0;
				750	for(i=0; i < num_threads; i++) {
				751	if(threadPos[i] >= 0) {
				752	sum += (iter - threadIters[i]) * file_size / num_threads + threadPos[i] - (file_size / num_threads) * i;
				753	n++;
				754	} else {
				755	sum += iter * file_size / num_threads;
				756	}
				757	}
				758	if(n == 0)
				759	break;
				760	real finished_portion = (real) sum / (float) (file_size * iter);
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame]	761	long long now = time(NULL);
				762	long long elapsed = (now - start);
				763	long long ttg = ((1.0 / finished_portion) * (real) elapsed - elapsed);
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	764
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame]	765	printf("\rAlpha: %.3f Done: %.2f%% with %.2fKB/s TE: %llds TTG: %llds ETA: %s\033[K",
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	766	alpha,
				767	finished_portion * 100,
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame]	768	(float) sum / elapsed / 1000,
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	769	elapsed,
				770	ttg,
				771	currentDateTime(timebuf, ttg)
				772	);
				773	fflush(stdout);
				774	}
				775	pthread_exit(NULL);
				776	}
				777
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	778	void TrainModelThread(void id) {
				779	long long a, b, d, cw, word, last_word, sentence_length = 0,
				780	sentence_position = 0;
Marc Kupietz	c564b1f	2023-12-22 15:38:29 +0100	[diff] [blame^]	781	long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1], metadata[MAX_METADATA_CATEGORIES];
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	782	long long l1, l2, c, target, label, local_iter = iter;
				783	unsigned long long next_random = (long long) id;
				784	real f, g;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	785	int input_len_1 = layer1_size;
				786	int window_offset = -1;
				787	if (type == 2 \|\| type == 4) {
				788	input_len_1 = window_layer_size;
				789	}
				790	real neu1 = (real ) calloc(input_len_1, sizeof(real));
				791	real neu1e = (real ) calloc(input_len_1, sizeof(real));
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	792	threadIters[(long) id] = iter;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	793
				794	int input_len_2 = 0;
				795	if (type == 4) {
				796	input_len_2 = window_hidden_size;
				797	}
				798	real neu2 = (real ) calloc(input_len_2, sizeof(real));
				799	real neu2e = (real ) calloc(input_len_2, sizeof(real));
				800
				801	FILE *fi = fopen(train_file, "rb");
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	802	long long start_pos = file_size / (long long) num_threads * (long long) id;
				803	long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
				804	long long current_pos = start_pos;
Marc Kupietz	c564b1f	2023-12-22 15:38:29 +0100	[diff] [blame^]	805	long long last_pos = start_pos;
				806	int is_metadata = 0;
				807	int metadata_index = 0;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	808	fseek(fi, start_pos, SEEK_SET);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	809	while (1) {
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	810	if ((current_pos - last_pos > 100000)) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	811	word_count_actual += word_count - last_word_count;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	812	last_pos = current_pos;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	813	last_word_count = word_count;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	814	alpha = starting_alpha
				815	* (1 - word_count_actual / (real) (iter * train_words + 1));
				816	if (alpha < starting_alpha * 0.0001)
				817	alpha = starting_alpha * 0.0001;
				818	}
				819	if (sentence_length == 0) {
				820	while (1) {
Marc Kupietz	c564b1f	2023-12-22 15:38:29 +0100	[diff] [blame^]	821	word = ReadWordIndex(fi, &is_metadata);
				822	if (is_metadata) {
				823	if (debug_mode > 1)
				824	printf("Metadata: %s\n", vocab[word].word);
				825	metadata[metadata_index++] = word;
				826	if (metadata_index >= metadata_categories) {
				827	metadata_index = 0;
				828	}
				829	continue;
				830	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	831	if (feof(fi))
				832	break;
				833	if (word == -1)
				834	continue;
				835	word_count++;
				836	if (word == 0)
				837	break;
				838	// The subsampling randomly discards frequent words while keeping the ranking same
				839	if (sample > 0) {
				840	real ran = (sqrt(vocab[word].cn / (sample * train_words))
				841	+ 1) * (sample * train_words) / vocab[word].cn;
				842	next_random = next_random * (unsigned long long) 25214903917
				843	+ 11;
Marc Kupietz	ab4e5af	2016-03-22 14:24:03 +0100	[diff] [blame]	844	if (ran < (next_random & 0xFFFF) / (real) 65536) {
				845	if(type == 3) // in structured skipgrams
				846	word = -2; // keep the window position correct
				847	else
				848	continue;
				849	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	850	}
				851	sen[sentence_length] = word;
				852	sentence_length++;
				853	if (sentence_length >= MAX_SENTENCE_LENGTH)
				854	break;
				855	}
				856	sentence_position = 0;
				857	}
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	858	current_pos = threadPos[(long) id] = ftell(fi);
				859	if (feof(fi) \|\| current_pos >= end_pos ) {
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	860	word_count_actual += word_count - last_word_count;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	861	threadIters[(long) id]--;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	862	local_iter--;
				863	if (local_iter == 0)
				864	break;
Marc Kupietz	e423f73	2017-12-22 17:57:03 +0100	[diff] [blame]	865	if (magic_stop_file[0] && access(magic_stop_file, F_OK ) != -1) {
				866	printf("Magic stop file %s found. Stopping traing ...\n", magic_stop_file);
				867	break;
				868	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	869	word_count = 0;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	870	current_pos = last_pos = start_pos;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	871	last_word_count = 0;
				872	sentence_length = 0;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	873	fseek(fi, start_pos, SEEK_SET);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	874	continue;
				875	}
				876	word = sen[sentence_position];
Peter Fankhauser	66035a4	2016-04-20 13:29:33 +0200	[diff] [blame]	877	while (word == -2 && sentence_position<sentence_length)
				878	word = sen[++sentence_position];
				879	if (sentence_position>=sentence_length) {
				880	sentence_length=0;
				881	continue;
				882	}
				883	if (word < 0)
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	884	continue;
				885	for (c = 0; c < input_len_1; c++)
				886	neu1[c] = 0;
				887	for (c = 0; c < input_len_1; c++)
				888	neu1e[c] = 0;
				889	for (c = 0; c < input_len_2; c++)
				890	neu2[c] = 0;
				891	for (c = 0; c < input_len_2; c++)
				892	neu2e[c] = 0;
				893	next_random = next_random * (unsigned long long) 25214903917 + 11;
				894	b = next_random % window;
				895	if (type == 0) { //train the cbow architecture
				896	// in -> hidden
				897	cw = 0;
				898	for (a = b; a < window * 2 + 1 - b; a++)
				899	if (a != window) {
				900	c = sentence_position - window + a;
				901	if (c < 0)
				902	continue;
				903	if (c >= sentence_length)
				904	continue;
				905	last_word = sen[c];
				906	if (last_word == -1)
				907	continue;
				908	for (c = 0; c < layer1_size; c++)
				909	neu1[c] += syn0[c + last_word * layer1_size];
				910	cw++;
				911	}
				912	if (cw) {
				913	for (c = 0; c < layer1_size; c++)
				914	neu1[c] /= cw;
				915	if (hs)
				916	for (d = 0; d < vocab[word].codelen; d++) {
				917	f = 0;
				918	l2 = vocab[word].point[d] * layer1_size;
				919	// Propagate hidden -> output
				920	for (c = 0; c < layer1_size; c++)
				921	f += neu1[c] * syn1[c + l2];
				922	if (f <= -MAX_EXP)
				923	continue;
				924	else if (f >= MAX_EXP)
				925	continue;
				926	else
				927	f = expTable[(int) ((f + MAX_EXP)
				928	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				929	// 'g' is the gradient multiplied by the learning rate
				930	g = (1 - vocab[word].code[d] - f) * alpha;
				931	// Propagate errors output -> hidden
				932	for (c = 0; c < layer1_size; c++)
				933	neu1e[c] += g * syn1[c + l2];
				934	// Learn weights hidden -> output
				935	for (c = 0; c < layer1_size; c++)
				936	syn1[c + l2] += g * neu1[c];
				937	if (cap == 1)
				938	for (c = 0; c < layer1_size; c++)
				939	capParam(syn1, c + l2);
				940	}
				941	// NEGATIVE SAMPLING
				942	if (negative > 0)
				943	for (d = 0; d < negative + 1; d++) {
				944	if (d == 0) {
				945	target = word;
				946	label = 1;
				947	} else {
				948	next_random = next_random
				949	* (unsigned long long) 25214903917 + 11;
				950	if (word_to_group != NULL
				951	&& word_to_group[word] != -1) {
				952	target = word;
				953	while (target == word) {
				954	target = group_to_table[word_to_group[word]
				955	* table_size
				956	+ (next_random >> 16) % table_size];
				957	next_random = next_random
				958	* (unsigned long long) 25214903917
				959	+ 11;
				960	}
				961	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				962	} else {
				963	target =
				964	table[(next_random >> 16) % table_size];
				965	}
				966	if (target == 0)
				967	target = next_random % (vocab_size - 1) + 1;
				968	if (target == word)
				969	continue;
				970	label = 0;
				971	}
				972	l2 = target * layer1_size;
				973	f = 0;
				974	for (c = 0; c < layer1_size; c++)
				975	f += neu1[c] * syn1neg[c + l2];
				976	if (f > MAX_EXP)
				977	g = (label - 1) * alpha;
				978	else if (f < -MAX_EXP)
				979	g = (label - 0) * alpha;
				980	else
				981	g = (label
				982	- expTable[(int) ((f + MAX_EXP)
				983	* (EXP_TABLE_SIZE / MAX_EXP / 2))])
				984	* alpha;
				985	for (c = 0; c < layer1_size; c++)
				986	neu1e[c] += g * syn1neg[c + l2];
				987	for (c = 0; c < layer1_size; c++)
				988	syn1neg[c + l2] += g * neu1[c];
				989	if (cap == 1)
				990	for (c = 0; c < layer1_size; c++)
				991	capParam(syn1neg, c + l2);
				992	}
				993	// Noise Contrastive Estimation
				994	if (nce > 0)
				995	for (d = 0; d < nce + 1; d++) {
				996	if (d == 0) {
				997	target = word;
				998	label = 1;
				999	} else {
				1000	next_random = next_random
				1001	* (unsigned long long) 25214903917 + 11;
				1002	if (word_to_group != NULL
				1003	&& word_to_group[word] != -1) {
				1004	target = word;
				1005	while (target == word) {
				1006	target = group_to_table[word_to_group[word]
				1007	* table_size
				1008	+ (next_random >> 16) % table_size];
				1009	next_random = next_random
				1010	* (unsigned long long) 25214903917
				1011	+ 11;
				1012	}
				1013	} else {
				1014	target =
				1015	table[(next_random >> 16) % table_size];
				1016	}
				1017	if (target == 0)
				1018	target = next_random % (vocab_size - 1) + 1;
				1019	if (target == word)
				1020	continue;
				1021	label = 0;
				1022	}
				1023	l2 = target * layer1_size;
				1024	f = 0;
				1025
				1026	for (c = 0; c < layer1_size; c++)
				1027	f += neu1[c] * syn1nce[c + l2];
				1028	if (f > MAX_EXP)
				1029	g = (label - 1) * alpha;
				1030	else if (f < -MAX_EXP)
				1031	g = (label - 0) * alpha;
				1032	else {
				1033	f = exp(f);
				1034	g =
				1035	(label
				1036	- f
				1037	/ (noise_distribution[target]
				1038	* nce + f)) * alpha;
				1039	}
				1040	for (c = 0; c < layer1_size; c++)
				1041	neu1e[c] += g * syn1nce[c + l2];
				1042	for (c = 0; c < layer1_size; c++)
				1043	syn1nce[c + l2] += g * neu1[c];
				1044	if (cap == 1)
				1045	for (c = 0; c < layer1_size; c++)
				1046	capParam(syn1nce, c + l2);
				1047	}
				1048	// hidden -> in
				1049	for (a = b; a < window * 2 + 1 - b; a++)
				1050	if (a != window) {
				1051	c = sentence_position - window + a;
				1052	if (c < 0)
				1053	continue;
				1054	if (c >= sentence_length)
				1055	continue;
				1056	last_word = sen[c];
				1057	if (last_word == -1)
				1058	continue;
				1059	for (c = 0; c < layer1_size; c++)
				1060	syn0[c + last_word * layer1_size] += neu1e[c];
				1061	}
				1062	}
				1063	} else if (type == 1) { //train skip-gram
				1064	for (a = b; a < window * 2 + 1 - b; a++)
				1065	if (a != window) {
				1066	c = sentence_position - window + a;
				1067	if (c < 0)
				1068	continue;
				1069	if (c >= sentence_length)
				1070	continue;
				1071	last_word = sen[c];
				1072	if (last_word == -1)
				1073	continue;
				1074	l1 = last_word * layer1_size;
				1075	for (c = 0; c < layer1_size; c++)
				1076	neu1e[c] = 0;
				1077	// HIERARCHICAL SOFTMAX
				1078	if (hs)
				1079	for (d = 0; d < vocab[word].codelen; d++) {
				1080	f = 0;
				1081	l2 = vocab[word].point[d] * layer1_size;
				1082	// Propagate hidden -> output
				1083	for (c = 0; c < layer1_size; c++)
				1084	f += syn0[c + l1] * syn1[c + l2];
				1085	if (f <= -MAX_EXP)
				1086	continue;
				1087	else if (f >= MAX_EXP)
				1088	continue;
				1089	else
				1090	f = expTable[(int) ((f + MAX_EXP)
				1091	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1092	// 'g' is the gradient multiplied by the learning rate
				1093	g = (1 - vocab[word].code[d] - f) * alpha;
				1094	// Propagate errors output -> hidden
				1095	for (c = 0; c < layer1_size; c++)
				1096	neu1e[c] += g * syn1[c + l2];
				1097	// Learn weights hidden -> output
				1098	for (c = 0; c < layer1_size; c++)
				1099	syn1[c + l2] += g * syn0[c + l1];
				1100	if (cap == 1)
				1101	for (c = 0; c < layer1_size; c++)
				1102	capParam(syn1, c + l2);
				1103	}
				1104	// NEGATIVE SAMPLING
				1105	if (negative > 0)
				1106	for (d = 0; d < negative + 1; d++) {
				1107	if (d == 0) {
				1108	target = word;
				1109	label = 1;
				1110	} else {
				1111	next_random = next_random
				1112	* (unsigned long long) 25214903917 + 11;
				1113	if (word_to_group != NULL
				1114	&& word_to_group[word] != -1) {
				1115	target = word;
				1116	while (target == word) {
				1117	target =
				1118	group_to_table[word_to_group[word]
				1119	* table_size
				1120	+ (next_random >> 16)
				1121	% table_size];
				1122	next_random =
				1123	next_random
				1124	* (unsigned long long) 25214903917
				1125	+ 11;
				1126	}
				1127	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1128	} else {
				1129	target = table[(next_random >> 16)
				1130	% table_size];
				1131	}
				1132	if (target == 0)
				1133	target = next_random % (vocab_size - 1) + 1;
				1134	if (target == word)
				1135	continue;
				1136	label = 0;
				1137	}
				1138	l2 = target * layer1_size;
				1139	f = 0;
				1140	for (c = 0; c < layer1_size; c++)
				1141	f += syn0[c + l1] * syn1neg[c + l2];
				1142	if (f > MAX_EXP)
				1143	g = (label - 1) * alpha;
				1144	else if (f < -MAX_EXP)
				1145	g = (label - 0) * alpha;
				1146	else
				1147	g =
				1148	(label
				1149	- expTable[(int) ((f + MAX_EXP)
				1150	* (EXP_TABLE_SIZE
				1151	/ MAX_EXP / 2))])
				1152	* alpha;
				1153	for (c = 0; c < layer1_size; c++)
				1154	neu1e[c] += g * syn1neg[c + l2];
				1155	for (c = 0; c < layer1_size; c++)
				1156	syn1neg[c + l2] += g * syn0[c + l1];
				1157	if (cap == 1)
				1158	for (c = 0; c < layer1_size; c++)
				1159	capParam(syn1neg, c + l2);
				1160	}
				1161	//Noise Contrastive Estimation
				1162	if (nce > 0)
				1163	for (d = 0; d < nce + 1; d++) {
				1164	if (d == 0) {
				1165	target = word;
				1166	label = 1;
				1167	} else {
				1168	next_random = next_random
				1169	* (unsigned long long) 25214903917 + 11;
				1170	if (word_to_group != NULL
				1171	&& word_to_group[word] != -1) {
				1172	target = word;
				1173	while (target == word) {
				1174	target =
				1175	group_to_table[word_to_group[word]
				1176	* table_size
				1177	+ (next_random >> 16)
				1178	% table_size];
				1179	next_random =
				1180	next_random
				1181	* (unsigned long long) 25214903917
				1182	+ 11;
				1183	}
				1184	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1185	} else {
				1186	target = table[(next_random >> 16)
				1187	% table_size];
				1188	}
				1189	if (target == 0)
				1190	target = next_random % (vocab_size - 1) + 1;
				1191	if (target == word)
				1192	continue;
				1193	label = 0;
				1194	}
				1195	l2 = target * layer1_size;
				1196	f = 0;
				1197	for (c = 0; c < layer1_size; c++)
				1198	f += syn0[c + l1] * syn1nce[c + l2];
				1199	if (f > MAX_EXP)
				1200	g = (label - 1) * alpha;
				1201	else if (f < -MAX_EXP)
				1202	g = (label - 0) * alpha;
				1203	else {
				1204	f = exp(f);
				1205	g = (label
				1206	- f
				1207	/ (noise_distribution[target]
				1208	* nce + f)) * alpha;
				1209	}
				1210	for (c = 0; c < layer1_size; c++)
				1211	neu1e[c] += g * syn1nce[c + l2];
				1212	for (c = 0; c < layer1_size; c++)
				1213	syn1nce[c + l2] += g * syn0[c + l1];
				1214	if (cap == 1)
				1215	for (c = 0; c < layer1_size; c++)
				1216	capParam(syn1nce, c + l2);
				1217	}
				1218	// Learn weights input -> hidden
				1219	for (c = 0; c < layer1_size; c++)
				1220	syn0[c + l1] += neu1e[c];
				1221	}
				1222	} else if (type == 2) { //train the cwindow architecture
				1223	// in -> hidden
				1224	cw = 0;
				1225	for (a = 0; a < window * 2 + 1; a++)
				1226	if (a != window) {
				1227	c = sentence_position - window + a;
				1228	if (c < 0)
				1229	continue;
				1230	if (c >= sentence_length)
				1231	continue;
				1232	last_word = sen[c];
				1233	if (last_word == -1)
				1234	continue;
				1235	window_offset = a * layer1_size;
				1236	if (a > window)
				1237	window_offset -= layer1_size;
				1238	for (c = 0; c < layer1_size; c++)
				1239	neu1[c + window_offset] += syn0[c
				1240	+ last_word * layer1_size];
				1241	cw++;
				1242	}
				1243	if (cw) {
				1244	if (hs)
				1245	for (d = 0; d < vocab[word].codelen; d++) {
				1246	f = 0;
				1247	l2 = vocab[word].point[d] * window_layer_size;
				1248	// Propagate hidden -> output
				1249	for (c = 0; c < window_layer_size; c++)
				1250	f += neu1[c] * syn1_window[c + l2];
				1251	if (f <= -MAX_EXP)
				1252	continue;
				1253	else if (f >= MAX_EXP)
				1254	continue;
				1255	else
				1256	f = expTable[(int) ((f + MAX_EXP)
				1257	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1258	// 'g' is the gradient multiplied by the learning rate
				1259	g = (1 - vocab[word].code[d] - f) * alpha;
				1260	// Propagate errors output -> hidden
				1261	for (c = 0; c < window_layer_size; c++)
				1262	neu1e[c] += g * syn1_window[c + l2];
				1263	// Learn weights hidden -> output
				1264	for (c = 0; c < window_layer_size; c++)
				1265	syn1_window[c + l2] += g * neu1[c];
				1266	if (cap == 1)
				1267	for (c = 0; c < window_layer_size; c++)
				1268	capParam(syn1_window, c + l2);
				1269	}
				1270	// NEGATIVE SAMPLING
				1271	if (negative > 0)
				1272	for (d = 0; d < negative + 1; d++) {
				1273	if (d == 0) {
				1274	target = word;
				1275	label = 1;
				1276	} else {
				1277	next_random = next_random
				1278	* (unsigned long long) 25214903917 + 11;
				1279	if (word_to_group != NULL
				1280	&& word_to_group[word] != -1) {
				1281	target = word;
				1282	while (target == word) {
				1283	target = group_to_table[word_to_group[word]
				1284	* table_size
				1285	+ (next_random >> 16) % table_size];
				1286	next_random = next_random
				1287	* (unsigned long long) 25214903917
				1288	+ 11;
				1289	}
				1290	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1291	} else {
				1292	target =
				1293	table[(next_random >> 16) % table_size];
				1294	}
				1295	if (target == 0)
				1296	target = next_random % (vocab_size - 1) + 1;
				1297	if (target == word)
				1298	continue;
				1299	label = 0;
				1300	}
				1301	l2 = target * window_layer_size;
				1302	f = 0;
				1303	for (c = 0; c < window_layer_size; c++)
				1304	f += neu1[c] * syn1neg_window[c + l2];
				1305	if (f > MAX_EXP)
				1306	g = (label - 1) * alpha;
				1307	else if (f < -MAX_EXP)
				1308	g = (label - 0) * alpha;
				1309	else
				1310	g = (label
				1311	- expTable[(int) ((f + MAX_EXP)
				1312	* (EXP_TABLE_SIZE / MAX_EXP / 2))])
				1313	* alpha;
				1314	for (c = 0; c < window_layer_size; c++)
				1315	neu1e[c] += g * syn1neg_window[c + l2];
				1316	for (c = 0; c < window_layer_size; c++)
				1317	syn1neg_window[c + l2] += g * neu1[c];
				1318	if (cap == 1)
				1319	for (c = 0; c < window_layer_size; c++)
				1320	capParam(syn1neg_window, c + l2);
				1321	}
				1322	// Noise Contrastive Estimation
				1323	if (nce > 0)
				1324	for (d = 0; d < nce + 1; d++) {
				1325	if (d == 0) {
				1326	target = word;
				1327	label = 1;
				1328	} else {
				1329	next_random = next_random
				1330	* (unsigned long long) 25214903917 + 11;
				1331	if (word_to_group != NULL
				1332	&& word_to_group[word] != -1) {
				1333	target = word;
				1334	while (target == word) {
				1335	target = group_to_table[word_to_group[word]
				1336	* table_size
				1337	+ (next_random >> 16) % table_size];
				1338	next_random = next_random
				1339	* (unsigned long long) 25214903917
				1340	+ 11;
				1341	}
				1342	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1343	} else {
				1344	target =
				1345	table[(next_random >> 16) % table_size];
				1346	}
				1347	if (target == 0)
				1348	target = next_random % (vocab_size - 1) + 1;
				1349	if (target == word)
				1350	continue;
				1351	label = 0;
				1352	}
				1353	l2 = target * window_layer_size;
				1354	f = 0;
				1355	for (c = 0; c < window_layer_size; c++)
				1356	f += neu1[c] * syn1nce_window[c + l2];
				1357	if (f > MAX_EXP)
				1358	g = (label - 1) * alpha;
				1359	else if (f < -MAX_EXP)
				1360	g = (label - 0) * alpha;
				1361	else {
				1362	f = exp(f);
				1363	g =
				1364	(label
				1365	- f
				1366	/ (noise_distribution[target]
				1367	* nce + f)) * alpha;
				1368	}
				1369	for (c = 0; c < window_layer_size; c++)
				1370	neu1e[c] += g * syn1nce_window[c + l2];
				1371	for (c = 0; c < window_layer_size; c++)
				1372	syn1nce_window[c + l2] += g * neu1[c];
				1373	if (cap == 1)
				1374	for (c = 0; c < window_layer_size; c++)
				1375	capParam(syn1nce_window, c + l2);
				1376	}
				1377	// hidden -> in
				1378	for (a = 0; a < window * 2 + 1; a++)
				1379	if (a != window) {
				1380	c = sentence_position - window + a;
				1381	if (c < 0)
				1382	continue;
				1383	if (c >= sentence_length)
				1384	continue;
				1385	last_word = sen[c];
				1386	if (last_word == -1)
				1387	continue;
				1388	window_offset = a * layer1_size;
				1389	if (a > window)
				1390	window_offset -= layer1_size;
				1391	for (c = 0; c < layer1_size; c++)
				1392	syn0[c + last_word * layer1_size] += neu1e[c
				1393	+ window_offset];
				1394	}
				1395	}
				1396	} else if (type == 3) { //train structured skip-gram
				1397	for (a = 0; a < window * 2 + 1; a++)
				1398	if (a != window) {
				1399	c = sentence_position - window + a;
				1400	if (c < 0)
				1401	continue;
				1402	if (c >= sentence_length)
				1403	continue;
				1404	last_word = sen[c];
Peter Fankhauser	66035a4	2016-04-20 13:29:33 +0200	[diff] [blame]	1405	if (last_word < 0)
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1406	continue;
				1407	l1 = last_word * layer1_size;
				1408	window_offset = a * layer1_size;
				1409	if (a > window)
				1410	window_offset -= layer1_size;
				1411	for (c = 0; c < layer1_size; c++)
				1412	neu1e[c] = 0;
				1413	// HIERARCHICAL SOFTMAX
				1414	if (hs)
				1415	for (d = 0; d < vocab[word].codelen; d++) {
				1416	f = 0;
				1417	l2 = vocab[word].point[d] * window_layer_size;
				1418	// Propagate hidden -> output
				1419	for (c = 0; c < layer1_size; c++)
				1420	f += syn0[c + l1]
				1421	* syn1_window[c + l2 + window_offset];
				1422	if (f <= -MAX_EXP)
				1423	continue;
				1424	else if (f >= MAX_EXP)
				1425	continue;
				1426	else
				1427	f = expTable[(int) ((f + MAX_EXP)
				1428	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1429	// 'g' is the gradient multiplied by the learning rate
				1430	g = (1 - vocab[word].code[d] - f) * alpha;
				1431	// Propagate errors output -> hidden
				1432	for (c = 0; c < layer1_size; c++)
				1433	neu1e[c] += g
				1434	* syn1_window[c + l2 + window_offset];
				1435	// Learn weights hidden -> output
				1436	for (c = 0; c < layer1_size; c++)
				1437	syn1[c + l2 + window_offset] += g
				1438	* syn0[c + l1];
				1439	if (cap == 1)
				1440	for (c = 0; c < layer1_size; c++)
				1441	capParam(syn1, c + l2 + window_offset);
				1442	}
				1443	// NEGATIVE SAMPLING
				1444	if (negative > 0)
				1445	for (d = 0; d < negative + 1; d++) {
				1446	if (d == 0) {
				1447	target = word;
				1448	label = 1;
				1449	} else {
				1450	next_random = next_random
				1451	* (unsigned long long) 25214903917 + 11;
				1452	if (word_to_group != NULL
				1453	&& word_to_group[word] != -1) {
				1454	target = word;
				1455	while (target == word) {
				1456	target =
				1457	group_to_table[word_to_group[word]
				1458	* table_size
				1459	+ (next_random >> 16)
				1460	% table_size];
				1461	next_random =
				1462	next_random
				1463	* (unsigned long long) 25214903917
				1464	+ 11;
				1465	}
				1466	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1467	} else {
				1468	target = table[(next_random >> 16)
				1469	% table_size];
				1470	}
				1471	if (target == 0)
				1472	target = next_random % (vocab_size - 1) + 1;
				1473	if (target == word)
				1474	continue;
				1475	label = 0;
				1476	}
				1477	l2 = target * window_layer_size;
				1478	f = 0;
				1479	for (c = 0; c < layer1_size; c++)
				1480	f +=
				1481	syn0[c + l1]
				1482	* syn1neg_window[c + l2
				1483	+ window_offset];
				1484	if (f > MAX_EXP)
				1485	g = (label - 1) * alpha;
				1486	else if (f < -MAX_EXP)
				1487	g = (label - 0) * alpha;
				1488	else
				1489	g =
				1490	(label
				1491	- expTable[(int) ((f + MAX_EXP)
				1492	* (EXP_TABLE_SIZE
				1493	/ MAX_EXP / 2))])
				1494	* alpha;
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1495	if(debug_mode > 2 && ((long long) id) == 0) {
				1496	printf("negative sampling %lld for input (word) %s (#%lld), target (last word) %s returned %s (#%lld), ", d, vocab[word].word, word, vocab[last_word].word, vocab[target].word, target);
				1497	printf("label %lld, a %lld, gain %.4f\n", label, a-window, g);
				1498	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1499	for (c = 0; c < layer1_size; c++)
				1500	neu1e[c] +=
				1501	g
				1502	* syn1neg_window[c + l2
				1503	+ window_offset];
				1504	for (c = 0; c < layer1_size; c++)
				1505	syn1neg_window[c + l2 + window_offset] += g
				1506	* syn0[c + l1];
				1507	if (cap == 1)
				1508	for (c = 0; c < layer1_size; c++)
				1509	capParam(syn1neg_window,
				1510	c + l2 + window_offset);
				1511	}
				1512	// Noise Constrastive Estimation
				1513	if (nce > 0)
				1514	for (d = 0; d < nce + 1; d++) {
				1515	if (d == 0) {
				1516	target = word;
				1517	label = 1;
				1518	} else {
				1519	next_random = next_random
				1520	* (unsigned long long) 25214903917 + 11;
				1521	if (word_to_group != NULL
				1522	&& word_to_group[word] != -1) {
				1523	target = word;
				1524	while (target == word) {
				1525	target =
				1526	group_to_table[word_to_group[word]
				1527	* table_size
				1528	+ (next_random >> 16)
				1529	% table_size];
				1530	next_random =
				1531	next_random
				1532	* (unsigned long long) 25214903917
				1533	+ 11;
				1534	}
				1535	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1536	} else {
				1537	target = table[(next_random >> 16)
				1538	% table_size];
				1539	}
				1540	if (target == 0)
				1541	target = next_random % (vocab_size - 1) + 1;
				1542	if (target == word)
				1543	continue;
				1544	label = 0;
				1545	}
				1546	l2 = target * window_layer_size;
				1547	f = 0;
				1548	for (c = 0; c < layer1_size; c++)
				1549	f +=
				1550	syn0[c + l1]
				1551	* syn1nce_window[c + l2
				1552	+ window_offset];
				1553	if (f > MAX_EXP)
				1554	g = (label - 1) * alpha;
				1555	else if (f < -MAX_EXP)
				1556	g = (label - 0) * alpha;
				1557	else {
				1558	f = exp(f);
				1559	g = (label
				1560	- f
				1561	/ (noise_distribution[target]
				1562	* nce + f)) * alpha;
				1563	}
				1564	for (c = 0; c < layer1_size; c++)
				1565	neu1e[c] +=
				1566	g
				1567	* syn1nce_window[c + l2
				1568	+ window_offset];
				1569	for (c = 0; c < layer1_size; c++)
				1570	syn1nce_window[c + l2 + window_offset] += g
				1571	* syn0[c + l1];
				1572	if (cap == 1)
				1573	for (c = 0; c < layer1_size; c++)
				1574	capParam(syn1nce_window,
				1575	c + l2 + window_offset);
				1576	}
				1577	// Learn weights input -> hidden
				1578	for (c = 0; c < layer1_size; c++) {
				1579	syn0[c + l1] += neu1e[c];
				1580	if (syn0[c + l1] > 50)
				1581	syn0[c + l1] = 50;
				1582	if (syn0[c + l1] < -50)
				1583	syn0[c + l1] = -50;
				1584	}
				1585	}
				1586	} else if (type == 4) { //training senna
				1587	// in -> hidden
				1588	cw = 0;
				1589	for (a = 0; a < window * 2 + 1; a++)
				1590	if (a != window) {
				1591	c = sentence_position - window + a;
				1592	if (c < 0)
				1593	continue;
				1594	if (c >= sentence_length)
				1595	continue;
				1596	last_word = sen[c];
				1597	if (last_word == -1)
				1598	continue;
				1599	window_offset = a * layer1_size;
				1600	if (a > window)
				1601	window_offset -= layer1_size;
				1602	for (c = 0; c < layer1_size; c++)
				1603	neu1[c + window_offset] += syn0[c
				1604	+ last_word * layer1_size];
				1605	cw++;
				1606	}
				1607	if (cw) {
				1608	for (a = 0; a < window_hidden_size; a++) {
				1609	c = a * window_layer_size;
				1610	for (b = 0; b < window_layer_size; b++) {
				1611	neu2[a] += syn_window_hidden[c + b] * neu1[b];
				1612	}
				1613	}
				1614	if (hs)
				1615	for (d = 0; d < vocab[word].codelen; d++) {
				1616	f = 0;
				1617	l2 = vocab[word].point[d] * window_hidden_size;
				1618	// Propagate hidden -> output
				1619	for (c = 0; c < window_hidden_size; c++)
				1620	f += hardTanh(neu2[c]) * syn_hidden_word[c + l2];
				1621	if (f <= -MAX_EXP)
				1622	continue;
				1623	else if (f >= MAX_EXP)
				1624	continue;
				1625	else
				1626	f = expTable[(int) ((f + MAX_EXP)
				1627	* (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1628	// 'g' is the gradient multiplied by the learning rate
				1629	g = (1 - vocab[word].code[d] - f) * alpha;
				1630	// Propagate errors output -> hidden
				1631	for (c = 0; c < window_hidden_size; c++)
				1632	neu2e[c] += dHardTanh(neu2[c], g) * g
				1633	* syn_hidden_word[c + l2];
				1634	// Learn weights hidden -> output
				1635	for (c = 0; c < window_hidden_size; c++)
				1636	syn_hidden_word[c + l2] += dHardTanh(neu2[c], g) * g
				1637	* neu2[c];
				1638	}
				1639	// NEGATIVE SAMPLING
				1640	if (negative > 0)
				1641	for (d = 0; d < negative + 1; d++) {
				1642	if (d == 0) {
				1643	target = word;
				1644	label = 1;
				1645	} else {
				1646	next_random = next_random
				1647	* (unsigned long long) 25214903917 + 11;
				1648	if (word_to_group != NULL
				1649	&& word_to_group[word] != -1) {
				1650	target = word;
				1651	while (target == word) {
				1652	target = group_to_table[word_to_group[word]
				1653	* table_size
				1654	+ (next_random >> 16) % table_size];
				1655	next_random = next_random
				1656	* (unsigned long long) 25214903917
				1657	+ 11;
				1658	}
				1659	//printf("negative sampling %lld for word %s returned %s\n", d, vocab[word].word, vocab[target].word);
				1660	} else {
				1661	target =
				1662	table[(next_random >> 16) % table_size];
				1663	}
				1664	if (target == 0)
				1665	target = next_random % (vocab_size - 1) + 1;
				1666	if (target == word)
				1667	continue;
				1668	label = 0;
				1669	}
				1670	l2 = target * window_hidden_size;
				1671	f = 0;
				1672	for (c = 0; c < window_hidden_size; c++)
				1673	f += hardTanh(neu2[c])
				1674	* syn_hidden_word_neg[c + l2];
				1675	if (f > MAX_EXP)
				1676	g = (label - 1) * alpha / negative;
				1677	else if (f < -MAX_EXP)
				1678	g = (label - 0) * alpha / negative;
				1679	else
				1680	g = (label
				1681	- expTable[(int) ((f + MAX_EXP)
				1682	* (EXP_TABLE_SIZE / MAX_EXP / 2))])
				1683	* alpha / negative;
				1684	for (c = 0; c < window_hidden_size; c++)
				1685	neu2e[c] += dHardTanh(neu2[c], g) * g
				1686	* syn_hidden_word_neg[c + l2];
				1687	for (c = 0; c < window_hidden_size; c++)
				1688	syn_hidden_word_neg[c + l2] += dHardTanh(neu2[c], g)
				1689	* g * neu2[c];
				1690	}
				1691	for (a = 0; a < window_hidden_size; a++)
				1692	for (b = 0; b < window_layer_size; b++)
				1693	neu1e[b] += neu2e[a]
				1694	* syn_window_hidden[a * window_layer_size + b];
				1695	for (a = 0; a < window_hidden_size; a++)
				1696	for (b = 0; b < window_layer_size; b++)
				1697	syn_window_hidden[a * window_layer_size + b] += neu2e[a]
				1698	* neu1[b];
				1699	// hidden -> in
				1700	for (a = 0; a < window * 2 + 1; a++)
				1701	if (a != window) {
				1702	c = sentence_position - window + a;
				1703	if (c < 0)
				1704	continue;
				1705	if (c >= sentence_length)
				1706	continue;
				1707	last_word = sen[c];
				1708	if (last_word == -1)
				1709	continue;
				1710	window_offset = a * layer1_size;
				1711	if (a > window)
				1712	window_offset -= layer1_size;
				1713	for (c = 0; c < layer1_size; c++)
				1714	syn0[c + last_word * layer1_size] += neu1e[c
				1715	+ window_offset];
				1716	}
				1717	}
Marc Kupietz	613edbf	2018-01-11 21:38:03 +0100	[diff] [blame]	1718	} else if(type == 5) {
				1719	for (a = b; a < window * 2 + 1 - b; a++) if (a != window) {
				1720	c = sentence_position - window + a;
				1721	if (c < 0) continue;
				1722	if (c >= sentence_length) continue;
				1723	last_word = sen[c];
				1724	if (last_word == -1) continue;
				1725	inc_collocator(cdb, word, last_word, a - window);
				1726	// printf("%2d: storing %s %s - %d\n", id, vocab[word].word, vocab[last_word].word, (int) a - window);
				1727	// cw++;
				1728	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1729	} else {
				1730	printf("unknown type %i", type);
				1731	exit(0);
				1732	}
				1733	sentence_position++;
				1734	if (sentence_position >= sentence_length) {
				1735	sentence_length = 0;
				1736	continue;
				1737	}
				1738	}
				1739	fclose(fi);
				1740	free(neu1);
				1741	free(neu1e);
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1742	threadPos[(long) id] = -1;
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1743	pthread_exit(NULL);
				1744	}
				1745
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1746	void ShowCollocations() {
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1747	long a, b, c, d, e, window_offset, target, max_target=0, maxmax_target;
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1748	real f, max_f, maxmax_f;
Marc Kupietz	f00e7b0	2023-12-22 11:11:56 +0100	[diff] [blame]	1749	real *target_sums=0L, bestf[MAX_CC], worstbest;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1750	long besti[MAX_CC];
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1751	int N = 10, bestp[MAX_CC];
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1752	a = posix_memalign((void *) &target_sums, 128, vocab_size sizeof(real));
				1753
				1754	for (d = cc; d < vocab_size; d++) {
				1755	for (b = 0; b < vocab_size; b++)
				1756	target_sums[b]=0;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1757	for (b = 0; b < N; b++)
				1758	bestf[b]=-1;
				1759	worstbest = -1;
				1760
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1761	maxmax_f = -1;
				1762	maxmax_target = 0;
Marc Kupietz	0a664c1	2016-03-18 13:18:22 +0100	[diff] [blame]	1763	for (a = window * 2 + 1; a >=0; a--) {
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1764	if (a != window) {
				1765	max_f = -1;
				1766	window_offset = a * layer1_size;
				1767	if (a > window)
				1768	window_offset -= layer1_size;
				1769	for(target = 0; target < vocab_size; target ++) {
				1770	if(target == d)
				1771	continue;
				1772	f = 0;
				1773	for (c = 0; c < layer1_size; c++)
				1774	f += syn0[d* layer1_size + c] * syn1neg_window[target * window_layer_size + window_offset + c];
				1775	if (f < -MAX_EXP)
				1776	continue;
				1777	else if (f > MAX_EXP)
				1778	continue;
				1779	else
				1780	f = expTable[(int) ((f + MAX_EXP) * (EXP_TABLE_SIZE / MAX_EXP / 2))];
				1781	if(f > max_f) {
				1782	max_f = f;
				1783	max_target = target;
				1784	}
Marc Kupietz	0fb5d61	2016-03-18 11:01:21 +0100	[diff] [blame]	1785	target_sums[target] += (1-target_sums[target]) * f;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1786	if(f > worstbest) {
				1787	for (b = 0; b < N; b++) {
				1788	if (f > bestf[b]) {
				1789	for (e = N - 1; e > b; e--) {
				1790	bestf[e] = bestf[e - 1];
				1791	besti[e] = besti[e - 1];
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1792	bestp[e] = bestp[e - 1];
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1793	}
				1794	bestf[b] = f;
				1795	besti[b] = target;
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1796	bestp[b] = window-a;
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1797	break;
				1798	}
				1799	}
				1800	worstbest = bestf[N-1];
				1801	}
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1802	}
				1803	printf("%s (%.2f) ", vocab[max_target].word, max_f);
				1804	if(max_f > maxmax_f) {
				1805	maxmax_f = max_f;
				1806	maxmax_target = max_target;
				1807	}
				1808	} else {
				1809	printf("\x1b[1m%s\x1b[0m ", vocab[d].word);
				1810	}
				1811	}
				1812	max_f = -1;
				1813	for (b = 0; b < vocab_size; b++) {
				1814	if(target_sums[b] > max_f) {
				1815	max_f = target_sums[b];
				1816	max_target = b;
				1817	}
				1818	}
				1819	printf(" – max sum: %s (%.2f), max resp.: \x1b[1m%s\x1b[0m (%.2f)\n",
Marc Kupietz	0fb5d61	2016-03-18 11:01:21 +0100	[diff] [blame]	1820	vocab[max_target].word, max_f,
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1821	vocab[maxmax_target].word, maxmax_f);
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1822	for(b=0; b<N && bestf[b]>-1; b++)
Marc Kupietz	79fd83d	2016-03-18 14:09:07 +0100	[diff] [blame]	1823	printf("%-32s %.2f %d\n", vocab[besti[b]].word, bestf[b], bestp[b]);
Marc Kupietz	71996e7	2016-03-18 13:40:24 +0100	[diff] [blame]	1824	printf("\n");
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1825	}
				1826	}
				1827
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1828	void TrainModel() {
				1829	long a, b, c, d;
				1830	FILE *fo;
				1831	pthread_t pt = (pthread_t ) malloc(num_threads * sizeof(pthread_t));
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1832	threadPos = malloc(num_threads * sizeof(long long));
				1833	threadIters = malloc(num_threads * sizeof(int));
				1834	char *timebuf = malloc(80);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1835	printf("Starting training using file %s\n", train_file);
				1836	starting_alpha = alpha;
				1837	if (read_vocab_file[0] != 0)
				1838	ReadVocab();
				1839	else
				1840	LearnVocabFromTrainFile();
				1841	if (save_vocab_file[0] != 0)
				1842	SaveVocab();
				1843	if (output_file[0] == 0)
				1844	return;
				1845	InitNet();
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	1846	if(cc > 0)
				1847	ShowCollocations();
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1848	if (negative > 0 \|\| nce > 0)
				1849	InitUnigramTable();
				1850	if (negative_classes_file[0] != 0)
				1851	InitClassUnigramTable();
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame]	1852	start = time(NULL);
				1853	start_clock = clock();
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1854	for (a = 0; a < num_threads; a++)
				1855	pthread_create(&pt[a], NULL, TrainModelThread, (void *) a);
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1856	if(debug_mode > 1)
				1857	pthread_create(&pt[num_threads], NULL, MonitorThread, (void *) a);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1858	for (a = 0; a < num_threads; a++)
				1859	pthread_join(pt[a], NULL);
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1860	if(debug_mode > 1) {
				1861	pthread_join(pt[num_threads], NULL);
Marc Kupietz	b366bcd	2018-01-11 21:29:41 +0100	[diff] [blame]	1862	clock_t now = time(NULL);
				1863	clock_t now_clock = clock();
				1864	printf("\nFinished: %s - user: %lds - real: %lds\n", currentDateTime(timebuf, 0), (now_clock - start_clock) / CLOCKS_PER_SEC, now - start);
Marc Kupietz	613edbf	2018-01-11 21:38:03 +0100	[diff] [blame]	1865	if(type == 5) // don't save vectorsmfor classic collocators
				1866	return;
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1867	printf("Saving vectors to %s ...", output_file);
				1868	fflush(stdout);
				1869	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1870	fo = fopen(output_file, "wb");
				1871	if (classes == 0) {
				1872	// Save the word vectors
				1873	fprintf(fo, "%lld %lld\n", vocab_size, layer1_size);
				1874	for (a = 0; a < vocab_size; a++) {
				1875	fprintf(fo, "%s ", vocab[a].word);
				1876	if (binary)
				1877	for (b = 0; b < layer1_size; b++)
				1878	fwrite(&syn0[a * layer1_size + b], sizeof(real), 1, fo);
				1879	else
				1880	for (b = 0; b < layer1_size; b++)
				1881	fprintf(fo, "%lf ", syn0[a * layer1_size + b]);
				1882	fprintf(fo, "\n");
				1883	}
Marc Kupietz	202723e	2016-07-14 09:12:00 +0200	[diff] [blame]	1884	if(debug_mode > 1)
				1885	fprintf(stderr, "\n");
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	1886	} else {
				1887	// Run K-means on the word vectors
				1888	int clcn = classes, iter = 10, closeid;
				1889	int centcn = (int ) malloc(classes * sizeof(int));
				1890	int cl = (int ) calloc(vocab_size, sizeof(int));
				1891	real closev, x;
				1892	real cent = (real ) calloc(classes * layer1_size, sizeof(real));
				1893	for (a = 0; a < vocab_size; a++)
				1894	cl[a] = a % clcn;
				1895	for (a = 0; a < iter; a++) {
				1896	for (b = 0; b < clcn * layer1_size; b++)
				1897	cent[b] = 0;
				1898	for (b = 0; b < clcn; b++)
				1899	centcn[b] = 1;
				1900	for (c = 0; c < vocab_size; c++) {
				1901	for (d = 0; d < layer1_size; d++)
				1902	cent[layer1_size * cl[c] + d] += syn0[c * layer1_size + d];
				1903	centcn[cl[c]]++;
				1904	}
				1905	for (b = 0; b < clcn; b++) {
				1906	closev = 0;
				1907	for (c = 0; c < layer1_size; c++) {
				1908	cent[layer1_size * b + c] /= centcn[b];
				1909	closev += cent[layer1_size * b + c]
				1910	* cent[layer1_size * b + c];
				1911	}
				1912	closev = sqrt(closev);
				1913	for (c = 0; c < layer1_size; c++)
				1914	cent[layer1_size * b + c] /= closev;
				1915	}
				1916	for (c = 0; c < vocab_size; c++) {
				1917	closev = -10;
				1918	closeid = 0;
				1919	for (d = 0; d < clcn; d++) {
				1920	x = 0;
				1921	for (b = 0; b < layer1_size; b++)
				1922	x += cent[layer1_size * d + b]
				1923	* syn0[c * layer1_size + b];
				1924	if (x > closev) {
				1925	closev = x;
				1926	closeid = d;
				1927	}
				1928	}
				1929	cl[c] = closeid;
				1930	}
				1931	}
				1932	// Save the K-means classes
				1933	for (a = 0; a < vocab_size; a++)
				1934	fprintf(fo, "%s %d\n", vocab[a].word, cl[a]);
				1935	free(centcn);
				1936	free(cent);
				1937	free(cl);
				1938	}
				1939	fclose(fo);
				1940	if (save_net_file[0] != 0)
				1941	SaveNet();
				1942	}
				1943
				1944	int ArgPos(char str, int argc, char *argv) {
				1945	int a;
				1946	for (a = 1; a < argc; a++)
				1947	if (!strcmp(str, argv[a])) {
				1948	if (a == argc - 1) {
				1949	printf("Argument missing for %s\n", str);
				1950	exit(1);
				1951	}
				1952	return a;
				1953	}
				1954	return -1;
				1955	}
				1956
/*
 * Writes the command-line usage text (banner, one entry per option, and an
 * example invocation) to standard output.
 */
void print_help() {
	// The help text, one chunk per original output call, in output order.
	static const char *const help_text[] = {
		"WORD VECTOR estimation toolkit v 0.9.0\n\n",
		"Options:\n",
		"Parameters for training:\n",
		"\t-train <file>\n",
		"\t\tUse text data from <file> to train the model\n",
		"\t-output <file>\n",
		"\t\tUse <file> to save the resulting word vectors / word clusters\n",
		"\t-size <int>\n",
		"\t\tSet size of word vectors; default is 100\n",
		"\t-window <int>\n",
		"\t\tSet max skip length between words; default is 5\n",
		"\t-sample <float>\n",
		"\t\tSet threshold for occurrence of words. Those that appear with higher frequency in the training data\n",
		"\t\twill be randomly down-sampled; default is 1e-3, useful range is (0, 1e-5)\n",
		"\t-hs <int>\n",
		"\t\tUse Hierarchical Softmax; default is 0 (not used)\n",
		"\t-negative <int>\n",
		"\t\tNumber of negative examples; default is 5, common values are 3 - 10 (0 = not used)\n",
		"\t-negative-classes <file>\n",
		"\t\tNegative classes to sample from\n",
		"\t-nce <int>\n",
		"\t\tNumber of negative examples for nce; default is 0, common values are 3 - 10 (0 = not used)\n",
		"\t-threads <int>\n",
		"\t\tUse <int> threads (default 12)\n",
		"\t-iter <int>\n",
		"\t\tRun more training iterations (default 5)\n",
		"\t-min-count <int>\n",
		"\t\tThis will discard words that appear less than <int> times; default is 5\n",
		"\t-alpha <float>\n",
		"\t\tSet the starting learning rate; default is 0.025 for skip-gram and 0.05 for CBOW\n",
		"\t-classes <int>\n",
		"\t\tOutput word classes rather than word vectors; default number of classes is 0 (vectors are written)\n",
		"\t-debug <int>\n",
		"\t\tSet the debug mode (default = 2 = more info during training)\n",
		"\t-binary <int>\n",
		"\t\tSave the resulting vectors in binary moded; default is 0 (off)\n",
		"\t-save-vocab <file>\n",
		"\t\tThe vocabulary will be saved to <file>\n",
		"\t-read-vocab <file>\n",
		"\t\tThe vocabulary will be read from <file>, not constructed from the training data\n",
		"\t-read-net <file>\n",
		"\t\tThe net parameters will be read from <file>, not initialized randomly\n",
		"\t-save-net <file>\n",
		"\t\tThe net parameters will be saved to <file>\n",
		"\t-magic-stop-file <file>\n",
		"\t\tIf the magic file <file> exists training will stop after the current cycle.\n",
		"\t-show-cc <int>\n",
		"\t\tShow words with their collocators starting from word rank <int>. Depends on -read-vocab and -read-net.\n",
		"\t-type <int>\n",
		"\t\tType of embeddings (0 for cbow, 1 for skipngram, 2 for cwindow, 3 for structured skipngram, 4 for senna type, 5 for store positional bigramms)\n",
		"\t-cap <int>\n",
		"\t\tlimit the parameter values to the range [-50, 50]; default is 0 (off)\n",
		"\nExamples:\n",
		"./dereko2vec -train data.txt -output vec.txt -size 200 -window 5 -sample 1e-4 -negative 5 -hs 0 -binary 0 -type 1 -iter 3\n\n",
	};
	const unsigned long chunk_count = sizeof(help_text) / sizeof(help_text[0]);
	unsigned long idx;

	for (idx = 0; idx < chunk_count; idx++)
		printf("%s", help_text[idx]);
}
				2028
				2029	int main(int argc, char **argv) {
				2030	int i;
				2031	setlocale(LC_ALL, "");
				2032	if (argc == 1) {
				2033	print_help();
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2034	return 0;
				2035	}
				2036	output_file[0] = 0;
				2037	save_vocab_file[0] = 0;
				2038	read_vocab_file[0] = 0;
				2039	save_net_file[0] = 0;
				2040	read_net_file[0] = 0;
				2041	negative_classes_file[0] = 0;
Marc Kupietz	c7f773b	2017-12-02 12:04:03 +0100	[diff] [blame]	2042	if ((i = ArgPos((char *) "-h", argc, argv)) > 0) {
				2043	print_help();
				2044	return(0);
				2045	}
				2046	if ((i = ArgPos((char *) "-help", argc, argv)) > 0) {
				2047	print_help();
				2048	return(0);
				2049	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2050	if ((i = ArgPos((char *) "-size", argc, argv)) > 0)
				2051	layer1_size = atoi(argv[i + 1]);
				2052	if ((i = ArgPos((char *) "-train", argc, argv)) > 0)
				2053	strcpy(train_file, argv[i + 1]);
				2054	if ((i = ArgPos((char *) "-save-vocab", argc, argv)) > 0)
				2055	strcpy(save_vocab_file, argv[i + 1]);
				2056	if ((i = ArgPos((char *) "-read-vocab", argc, argv)) > 0)
				2057	strcpy(read_vocab_file, argv[i + 1]);
				2058	if ((i = ArgPos((char *) "-save-net", argc, argv)) > 0)
				2059	strcpy(save_net_file, argv[i + 1]);
				2060	if ((i = ArgPos((char *) "-read-net", argc, argv)) > 0)
				2061	strcpy(read_net_file, argv[i + 1]);
Marc Kupietz	e423f73	2017-12-22 17:57:03 +0100	[diff] [blame]	2062	if ((i = ArgPos((char *) "-magic-stop-file", argc, argv)) > 0) {
				2063	strcpy(magic_stop_file, argv[i + 1]);
				2064	if (access(magic_stop_file, F_OK ) != -1) {
				2065	printf("ERROR: magic stop file %s must not exist at start.\n", magic_stop_file);
				2066	exit(1);
				2067	}
				2068	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2069	if ((i = ArgPos((char *) "-debug", argc, argv)) > 0)
				2070	debug_mode = atoi(argv[i + 1]);
				2071	if ((i = ArgPos((char *) "-binary", argc, argv)) > 0)
				2072	binary = atoi(argv[i + 1]);
Marc Kupietz	6b1f2ba	2016-03-17 21:17:42 +0100	[diff] [blame]	2073	if ((i = ArgPos((char *) "-show-cc", argc, argv)) > 0)
				2074	cc = atoi(argv[i + 1]);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2075	if ((i = ArgPos((char *) "-type", argc, argv)) > 0)
				2076	type = atoi(argv[i + 1]);
				2077	if ((i = ArgPos((char *) "-output", argc, argv)) > 0)
				2078	strcpy(output_file, argv[i + 1]);
				2079	if ((i = ArgPos((char *) "-window", argc, argv)) > 0)
				2080	window = atoi(argv[i + 1]);
				2081	if ((i = ArgPos((char *) "-sample", argc, argv)) > 0)
				2082	sample = atof(argv[i + 1]);
				2083	if ((i = ArgPos((char *) "-hs", argc, argv)) > 0)
				2084	hs = atoi(argv[i + 1]);
				2085	if ((i = ArgPos((char *) "-negative", argc, argv)) > 0)
				2086	negative = atoi(argv[i + 1]);
				2087	if ((i = ArgPos((char *) "-negative-classes", argc, argv)) > 0)
				2088	strcpy(negative_classes_file, argv[i + 1]);
				2089	if ((i = ArgPos((char *) "-nce", argc, argv)) > 0)
				2090	nce = atoi(argv[i + 1]);
				2091	if ((i = ArgPos((char *) "-threads", argc, argv)) > 0)
				2092	num_threads = atoi(argv[i + 1]);
				2093	if ((i = ArgPos((char *) "-iter", argc, argv)) > 0)
				2094	iter = atoi(argv[i + 1]);
				2095	if ((i = ArgPos((char *) "-min-count", argc, argv)) > 0)
				2096	min_count = atoi(argv[i + 1]);
				2097	if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
				2098	classes = atoi(argv[i + 1]);
Marc Kupietz	879333c	2023-12-20 11:41:09 +0100	[diff] [blame]	2099	if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0)
Marc Kupietz	178a3c9	2023-12-22 15:12:27 +0100	[diff] [blame]	2100	metadata_categories = atoi(argv[i + 1]);
				2101	if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0) {
				2102	metadata_categories = atoi(argv[i + 1]);
				2103	if (metadata_categories > MAX_METADATA_CATEGORIES) {
				2104	printf("ERROR: metadata categories must be <= %d\n", MAX_METADATA_CATEGORIES);
				2105	exit(1);
				2106	}
				2107	for (int j = 0; j <= metadata_categories; j++) {
				2108	}
				2109	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2110	if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
				2111	cap = atoi(argv[i + 1]);
				2112	if (type == 0 \|\| type == 2 \|\| type == 4)
				2113	alpha = 0.05;
Marc Kupietz	613edbf	2018-01-11 21:38:03 +0100	[diff] [blame]	2114	if (type==5) {
				2115	sample = 0;
				2116	cdb = open_collocatordb_for_write(output_file);
				2117	}
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2118	if ((i = ArgPos((char *) "-alpha", argc, argv)) > 0)
				2119	alpha = atof(argv[i + 1]);
				2120	vocab = (struct vocab_word *) calloc(vocab_max_size,
				2121	sizeof(struct vocab_word));
				2122	vocab_hash = (int *) calloc(vocab_hash_size, sizeof(int));
				2123	expTable = (real ) malloc((EXP_TABLE_SIZE + 1) sizeof(real));
				2124	for (i = 0; i < EXP_TABLE_SIZE; i++) {
				2125	expTable[i] = exp((i / (real) EXP_TABLE_SIZE * 2 - 1) * MAX_EXP); // Precompute the exp() table
				2126	expTable[i] = expTable[i] / (expTable[i] + 1); // Precompute f(x) = x / (x + 1)
				2127	}
Marc Kupietz	210b9d5	2016-04-02 21:48:13 +0200	[diff] [blame]	2128	SaveArgs(argc, argv);
Marc Kupietz	d6f9c71	2016-03-16 11:50:56 +0100	[diff] [blame]	2129	TrainModel();
				2130	return 0;
				2131	}
				2132