Fixed a bug that prevented any training from occurring when a vocabulary was read from file without recounting word frequencies (-tc 0).
diff --git a/word2vecExt1.c b/word2vecExt1.c
index 7611517..0fb49ff 100644
--- a/word2vecExt1.c
+++ b/word2vecExt1.c
@@ -427,8 +427,19 @@
i++;
}
fclose(fin);
+ // this is just for determining train_words by avgWordLength
+ fin = fopen(train_file, "rb");
+ if (fin == NULL) {
+ printf("ERROR: training data file not found!\n");
+ exit(1);
+ }
+ fseek(fin, 0, SEEK_END);
+ file_size = ftell(fin);
+ fclose(fin);
SortVocab();
-
+ train_words = file_size / avgWordLength;
+ if(debug_mode > 0)
+ printf("Estimated words in train file: %'lld\n", train_words);
if (tc > 0) {
// recalculate counts for the current corpus
// adapted from LearnVocabFromTrainFile()
@@ -441,13 +452,13 @@
// reset vocabulary counts
for (a = 0; a < vocab_size; a++)
vocab[a].cn = 0;
- train_words = 0;
+ long long train_words1 = 0;
while (1) {
ReadWord(word, fin);
if (feof(fin))
break;
- if ((debug_mode > 1) && (train_words % 100000 == 0)) {
- printf("%lldK%c", train_words / 1000, 13);
+ if ((debug_mode > 1) && (train_words1 % 100000 == 0)) {
+ printf("%lldK%c", train_words1 / 1000, 13);
fflush(stdout);
}
i = SearchVocab(word);
@@ -455,27 +466,24 @@
// because it may have been cut off due to minfreq.
if (i >= 0) {
vocab[i].cn++;
- train_words++;
+ train_words1++;
}
}
// we cannot have 0 counts.
for (a = 0; a < vocab_size; a++) {
if(vocab[a].cn == 0) {
vocab[a].cn = 1;
- train_words++;
+ train_words1++;
}
}
if (debug_mode > 0) {
printf("Vocab size: %lld\n", vocab_size);
- printf("Words in current train file: %'lld\n", train_words);
+ printf("Words in current train file: %'lld\n", train_words1);
}
fseek(fin, 0, SEEK_END);
file_size = ftell(fin);
fclose(fin);
}
- train_words = file_size / avgWordLength;
- if(debug_mode > 0)
- printf("Estimated words in train file: %'lld\n", train_words);
}
void InitClassUnigramTable() {