Add -metadata-categories <n> option to support metadata input
(the leading tab-separated metadata fields are read and discarded for now)
Change-Id: Iea84524400018eb2de842f6b4bdbed1808bc868b
diff --git a/src/dereko2vec.c b/src/dereko2vec.c
index 8c34179..bff3c4c 100644
--- a/src/dereko2vec.c
+++ b/src/dereko2vec.c
@@ -45,7 +45,7 @@
struct vocab_word *vocab;
int binary = 0, type = 1, debug_mode = 2, window = 5, min_count = 5,
- num_threads = 12, min_reduce = 1;
+ num_threads = 12, min_reduce = 1, metadata_categories = 0, expected_metadata_categories = 0;
int *vocab_hash;
long long *threadPos;
int *threadIters;
@@ -146,13 +146,21 @@
if (ch == 13)
continue;
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
- if (a > 0) {
- if (ch == '\n')
- ungetc(ch, fin);
- break;
- }
+ if (ch == '\t' && expected_metadata_categories > 0) {
+ a = 0;
+ expected_metadata_categories--;
+ } else {
+ if (a > 0) {
+ if (ch == '\n') {
+ expected_metadata_categories = metadata_categories;
+ ungetc(ch, fin);
+ }
+ break;
+ }
+ }
if (ch == '\n') {
strcpy(word, (char *) "</s>");
+ expected_metadata_categories = metadata_categories;
return;
} else
continue;
@@ -369,6 +377,9 @@
}
vocab_size = 0;
AddWordToVocab((char *) "</s>");
+ for (int j=0; j < metadata_categories; j++) {
+ ReadWord(word, fin);
+ }
while (1) {
ReadWord(word, fin);
if (feof(fin))
@@ -2061,6 +2072,8 @@
min_count = atoi(argv[i + 1]);
if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
classes = atoi(argv[i + 1]);
+ if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0)
+ metadata_categories = atoi(argv[i + 1]);
if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
cap = atoi(argv[i + 1]);
if (type == 0 || type == 2 || type == 4)