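Train with metadata

ReadWordIndex() takes a new is_metadata out-parameter and sets it to 1
when the word just read starts with METADATA_MARKER, and to 0 otherwise.
TrainModelThread() uses the flag while reading a sentence: the vocabulary
index of each metadata token is stored in a local metadata[] array
(the write position wraps after metadata_categories entries) and the rest
of the read-loop iteration is skipped for that token.

The existing "Metadata: %s" debug print now requires debug_mode > 3
instead of > 2; a new one in the training loop is shown at debug_mode > 1.

Change-Id: I918f64709c0d884e9a31085cb42bbd8820241786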
diff --git a/src/dereko2vec.c b/src/dereko2vec.c
index 3671bd6..9578f1b 100644
--- a/src/dereko2vec.c
+++ b/src/dereko2vec.c
@@ -152,7 +152,7 @@
word[a] = 0;
a = 0;
expected_metadata_categories--;
- if (debug_mode > 2)
+ if (debug_mode > 3)
printf("Metadata: %s\n", word);
strcpy(word + 1, word);
*word = METADATA_MARKER;
@@ -204,11 +204,16 @@
}
// Reads a word and returns its index in the vocabulary
-int ReadWordIndex(FILE *fin) {
+int ReadWordIndex(FILE *fin, int *is_metadata) {
char word[MAX_STRING];
ReadWord(word, fin);
if (feof(fin))
return -1;
+ if (word[0] == METADATA_MARKER) {
+ *is_metadata = 1;
+ } else {
+ *is_metadata = 0;
+ }
return SearchVocab(word);
}
@@ -773,7 +778,7 @@
void *TrainModelThread(void *id) {
long long a, b, d, cw, word, last_word, sentence_length = 0,
sentence_position = 0;
- long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1];
+ long long word_count = 0, last_word_count = 0, sen[MAX_SENTENCE_LENGTH + 1], metadata[MAX_METADATA_CATEGORIES];
long long l1, l2, c, target, label, local_iter = iter;
unsigned long long next_random = (long long) id;
real f, g;
@@ -797,7 +802,9 @@
long long start_pos = file_size / (long long) num_threads * (long long) id;
long long end_pos = file_size / (long long) num_threads * (long long) (id + 1) -1;
long long current_pos = start_pos;
- long long last_pos = start_pos;;
+ long long last_pos = start_pos;
+ int is_metadata = 0;
+ int metadata_index = 0;
fseek(fi, start_pos, SEEK_SET);
while (1) {
if ((current_pos - last_pos > 100000)) {
@@ -811,7 +818,16 @@
}
if (sentence_length == 0) {
while (1) {
- word = ReadWordIndex(fi);
+ word = ReadWordIndex(fi, &is_metadata);
+ if (is_metadata) {
+ if (debug_mode > 1)
+ printf("Metadata: %s\n", vocab[word].word);
+ metadata[metadata_index++] = word;
+ if (metadata_index >= metadata_categories) {
+ metadata_index = 0;
+ }
+ continue;
+ }
if (feof(fi))
break;
if (word == -1)