Save metadata in standard dictionary
Change-Id: I4cdd46783094e98e9a9a516a1bcb66faa2d4773d
diff --git a/src/dereko2vec.c b/src/dereko2vec.c
index bff3c4c..3671bd6 100644
--- a/src/dereko2vec.c
+++ b/src/dereko2vec.c
@@ -27,7 +27,9 @@
#define MAX_SENTENCE_LENGTH 1000
#define MAX_CC 100
#define MAX_CODE_LENGTH 40
+#define MAX_METADATA_CATEGORIES 4 // largest value accepted for the -metadata-categories option
+#define METADATA_MARKER ' ' // character written in front of each metadata token
const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
typedef float real; // Precision of float numbers
@@ -147,8 +149,14 @@
continue;
if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
if (ch == '\t' && expected_metadata_categories > 0) {
- a = 0;
- expected_metadata_categories--;
+ word[a] = 0; // terminate the metadata token collected so far
+ a = 0;
+ expected_metadata_categories--;
+ if (debug_mode > 2)
+ printf("Metadata: %s\n", word);
+ memmove(word + 1, word, strlen(word) + 1); // shift token right by one byte incl. NUL; memmove because the regions overlap (strcpy here is undefined behavior)
+ *word = METADATA_MARKER; // NOTE(review): assumes word has room for one extra byte - confirm against the buffer size
+ return;
} else {
if (a > 0) {
if (ch == '\n') {
@@ -2073,7 +2081,12 @@
if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
classes = atoi(argv[i + 1]);
- if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0)
- metadata_categories = atoi(argv[i + 1]);
+ if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0) { // parse the option once, then validate it
+ metadata_categories = atoi(argv[i + 1]);
+ if (metadata_categories > MAX_METADATA_CATEGORIES) { // reject values above the compile-time limit
+ printf("ERROR: metadata categories must be <= %d\n", MAX_METADATA_CATEGORIES);
+ exit(1);
+ }
+ }
if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
cap = atoi(argv[i + 1]);
if (type == 0 || type == 2 || type == 4)
diff --git a/tests/test-type-3-with-metadata.sh b/tests/test-type-3-with-metadata.sh
index 7a5405c..0a11c0d 100755
--- a/tests/test-type-3-with-metadata.sh
+++ b/tests/test-type-3-with-metadata.sh
@@ -37,15 +37,24 @@
-size 200 -binary 1 -window 5 -negative 10 -threads 16 -iter 1 -min-count 2 \
> >(tee -a ${BUILDDIR}/stdout.log) 2> >(tee -a ${BUILDDIR}/stderr.log >&2)
-observed=$(cat ${DESTDIR}/wpd19_10000_year_topic.vocab)
+observed=$(cat ${DESTDIR}/wpd19_10000_year_topic.vocab | grep -v -e '^ ')
expected=$(cat ${DESTDIR}/wpd19_10000.vocab)
assert_eq "$observed" "$expected" "wrong vocab output!"
if [ "$?" == 0 ]; then
- log_success "vocab output is identical with and without metadata"
+ log_success "vocab output is identical with and without metadata, except for the metadata"
else
- log_failure "vocab output should be identical with and without metadata"
+ log_failure "vocab output should be identical with and without metadata (except for the metadata)"
fi
+observed=$(cat ${DESTDIR}/wpd19_10000_year_topic.vocab | grep -e '^ ')
+assert_contain "$observed" " gesundheit-ernaehrung"
+if [ "$?" == 0 ]; then
+ log_success "vocab contains gesundheit-ernaehrung metadata"
+ else
+ log_failure "vocab should contain gesundheit-ernaehrung metadata"
+fi
+
+
observed=$(echo -e "Grund\nEXIT" | ${BUILDDIR}/distance ${DESTDIR}/wpd19_10000_year_topic.vecs)
assert_contain "$observed" "Zusammenhang"