Save metadata in standard dictionary

Change-Id: I4cdd46783094e98e9a9a516a1bcb66faa2d4773d
diff --git a/src/dereko2vec.c b/src/dereko2vec.c
index bff3c4c..3671bd6 100644
--- a/src/dereko2vec.c
+++ b/src/dereko2vec.c
@@ -27,7 +27,9 @@
 #define MAX_SENTENCE_LENGTH 1000
 #define MAX_CC 100
 #define MAX_CODE_LENGTH 40
+#define MAX_METADATA_CATEGORIES 4 // Upper limit accepted for the -metadata-categories option
 
+#define METADATA_MARKER ' ' // Character prepended to a metadata field before it is returned as a word
 const int vocab_hash_size = 30000000; // Maximum 30 * 0.7 = 21M words in the vocabulary
 
 typedef float real;                    // Precision of float numbers
@@ -147,8 +149,14 @@
 			continue;
 		if ((ch == ' ') || (ch == '\t') || (ch == '\n')) {
 			if (ch == '\t' && expected_metadata_categories > 0) {
-        a = 0;
-        expected_metadata_categories--;
+				word[a] = 0; // terminate the metadata field collected so far
+				a = 0;
+				expected_metadata_categories--;
+				if (debug_mode > 2)
+					printf("Metadata: %s\n", word);
+				memmove(word + 1, word, strlen(word) + 1); // shift right by one incl. NUL; regions overlap, so strcpy() would be undefined
+				*word = METADATA_MARKER; // the leading marker distinguishes metadata entries from ordinary words
+				return;
       } else {
         if (a > 0) {
           if (ch == '\n') {
@@ -2073,7 +2081,13 @@
 	if ((i = ArgPos((char *) "-classes", argc, argv)) > 0)
 		classes = atoi(argv[i + 1]);
-  if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0)
-    metadata_categories = atoi(argv[i + 1]);
+	// Parse -metadata-categories once and reject values above MAX_METADATA_CATEGORIES.
+	if ((i = ArgPos((char *) "-metadata-categories", argc, argv)) > 0) {
+		metadata_categories = atoi(argv[i + 1]);
+		if (metadata_categories > MAX_METADATA_CATEGORIES) {
+			printf("ERROR: metadata categories must be <= %d\n", MAX_METADATA_CATEGORIES);
+			exit(1);
+		}
+	}
 	if ((i = ArgPos((char *) "-cap", argc, argv)) > 0)
 		cap = atoi(argv[i + 1]);
 	if (type == 0 || type == 2 || type == 4)
diff --git a/tests/test-type-3-with-metadata.sh b/tests/test-type-3-with-metadata.sh
index 7a5405c..0a11c0d 100755
--- a/tests/test-type-3-with-metadata.sh
+++ b/tests/test-type-3-with-metadata.sh
@@ -37,15 +37,24 @@
   -size 200 -binary 1 -window 5 -negative 10 -threads 16 -iter 1 -min-count 2 \
   > >(tee -a ${BUILDDIR}/stdout.log) 2> >(tee -a ${BUILDDIR}/stderr.log >&2)
 
-observed=$(cat ${DESTDIR}/wpd19_10000_year_topic.vocab)
+observed=$(cat ${DESTDIR}/wpd19_10000_year_topic.vocab | grep -v -e '^ ')
 expected=$(cat ${DESTDIR}/wpd19_10000.vocab)
 assert_eq "$observed" "$expected" "wrong vocab output!"
 if [ "$?" == 0 ]; then
-    log_success "vocab output is identical with and without metadata"
+    log_success "vocab output is identical with and without metadata, except for the metadata"
   else
-    log_failure "vocab output should be identical with and without metadata"
+    log_failure "vocab output should be identical with and without metadata (except for the metadata)"
 fi
 
+observed=$(cat ${DESTDIR}/wpd19_10000_year_topic.vocab | grep -e '^ ')  # keep only lines starting with a space (the metadata marker)
+assert_contain "$observed" " gesundheit-ernaehrung"
+if [ "$?" == 0 ]; then
+    log_success "vocab contains gesundheit-ernaehrung metadata"
+  else
+    log_failure "vocab should contain gesundheit-ernaehrung metadata"
+fi
+
+
 observed=$(echo -e "Grund\nEXIT" | ${BUILDDIR}/distance ${DESTDIR}/wpd19_10000_year_topic.vecs)
 
 assert_contain "$observed" "Zusammenhang"