collocatordb: read the vocabulary file and compute the total frequency inside the library
diff --git a/collocatordb.cc b/collocatordb.cc
index 56fc53a..2f06c63 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -2,6 +2,7 @@
 #define EXPORT __attribute__((visibility("visible")))
 #define IMPORT
 #include <assert.h>
+#include <inttypes.h>
 #include <memory>
 #include <iostream>
 #include <algorithm>
@@ -190,13 +191,20 @@
     return DecodeFixed64(base_iterator_->value().data());
   }
 
+  // One vocabulary record: a word and the frequency value stored with it.
+  struct VocabEntry {
+    string word;    // token text
+    uint64_t freq;  // frequency value paired with word
+  };
+
   class Collocators {
   private:
     WriteOptions merge_option_; // for merge
     char _one[sizeof(uint64_t)];
     Slice _one_slice;
-    vocab_entry *_vocab = NULL;
-
+    vector<VocabEntry> _vocab;
+    uint64_t total;
+    
   protected:
     std::shared_ptr<DB> db_;
 
@@ -208,7 +216,8 @@
 
     std::shared_ptr<DB> OpenDb(const char *dbname);
     std::shared_ptr<DB> OpenDbForRead(const char *dbname);
-
+    void read_vocab(string fname);
+    
   public:
     Collocators(const char *db_name, bool read_only);
     ~Collocators();
@@ -293,7 +302,7 @@
 
     virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
     void dump(uint32_t w1, uint32_t w2, int8_t dist);
-    vector<Collocator> get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total);
+    vector<Collocator> get_collocators(uint32_t w1);
     string collocators2json(vector<Collocator> collocators);
 
     // mapped to a rocksdb Merge operation
@@ -338,16 +347,36 @@
     inc(encodeCollocation(w1, w2, dist));
   }
 
+  void rocksdb::Collocators::read_vocab(string fname) {
+    // Load whitespace-separated "<word> <freq>" records from fname into _vocab and sum freq into total.
+    char strbuf[2048];
+    uint64_t freq;
+    FILE *fin = fopen(fname.c_str(), "rb");
+    if (fin == NULL) {
+      cout <<  "Vocabulary file " << fname <<" not found\n";
+      exit(1);
+    }
+    _vocab.clear();
+    total = 0; // declared without an initializer, so reset it before accumulating
+    while (fscanf(fin, "%2047s %" SCNu64, strbuf, &freq) == 2) { // stop at the first failed read (feof() lags one read); %2047s bounds the word to strbuf
+      _vocab.push_back({strbuf, freq});
+      total += freq;
+    }
+    fclose(fin);
+  }
+
   std::shared_ptr<DB> rocksdb::Collocators::OpenDbForRead(const char *name) {
 		DB* db;
 		Options options;
-    ostringstream dbname;
+    ostringstream dbname, vocabname;
     dbname << name << ".rocksdb";
 		auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
 		if (!s.ok()) {
 			std::cerr << s.ToString() << std::endl;
 			assert(false);
 		}
+    vocabname << name << ".vocab"; // vocabulary file path: same base name as the database, ".vocab" suffix
+    read_vocab(vocabname.str()); // fills _vocab and total; exits the process if the file cannot be opened
 		return std::shared_ptr<DB>(db);
   }
 
@@ -457,8 +486,7 @@
 	bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
 	bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
 
-	std::vector<Collocator> rocksdb::Collocators::get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total) {
-    _vocab = vocab;
+	std::vector<Collocator> rocksdb::Collocators::get_collocators(uint32_t w1) {
 		std::vector<Collocator> collocators;
     uint64_t w2, last_w2 = 0xffffffffffffffff;
     uint64_t sum = 0, total_w1 = 0;
@@ -470,12 +498,12 @@
       if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
       if (w2 != last_w2) {
 				double pmi = log2( total * ((double) sum) /
-													 (AVG_WINDOW_SIZE * ((double)vocab[w1].freq) * ((double)vocab[last_w2].freq) ));
+													 (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq) * ((double)_vocab[last_w2].freq) ));
         //  Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics. In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
-        // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) /  (AVG_WINDOW_SIZE * ((double)vocab[w1].freq/total) * ((double)vocab[last_w2].freq/total)));
-        double md = log2((double)sum * sum /  ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * vocab[w1].freq * vocab[last_w2].freq));
+        // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) /  (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
+        double md = log2((double)sum * sum /  ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * _vocab[w1].freq * _vocab[last_w2].freq));
         collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(((double) sum / AVG_WINDOW_SIZE / total))), /* normalize to [-1,1] */
-							calculateLLR(vocab[w1].freq, total, sum, vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
+							calculateLLR(_vocab[w1].freq, total, sum, _vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
         last_w2 = w2;
         sum = value;
       } else {
@@ -488,9 +516,9 @@
     int i=0;
     for (Collocator c : collocators) {
       if(i++>10) break;
-      std::cout << "w1:" << vocab[w1].word << ", w2:" << vocab[c.w2].word
-                << "\t f(w1):" << vocab[w1].freq
-                << "\t f(w2):" << vocab[c.w2].freq
+      std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
+                << "\t f(w1):" << _vocab[w1].freq
+                << "\t f(w2):" << _vocab[c.w2].freq
                 << "\t f(w1, x):" << total_w1
                 << "\t f(w1, w2):" << c.sum
                 << "\t pmi:" << c.pmi
@@ -552,11 +580,11 @@
 		db->dump(w1, w2, dist);
 	}
 
-	void get_collocators(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
-		db->get_collocators(w1, vocab, total);
+	void get_collocators(COLLOCATORS *db, uint32_t w1) { // wrapper: forwards w1 to db->get_collocators()
+		db->get_collocators(w1); // the call's result is not handed back to the caller
 	}
 
-	const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
-		return strdup(db->collocators2json(db->get_collocators(w1, vocab, total)).c_str());
+	const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) { // returns the collocators of w1 as a JSON C string
+		return strdup(db->collocators2json(db->get_collocators(w1)).c_str()); // strdup() allocates the copy: the caller must free() it
 	}
 }