collocatordb: read vocab and calc total in lib
diff --git a/c_testanalysis.c b/c_testanalysis.c
index 5a660cb..bb40f4a 100644
--- a/c_testanalysis.c
+++ b/c_testanalysis.c
@@ -4,34 +4,10 @@
 #include <math.h>
 #include "collocatordb.h"
 
-uint64_t total=0;
-
-vocab_entry vocab[100000];
-
-void read_vocab(char *fname) {
-  char strbuf[2048];
-  long long freq;
-	FILE *fin = fopen(fname, "rb");
-	if (fin == NULL) {
-		printf("Vocabulary file not found\n");
-		exit(1);
-	}
-  uint64_t i = 0;
-  while(!feof(fin)) {
-		fscanf(fin, "%s %lld", strbuf, &freq);
-    vocab[i].word = strdup(strbuf);
-    vocab[i].freq = freq;
-    total += freq;
-    i++;
-  }
-  fclose(fin);
-}
-
 int main() {
 	COLLOCATORS *cdb = open_collocators_for_read("/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/wang2vec/sample");
-  read_vocab("/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/wang2vec/sample.vocab");
   for(int i=100; i < 1000; i++)
-    get_collocators(cdb, i, vocab, total);
-  printf("%s\n", get_collocators_as_json(cdb, 500, vocab, total));
+    get_collocators(cdb, i); // the vocab/total arguments are no longer supplied by the caller
+  printf("%s\n", get_collocators_as_json(cdb, 500)); // the string is strdup()'d by the library and not freed here
 	return 0;
 }
diff --git a/collocatordb.cc b/collocatordb.cc
index 56fc53a..2f06c63 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -2,6 +2,7 @@
 #define EXPORT __attribute__((visibility("visible")))
 #define IMPORT
 #include <assert.h>
+#include <inttypes.h>
 #include <memory>
 #include <iostream>
 #include <algorithm>
@@ -190,13 +191,20 @@
     return DecodeFixed64(base_iterator_->value().data());
   }
 
+  class VocabEntry { // one vocabulary record, as filled in by read_vocab()
+  public:
+    string word;   // token text parsed from the vocabulary file
+    uint64_t freq; // frequency value parsed alongside the word
+  };
+
   class Collocators {
   private:
     WriteOptions merge_option_; // for merge
     char _one[sizeof(uint64_t)];
     Slice _one_slice;
-    vocab_entry *_vocab = NULL;
-
+    vector<VocabEntry> _vocab;
+    uint64_t total;
+    
   protected:
     std::shared_ptr<DB> db_;
 
@@ -208,7 +216,8 @@
 
     std::shared_ptr<DB> OpenDb(const char *dbname);
     std::shared_ptr<DB> OpenDbForRead(const char *dbname);
-
+    void read_vocab(string fname);
+    
   public:
     Collocators(const char *db_name, bool read_only);
     ~Collocators();
@@ -293,7 +302,7 @@
 
     virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
     void dump(uint32_t w1, uint32_t w2, int8_t dist);
-    vector<Collocator> get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total);
+    vector<Collocator> get_collocators(uint32_t w1);
     string collocators2json(vector<Collocator> collocators);
 
     // mapped to a rocksdb Merge operation
@@ -338,16 +347,36 @@
     inc(encodeCollocation(w1, w2, dist));
   }
 
+  void rocksdb::Collocators::read_vocab(string fname) { // load "<word> <freq>" records into _vocab, summing freqs into total
+    char strbuf[2048];
+    uint64_t freq;
+    FILE *fin = fopen(fname.c_str(), "rb");
+    if (fin == NULL) {
+      cout <<  "Vocabulary file " << fname <<" not found\n";
+      exit(1);
+    }
+    total = 0; // the member is declared without an initializer
+    // Loop on fscanf's result: feof() is only set after a failed read, so a feof()-driven
+    // loop appends a stale entry at EOF. %2047s keeps long tokens within strbuf.
+    while (fscanf(fin, "%2047s %" SCNu64, strbuf, &freq) == 2) {
+      _vocab.push_back({strbuf, freq});
+      total += freq;
+    }
+    fclose(fin);
+  }
+
   std::shared_ptr<DB> rocksdb::Collocators::OpenDbForRead(const char *name) {
 		DB* db;
 		Options options;
-    ostringstream dbname;
+    ostringstream dbname, vocabname; // builders for the "<name>.rocksdb" and "<name>.vocab" paths
     dbname << name << ".rocksdb";
 		auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
 		if (!s.ok()) {
 			std::cerr << s.ToString() << std::endl;
 			assert(false);
 		}
+    vocabname << name << ".vocab"; // the vocabulary file shares the database's base name
+    read_vocab(vocabname.str()); // read_vocab() exits the process if the file cannot be opened
 		return std::shared_ptr<DB>(db);
   }
 
@@ -457,8 +486,7 @@
 	bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
 	bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
 
-	std::vector<Collocator> rocksdb::Collocators::get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total) {
-    _vocab = vocab;
+	std::vector<Collocator> rocksdb::Collocators::get_collocators(uint32_t w1) {
 		std::vector<Collocator> collocators;
     uint64_t w2, last_w2 = 0xffffffffffffffff;
     uint64_t sum = 0, total_w1 = 0;
@@ -470,12 +498,12 @@
       if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
       if (w2 != last_w2) {
 				double pmi = log2( total * ((double) sum) /
-													 (AVG_WINDOW_SIZE * ((double)vocab[w1].freq) * ((double)vocab[last_w2].freq) ));
+													 (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq) * ((double)_vocab[last_w2].freq) ));
         //  Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics. In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
-        // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) /  (AVG_WINDOW_SIZE * ((double)vocab[w1].freq/total) * ((double)vocab[last_w2].freq/total)));
-        double md = log2((double)sum * sum /  ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * vocab[w1].freq * vocab[last_w2].freq));
+        // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) /  (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
+        double md = log2((double)sum * sum /  ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * _vocab[w1].freq * _vocab[last_w2].freq));
         collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(((double) sum / AVG_WINDOW_SIZE / total))), /* normalize to [-1,1] */
-							calculateLLR(vocab[w1].freq, total, sum, vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
+							calculateLLR(_vocab[w1].freq, total, sum, _vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
         last_w2 = w2;
         sum = value;
       } else {
@@ -488,9 +516,9 @@
     int i=0;
     for (Collocator c : collocators) {
       if(i++>10) break;
-      std::cout << "w1:" << vocab[w1].word << ", w2:" << vocab[c.w2].word
-                << "\t f(w1):" << vocab[w1].freq
-                << "\t f(w2):" << vocab[c.w2].freq
+      std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
+                << "\t f(w1):" << _vocab[w1].freq
+                << "\t f(w2):" << _vocab[c.w2].freq
                 << "\t f(w1, x):" << total_w1
                 << "\t f(w1, w2):" << c.sum
                 << "\t pmi:" << c.pmi
@@ -552,11 +580,11 @@
 		db->dump(w1, w2, dist);
 	}
 
-	void get_collocators(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
-		db->get_collocators(w1, vocab, total);
+	void get_collocators(COLLOCATORS *db, uint32_t w1) { // forwards to db->get_collocators()
+		db->get_collocators(w1); // the returned vector is discarded
 	}
 
-	const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
-		return strdup(db->collocators2json(db->get_collocators(w1, vocab, total)).c_str());
+	const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) { // JSON text for the collocators of w1
+		return strdup(db->collocators2json(db->get_collocators(w1)).c_str()); // strdup() copy: the caller owns it and must free() it
 	}
 }
diff --git a/collocatordb.h b/collocatordb.h
index b370aae..406ea5d 100644
--- a/collocatordb.h
+++ b/collocatordb.h
@@ -10,12 +10,6 @@
 #define W2(key) (uint64_t)((key >> 24) & 0xffffff)
 #define DIST(key) (int8_t)((uint64_t)((key >> 56) & 0xff))
 
-
-typedef struct {
-  uint64_t freq;
-  char *word;
-}  vocab_entry;
-
 #ifdef __cplusplus
 namespace rocksdb {
     class CollocatorIterator : public Iterator  {
@@ -53,6 +47,6 @@
 extern COLLOCATORS *open_collocators_for_read(char *s);
 extern void inc_collocators(COLLOCATORS *db, uint64_t w1, uint64_t w2, int8_t dist);
 extern void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist);
-extern void get_collocators(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total);
-extern char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total);
+extern void get_collocators(COLLOCATORS *db, uint32_t w1);
+extern char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1);