collocatordb: add collocation analysis
diff --git a/Makefile b/Makefile
index 895e252..3464afb 100644
--- a/Makefile
+++ b/Makefile
@@ -1,9 +1,10 @@
 PLATFORM_CCFLAGS= -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX  -DOS_LINUX -fno-builtin-memcmp -DROCKSDB_FALLOCATE_PRESENT -DSNAPPY -DGFLAGS=1 -DZLIB -DBZIP2 -DLZ4 -DZSTD -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX -DROCKSDB_BACKTRACE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -march=native  -DROCKSDB_SUPPORT_THREAD_LOCAL
 PLATFORM_CXXFLAGS=-std=c++11  -DROCKSDB_PLATFORM_POSIX -DROCKSDB_LIB_IO_POSIX  -DOS_LINUX -fno-builtin-memcmp -DROCKSDB_FALLOCATE_PRESENT -DSNAPPY -DGFLAGS=1 -DZLIB -DBZIP2 -DLZ4 -DZSTD -DROCKSDB_MALLOC_USABLE_SIZE -DROCKSDB_PTHREAD_ADAPTIVE_MUTEX -DROCKSDB_BACKTRACE -DROCKSDB_RANGESYNC_PRESENT -DROCKSDB_SCHED_GETCPU_PRESENT -march=native  -DROCKSDB_SUPPORT_THREAD_LOCAL
 PLATFORM=OS_LINUX
-PLATFORM_LDFLAGS= -lpthread -lrt -lsnappy -lgflags -lz -lbz2 -llz4 -lzstd
+PLATFORM_LDFLAGS= -lpthread -lrt -lsnappy -lz -lbz2 -llz4 -lzstd
 
-CXXFLAGS = -Wall -Wno-reorder -I/usr/local/include -g -std=c++11 
+CXXFLAGS = -Wall -Wno-reorder -I/usr/local/include -g -std=c++11  -Ofast -march=k8
+CFLAGS = -Wall -I/usr/local/include -g -std=gnu99  -O2 -march=k8
 
 ARFLAGS = ${EXTRA_ARFLAGS} rs
 
@@ -18,7 +19,10 @@
 	$(CXX) $(CXXFLAGS) -L. -L/usr/local/lib $@.cc -o$@ collocatordb.o -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
 
 c_testcdb: c_testcdb.c collocatordb.h collocatordb.o Makefile
-	$(CC) $(CFLAGS) -L. -L/usr/local/lib $@.c -o$@ collocatordb.o -lstdc++ -lm -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
+	$(CC) $(CFLAGS) -L. -L/usr/local/lib $@.c -o$@ collocatordb.o -std=gnu99 -lstdc++ -lm -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CCFLAGS) $(EXEC_LDFLAGS)
+
+c_testanalysis: c_testanalysis.c collocatordb.h collocatordb.o Makefile
+	$(CC) $(CFLAGS) -L. -L/usr/local/lib $@.c -o$@ collocatordb.o -std=gnu99 -lstdc++ -lm -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CCFLAGS) $(EXEC_LDFLAGS)
 
 collocatordb: collocatordb.cc Makefile
 	$(CXX) $(CXXFLAGS) -L/usr/local/lib $@.cc -o$@ -lrocksdb $(PLATFORM_LDFLAGS) $(PLATFORM_CXXFLAGS) $(EXEC_LDFLAGS)
diff --git a/c_testanalysis.c b/c_testanalysis.c
new file mode 100644
index 0000000..6843480
--- /dev/null
+++ b/c_testanalysis.c
@@ -0,0 +1,37 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include "collocatordb.h"
+
+uint64_t total=0;
+
+vocab_entry vocab[100000];
+
+void read_vocab(char *fname) {
+  char strbuf[2048];
+  long long freq;
+	FILE *fin = fopen(fname, "rb");
+	if (fin == NULL) {
+		printf("Vocabulary file not found\n");
+		exit(1);
+	}
+  uint64_t i = 0;
+  while(!feof(fin)) {
+		fscanf(fin, "%s %lld", strbuf, &freq);
+    vocab[i].word = strdup(strbuf);
+    vocab[i].freq = freq;
+    total += freq;
+    i++;
+  }
+  fclose(fin);
+}
+
+int main() {
+	COLLOCATORS *cdb = open_collocators("/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/wang2vec/sampledb");
+  read_vocab("/vol/work/kupietz/Work2/kl/trunk/Analysemethoden/wang2vec/sample.vocab");
+  for(int i=500; i < 600; i++)
+    get_collocators(cdb, i, vocab, total);
+  printf("%s\n", get_collocators_as_json(cdb, 500, vocab, total));
+	return 0;
+}
diff --git a/collocatordb.cc b/collocatordb.cc
index d4745fd..e4520c0 100644
--- a/collocatordb.cc
+++ b/collocatordb.cc
@@ -1,35 +1,63 @@
-//  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
-//  This source code is licensed under both the GPLv2 (found in the
-//  COPYING file in the root directory) and Apache 2.0 License
-//  (found in the LICENSE.Apache file in the root directory).
-//
 #include <typeinfo>
 #define EXPORT __attribute__((visibility("visible")))
 #define IMPORT
 #include <assert.h>
 #include <memory>
 #include <iostream>
+#include <algorithm>
+#include <vector>
 #include <stdint.h>
+#include <string>
+#include <sstream> // for ostringstream
+#include <math.h>
 #include "rocksdb/cache.h"
 #include "rocksdb/comparator.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
+#include "rocksdb/table.h"
 #include <rocksdb/merge_operator.h>
+#include <rocksdb/slice_transform.h>
 #include "rocksdb/utilities/db_ttl.h"
+#include "rocksdb/filter_policy.h"
 #include "merge_operators.h"
 
+#define AVG_WINDOW_SIZE 7
 
 #define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
 #define encodeCollocation(w1, w2, dist) (((uint64_t)dist << 56) | ((uint64_t)w2 << 24) | w1)
 #define W1(key) (uint64_t)(key & 0xffffff)
 #define W2(key) (uint64_t)((key >> 24) & 0xffffff)
 #define DIST(key) (int8_t)((uint64_t)((key >> 56) & 0xff))
+
+typedef struct {
+  uint64_t freq;
+  char *word;
+}  vocab_entry;
+
+// typedef struct Collocator {
+//   uint64_t w2;
+//   uint64_t sum;
+// };
+
 using namespace rocksdb;
+using namespace std;
 
 namespace rocksdb {
+  class Collocator {
+  public:
+    uint64_t w2;
+    uint64_t sum;
+    double pmi;
+    double npmi;
+    double llr;
+    double md;
+    double lfmd;
+    double fpmi;
+  };
+
   size_t num_merge_operator_calls;
   void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
-  
+
   size_t num_partial_merge_calls;
   void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
 
@@ -110,14 +138,14 @@
   private:
     std::shared_ptr<MergeOperator> mergeOperator_;
   };
-    
+
 
   class CollocatorIterator : public Iterator {
   private:
     char prefixc[sizeof(uint64_t)];
     Iterator *base_iterator_;
-        
-        
+
+
   public:
     CollocatorIterator(Iterator* base_iterator)
       : base_iterator_(base_iterator)
@@ -141,11 +169,11 @@
     bool isValid();
     uint64_t intValue();
     uint64_t intKey();
-  
+
   };
 
   //  rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
-    
+
   bool rocksdb::CollocatorIterator::Valid() const {
     return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
   }
@@ -157,7 +185,7 @@
   uint64_t rocksdb::CollocatorIterator::intKey() {
     return DecodeFixed64(base_iterator_->key().data());
   }
-  
+
   uint64_t rocksdb::CollocatorIterator::intValue() {
     return DecodeFixed64(base_iterator_->value().data());
   }
@@ -167,7 +195,8 @@
     WriteOptions merge_option_; // for merge
     char _one[sizeof(uint64_t)];
     Slice _one_slice;
-  
+    vocab_entry *_vocab = NULL;
+
   protected:
     std::shared_ptr<DB> db_;
 
@@ -264,8 +293,9 @@
 
     virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
     void dump(uint32_t w1, uint32_t w2, int8_t dist);
+    vector<Collocator> get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total);
+    string collocators2json(vector<Collocator> collocators);
 
-   
     // mapped to a rocksdb Merge operation
     virtual bool add(const std::string& key, uint64_t value) {
       char encoded[sizeof(uint64_t)];
@@ -286,13 +316,14 @@
 
   rocksdb::Collocators::Collocators(const char *db_name) {
     std::cout << "Test merge-based counters... " << db_name << "\n";
+		//		merge_option_.sync = true;
     db_ = OpenDb(db_name);
     assert(db_);
     uint64_t one = 1;
     EncodeFixed64(_one, one);
     _one_slice = Slice(_one, sizeof(uint64_t));
   }
-  
+
   rocksdb::CollocatorIterator::~CollocatorIterator() {
     std::cout << "destroying itera\n";
   }
@@ -304,14 +335,37 @@
   void rocksdb::Collocators::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
     inc(encodeCollocation(w1, w2, dist));
   }
-    
+
   std::shared_ptr<DB> rocksdb::Collocators::OpenDb(const char *dbname) {
 		std::cout << "Test merge-based counters... " << dbname << "\n";
 		DB* db;
 		Options options;
+
+
+		options.env->SetBackgroundThreads(4);
 		options.create_if_missing = true;
 		options.merge_operator = std::make_shared<CountMergeOperator>();
 		options.max_successive_merges = 0;
+		//		options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+		options.IncreaseParallelism(70);
+    //		options.OptimizeLevelStyleCompaction();
+    options.max_write_buffer_number = 48;
+    options.max_background_jobs = 48;
+    options.allow_concurrent_memtable_write=true;
+		//		options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
+		// options.enable_write_thread_adaptive_yield = 1;
+		// options.allow_concurrent_memtable_write = 1;
+		// options.memtable_factory.reset(new rocksdb::SkipListFactory);
+		// options.write_buffer_size = 1 << 22;
+		// options.allow_mmap_reads = true;
+		// options.allow_mmap_writes = true;
+		options.max_background_compactions = 40;
+    // BlockBasedTableOptions table_options;
+    // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
+		// options.bloom_locality = 1;
+    // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
+    // table_options.block_cache = cache;
+		// options.table_factory.reset(NewBlockBasedTableFactory(table_options));
 		Status s;
 		//  DestroyDB(dbname, Options());
 		s = DB::Open(options, dbname, &db);
@@ -324,7 +378,7 @@
 
   CollocatorIterator* rocksdb::Collocators::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
     ReadOptions options;
-    options.prefix_same_as_start = true;  
+    options.prefix_same_as_start = true;
     char prefixc[sizeof(uint64_t)];
     EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
     Iterator *it = db_->NewIterator(options);
@@ -344,12 +398,127 @@
     std::cout << "ready dumping\n";
   }
 
+  double calculateLLR(uint64_t f_X_, uint64_t uintN, uint64_t f_X_Y_, uint64_t f_Y_) {
+    double f_e_, f_o_;
+    double A=0.0, B=0.0, C=0.0, D=0.0, N=0.0;
+    double LLR=0.0, statVal=0.0, minusDiffCoeff=0.0;
+    double BlogB=0.0, ClogC=0.0;
+
+    N = (double)uintN;
+    A = (double)f_X_Y_;
+    B = (double)f_X_   -A;
+    C = (double)f_Y_   -A;
+    D = (double)N      -A-B-C;;
+
+    if (B > 0.) BlogB = B*log(B);
+    if (C > 0.) ClogC = C*log(C);
+
+    if ((A>0.) && (D>0.) && (N>0.))	{
+      f_e_ = (double)f_X_  /(double)N;
+      f_o_ = (double)f_X_Y_/(double)f_Y_;
+
+      minusDiffCoeff =
+        (	f_X_==0 ?   (double)((-1)*f_X_Y_) :
+          (	f_X_Y_==0 ? (double)((+1)*f_X_) :
+            (f_e_-f_o_)/(f_e_+f_o_)
+            )
+          );
+
+      /* log likelihood ratio */
+      LLR     =  2*( A*log(A)
+                     +BlogB
+                     +ClogC
+                     +D*log(D)
+                     -(A+B)*log(A+B)
+                     -(A+C)*log(A+C)
+                     -(B+D)*log(B+D)
+                     -(C+D)*log(C+D)
+                     +N*log(N)
+                     );
+    }
+    return(minusDiffCoeff > 0 ? 0 : (statVal=LLR));
+  }
+
+	
+	bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
+	bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
+
+	std::vector<Collocator> rocksdb::Collocators::get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total) {
+    _vocab = vocab;
+		std::vector<Collocator> collocators;
+    uint64_t w2, last_w2 = 0xffffffffffffffff;
+    uint64_t sum = 0, total_w1 = 0;
+    for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
+      uint64_t value = it->intValue(),
+        key = it->intKey();
+      w2 = W2(key);
+      total_w1 += value;
+      if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
+      if (w2 != last_w2) {
+				double pmi = log2( total * ((double) sum) /
+													 (AVG_WINDOW_SIZE * ((double)vocab[w1].freq) * ((double)vocab[last_w2].freq) ));
+        //  Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics. In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
+        // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) /  (AVG_WINDOW_SIZE * ((double)vocab[w1].freq/total) * ((double)vocab[last_w2].freq/total)));
+        double md = log2((double)sum * sum /  ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * vocab[w1].freq * vocab[last_w2].freq));
+        collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(((double) sum / AVG_WINDOW_SIZE / total))), /* normalize to [-1,1] */
+							calculateLLR(vocab[w1].freq, total, sum, vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
+        last_w2 = w2;
+        sum = value;
+      } else {
+        sum += value;
+      }
+    }
+
+		sort(collocators.begin(), collocators.end(), sortByLfmd);
+		
+    int i=0;
+    for (Collocator c : collocators) {
+      if(i++>10) break;
+      std::cout << "w1:" << vocab[w1].word << ", w2:" << vocab[c.w2].word
+                << "\t f(w1):" << vocab[w1].freq
+                << "\t f(w2):" << vocab[c.w2].freq
+                << "\t f(w1, x):" << total_w1
+                << "\t f(w1, w2):" << c.sum
+                << "\t pmi:" << c.pmi
+                << "\t npmi:" << c.npmi
+                << "\t llr:" << c.llr
+                << "\t md:" << c.md
+                << "\t lfmd:" << c.lfmd
+                << "\t fpmi:" << c.fpmi
+                << "\t total:" << total
+                << std::endl;
+    }
+		return collocators;
+  }
+
   rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
   rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
   rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
 
 };
 
+string rocksdb::Collocators::collocators2json(vector<Collocator> collocators) {
+  ostringstream s;
+  s << "[";
+  bool first = true;
+  for (Collocator c : collocators) {
+    if(!first)
+      s << ",\n";
+    else
+      first = false;
+    s << "{"
+      "\"word\":\"" << string(_vocab[c.w2].word) << "\"," <<
+      "\"rank\":" << c.w2    << "," <<
+      "\"npmi\":" << c.npmi  << "," <<
+      "\"llr\":" << c.llr   << "," <<
+      "\"lfmd\":" << c.lfmd  << "," <<
+      "\"mi\":" << c.fpmi  <<
+      "}";
+  }
+  s << "]\n";
+  return s.str();
+}
+
 typedef rocksdb::Collocators COLLOCATORS;
 
 extern "C" {
@@ -364,4 +533,12 @@
 	void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
 		db->dump(w1, w2, dist);
 	}
+
+	void get_collocators(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
+		db->get_collocators(w1, vocab, total);
+	}
+
+	const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
+		return strdup(db->collocators2json(db->get_collocators(w1, vocab, total)).c_str());
+	}
 }
diff --git a/collocatordb.h b/collocatordb.h
index 3aea4bd..95848a3 100644
--- a/collocatordb.h
+++ b/collocatordb.h
@@ -10,6 +10,12 @@
 #define W2(key) (uint64_t)((key >> 24) & 0xffffff)
 #define DIST(key) (int8_t)((uint64_t)((key >> 56) & 0xff))
 
+
+typedef struct {
+  uint64_t freq;
+  char *word;
+}  vocab_entry;
+
 #ifdef __cplusplus
 namespace rocksdb {
     class CollocatorIterator : public Iterator  {
@@ -46,3 +52,6 @@
 extern COLLOCATORS *open_collocators(char *s);
 extern void inc_collocators(COLLOCATORS *db, uint64_t w1, uint64_t w2, int8_t dist);
 extern void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist);
+extern void get_collocators(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total);
+extern char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total);
+