Clean up code formatting using clang-format style

Change-Id: Ib8c2960f276107f7fe5dbf1699ff5b20f1315166
diff --git a/src/collocatordb.cc b/src/collocatordb.cc
index 352efd3..41d0ad0 100644
--- a/src/collocatordb.cc
+++ b/src/collocatordb.cc
@@ -1,29 +1,29 @@
 #define EXPORT __attribute__((visibility("visible")))
 #define IMPORT
 
-#include <cassert>
-#include <memory>
-#include <iostream>
-#include <algorithm>
-#include <vector>
-#include <cstdint>
-#include <string>
-#include <sstream> // for ostringstream
-#include <cmath>
-#include <rocksdb/cache.h>
+#include "config.h"
+#include "export.h"
+#include "merge_operators.h"
 #include "rocksdb/db.h"
 #include "rocksdb/env.h"
 #include "rocksdb/table.h"
+#include <algorithm>
+#include <cassert>
+#include <cmath>
+#include <cstdint>
+#include <iostream>
+#include <memory>
 #include <rocksdb/merge_operator.h>
 #include <rocksdb/slice_transform.h>
-#include "merge_operators.h"
-#include "export.h"
-#include "config.h"
+#include <sstream> // for ostringstream
+#include <string>
+#include <vector>
 
 #define WINDOW_SIZE 5
 #define FREQUENCY_THRESHOLD 5
 #define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
-#define encodeCollocation(w1, w2, dist) (((uint64_t)dist << 56) | ((uint64_t)w2 << 24) | w1)
+#define encodeCollocation(w1, w2, dist)                                        \
+  (((uint64_t)dist << 56) | ((uint64_t)w2 << 24) | w1)
 #define W1(key) (uint64_t)(key & 0xffffff)
 #define W2(key) (uint64_t)((key >> 24) & 0xffffff)
 #define DIST(key) (int8_t)((uint64_t)((key >> 56) & 0xff))
@@ -42,761 +42,760 @@
 using namespace std;
 
 namespace rocksdb {
-  class Collocator {
-  public:
-                                            uint32_t w2;
-                                            uint64_t f2;
-                                            uint64_t raw;
-                                            double pmi;
-                                            double npmi;
-                                            double llr;
-                                            double lfmd;
-                                            double md;
-                                            uint64_t left_raw;
-                                            uint64_t right_raw;
-                                            double left_pmi;
-                                            double right_pmi;
-                                            double dice;
-                                            double logdice;
-                                            double ldaf;
-                                            int window;
-                                            int af_window;
-  };
-
-  size_t num_merge_operator_calls;
-
-  void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
-
-  size_t num_partial_merge_calls;
-
-  void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
-
-
-  inline void EncodeFixed64(char *buf, uint64_t value) {
-    if (!IS_BIG_ENDIAN) {
-      memcpy(buf, &value, sizeof(value));
-    } else {
-      buf[0] = value & 0xff;
-      buf[1] = (value >> 8) & 0xff;
-      buf[2] = (value >> 16) & 0xff;
-      buf[3] = (value >> 24) & 0xff;
-      buf[4] = (value >> 32) & 0xff;
-      buf[5] = (value >> 40) & 0xff;
-      buf[6] = (value >> 48) & 0xff;
-      buf[7] = (value >> 56) & 0xff;
-    }
-  }
-
-  inline uint32_t DecodeFixed32(const char *ptr) {
-    if (!IS_BIG_ENDIAN) {
-      // Load the raw bytes
-      uint32_t result;
-      memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
-      return result;
-    } else {
-      return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
-              | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
-              | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
-              | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
-    }
-  }
-
-  inline uint64_t DecodeFixed64(const char *ptr) {
-    if (!IS_BIG_ENDIAN) {
-      // Load the raw bytes
-      uint64_t result;
-      memcpy(&result, ptr, sizeof(result));  // gcc optimizes this to a plain load
-      return result;
-    } else {
-      uint64_t lo = DecodeFixed32(ptr);
-      uint64_t hi = DecodeFixed32(ptr + 4);
-      return (hi << 32) | lo;
-    }
-  }
-
-  static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
-    double
-        r1 = f1 * window_size,
-        c1 = f2,
-        e = r1 * c1 / total,
-        o = f12;
-    if (f12 < FREQUENCY_THRESHOLD)
-      return -1.0;
-    else
-      return log2(o / e);
-  }
-
-  // Bouma, Gerlof (2009): <a href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
-  // Normalized (pointwise) mutual information in collocation extraction</a>. In Proceedings of GSCL. 
-  static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
-    double
-        r1 = f1 * window_size,
-        c1 = f2,
-        e = r1 * c1 / total,
-        o = f12;
-    if (f12 < FREQUENCY_THRESHOLD)
-      return -1.0;
-    else
-      return log2(o / e) / (-log2(o / total / window_size));
-  }
-
-  // Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
-  // In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
-  // double md = log2(pow((double)max * window_size / total, 2) /  (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
-  static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
-    double
-        r1 = f1 * window_size,
-        c1 = f2,
-        e = r1 * c1 / total,
-        o = f12;
-    return log2(o * o / e);
-  }
-
-  static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
-    double
-        r1 = f1 * window_size,
-        c1 = f2,
-        e = r1 * c1 / total,
-        o = f12;
-    if (f12 == 0)
-      return 0;
-    else
-      return log2(o * o * o / e);
-  }
-
-  // Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714. 
-  // Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
-  static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
-    double
-        r1 = (double) w1 * window_size,
-        r2 = (double) n - r1,
-        c1 = w2,
-        c2 = n - c1,
-        o11 = w12, o12 = r1 - o11,
-        o21 = c1 - w12, o22 = r2 - o21,
-        e11 = r1 * c1 / n, e12 = r1 * c2 / n,
-        e21 = r2 * c1 / n, e22 = r2 * c2 / n;
-    return (2 * ((o11 > 0 ? o11 * log(o11 / e11) : 0) + (o12 > 0 ? o12 * log(o12 / e12) : 0) +
-                 (o21 > 0 ? o21 * log(o21 / e21) : 0) + (o22 > 0 ? o22 * log(o22 / e22) : 0)));
-  }
-
-
-  static inline double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
-    double
-        r1 = (double) w1 * window_size,
-        c1 = w2;
-    return 2 * w12 / (c1 + r1);
-  }
-
-  // Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
-  static inline double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
-    double
-        r1 = (double) w1 * window_size,
-        c1 = w2;
-    return 14 + log2(2 * w12 / (c1 + r1));
-  }
-
-  class CountMergeOperator : public AssociativeMergeOperator {
-  public:
-                                            CountMergeOperator() {
-                                              mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
-                                            }
-
-                                            virtual bool Merge(const Slice &key,
-                                                               const Slice *existing_value,
-                                                               const Slice &value,
-                                                               std::string *new_value,
-                                                               Logger *logger) const override {
-                                              assert(new_value->empty());
-                                              ++num_merge_operator_calls;
-                                              if (existing_value == nullptr) {
-                                                new_value->assign(value.data(), value.size());
-                                                return true;
-                                              }
-
-                                              return mergeOperator_->PartialMerge(
-                                                  key,
-                                                  *existing_value,
-                                                  value,
-                                                  new_value,
-                                                  logger);
-                                            }
-
-                                            virtual const char *Name() const override {
-                                              return "UInt64AddOperator";
-                                            }
-
-  private:
-                                            std::shared_ptr<MergeOperator> mergeOperator_;
-  };
-
-
-  class CollocatorIterator : public Iterator {
-  private:
-                                            char prefixc[sizeof(uint64_t)];
-                                            Iterator *base_iterator_;
-
-
-  public:
-                                            CollocatorIterator(Iterator *base_iterator)
-                                                : base_iterator_(base_iterator) {}
-
-                                            void setPrefix(char *prefix) {
-                                              memcpy(prefixc, prefix, sizeof(uint64_t));
-                                            }
-
-                                            virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
-
-                                            virtual void SeekToLast() { base_iterator_->SeekToLast(); }
-
-                                            virtual void Seek(const rocksdb::Slice &s) { base_iterator_->Seek(s); }
-
-                                            virtual void
-                                            SeekForPrev(const rocksdb::Slice &s) { base_iterator_->SeekForPrev(s); }
-
-                                            virtual void Prev() { base_iterator_->Prev(); }
-
-                                            virtual void Next() { base_iterator_->Next(); }
-
-                                            virtual Slice key() const;
-
-                                            virtual Slice value() const;
-
-                                            virtual Status status() const;
-
-                                            virtual bool Valid() const;
-
-                                            bool isValid();
-
-                                            uint64_t intValue();
-
-                                            uint64_t intKey();
-
-  };
-
-  //  rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
-
-  bool rocksdb::CollocatorIterator::Valid() const {
-    return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
-  }
-
-  bool rocksdb::CollocatorIterator::isValid() {
-    return base_iterator_->Valid() && key().starts_with(std::string(prefixc, 3));
-    // return key().starts_with(std::string(prefixc,3));
-  }
-
-  uint64_t rocksdb::CollocatorIterator::intKey() {
-    return DecodeFixed64(base_iterator_->key().data());
-  }
-
-  uint64_t rocksdb::CollocatorIterator::intValue() {
-    return DecodeFixed64(base_iterator_->value().data());
-  }
-
-  class VocabEntry {
-  public:
-                                            string word;
-                                            uint64_t freq;
-  };
-
-  class CollocatorDB {
-  private:
-                                            WriteOptions merge_option_; // for merge
-                                            char _one[sizeof(uint64_t)];
-                                            Slice _one_slice;
-                                            vector<VocabEntry> _vocab;
-                                            uint64_t total = 0;
-                                            uint64_t sentences = 0;
-                                            float avg_window_size = 8.0;
-
-  protected:
-                                            std::shared_ptr<DB> db_;
-
-                                            WriteOptions put_option_;
-                                            ReadOptions get_option_;
-                                            WriteOptions delete_option_;
-
-                                            uint64_t default_;
-
-                                            std::shared_ptr<DB> OpenDb(const char *dbname);
-
-                                            std::shared_ptr<DB> OpenDbForRead(const char *dbname);
-
-
-  public:
-                                            void readVocab(string fname);
-                                            string getWord(uint32_t w1);
-
-                                            uint64_t getWordId(const char *word) const;
-
-                                            CollocatorDB(const char *db_name, bool read_only);
-
-                                            // public interface of CollocatorDB.
-                                            // All four functions return false
-                                            // if the underlying level db operation failed.
-
-                                            // mapped to a levedb Put
-                                            bool set(const std::string &key, uint64_t value) {
-                                              // just treat the internal rep of int64 as the string
-                                              char buf[sizeof(value)];
-                                              EncodeFixed64(buf, value);
-                                              Slice slice(buf, sizeof(value));
-                                              auto s = db_->Put(put_option_, key, slice);
-
-                                              if (s.ok()) {
-                                                return true;
-                                              } else {
-                                                std::cerr << s.ToString() << std::endl;
-                                                return false;
-                                              }
-                                            }
-
-                                            DB *getDb() {
-                                              return db_.get();
-                                            }
-
-                                            // mapped to a rocksdb Delete
-                                            bool remove(const std::string &key) {
-                                              auto s = db_->Delete(delete_option_, key);
-
-                                              if (s.ok()) {
-                                                return true;
-                                              } else {
-                                                std::cerr << s.ToString() << std::endl;
-                                                return false;
-                                              }
-                                            }
-
-                                            // mapped to a rocksdb Get
-                                            bool get(const std::string &key, uint64_t *value) {
-                                              std::string str;
-                                              auto s = db_->Get(get_option_, key, &str);
-
-                                              if (s.IsNotFound()) {
-                                                // return default value if not found;
-                                                *value = default_;
-                                                return true;
-                                              } else if (s.ok()) {
-                                                // deserialization
-                                                if (str.size() != sizeof(uint64_t)) {
-                                                  std::cerr << "value corruption\n";
-                                                  return false;
-                                                }
-                                                *value = DecodeFixed64(&str[0]);
-                                                return true;
-                                              } else {
-                                                std::cerr << s.ToString() << std::endl;
-                                                return false;
-                                              }
-                                            }
-
-
-                                            uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
-                                              char encoded_key[sizeof(uint64_t)];
-                                              EncodeFixed64(encoded_key, encodeCollocation(w1, w2, dist));
-                                              uint64_t value = default_;
-                                              get(std::string(encoded_key, 8), &value);
-                                              return value;
-                                            }
-
-                                            virtual void inc(const std::string &key) {
-                                              db_->Merge(merge_option_, key, _one_slice);
-                                            }
-
-                                            void inc(const uint64_t key) {
-                                              char encoded_key[sizeof(uint64_t)];
-                                              EncodeFixed64(encoded_key, key);
-                                              db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
-                                            }
-
-                                            virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
-
-                                            void dump(uint32_t w1, uint32_t w2, int8_t dist);
-
-                                            vector<Collocator> get_collocators(uint32_t w1);
-
-                                            vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
-
-                                            vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
-
-                                            vector<Collocator>
-                                            get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2);
-
-                                            void
-                                            applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
-                                                            const uint64_t sum, const int usedPositions,
-                                                            int true_window_size, rocksdb::Collocator *result);
-
-                                            void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
-
-                                            string collocators2json(uint32_t w1, vector<Collocator> collocators);
-
-                                            // mapped to a rocksdb Merge operation
-                                            virtual bool add(const std::string &key, uint64_t value) {
-                                              char encoded[sizeof(uint64_t)];
-                                              EncodeFixed64(encoded, value);
-                                              Slice slice(encoded, sizeof(uint64_t));
-                                              auto s = db_->Merge(merge_option_, key, slice);
-
-                                              if (s.ok()) {
-                                                return true;
-                                              } else {
-                                                std::cerr << s.ToString() << std::endl;
-                                                return false;
-                                              }
-                                            }
-
-                                            CollocatorIterator *SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
-  };
-
-  rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
-    //		merge_option_.sync = true;
-    if (read_only)
-      db_ = OpenDbForRead(strdup(db_name));
-    else
-      db_ = OpenDb(db_name);
-    assert(db_);
-    uint64_t one = 1;
-    EncodeFixed64(_one, one);
-    _one_slice = Slice(_one, sizeof(uint64_t));
-  }
-
-  void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
-    inc(encodeCollocation(w1, w2, dist));
-  }
-
-  void rocksdb::CollocatorDB::readVocab(string fname) {
-    char strbuf[2048];
-    uint64_t freq;
-    FILE *fin = fopen(fname.c_str(), "rb");
-    if (fin == NULL) {
-      cout << "Vocabulary file " << fname << " not found\n";
-      exit(1);
-    }
-    uint64_t i = 0;
-    while (fscanf(fin, "%s %lu", strbuf, &freq) == 2) {
-      _vocab.push_back({strbuf, freq});
-      total += freq;
-      i++;
-    }
-    fclose(fin);
-
-    char size_fname[256];
-    strcpy(size_fname, fname.c_str());
-    char *pos = strstr(size_fname, ".vocab");
-    if (pos) {
-      *pos = 0;
-      strcat(size_fname, ".size");
-      FILE *fp = fopen(size_fname, "r");
-      if (fp != NULL) {
-        fscanf(fp, "%lu", &sentences);
-        fscanf(fp, "%lu", &total);
-        float sl = (float) total / (float) sentences;
-        float w = WINDOW_SIZE;
-        avg_window_size = ((sl > 2 * w ? (sl - 2 * w) * 2 * w : 0) + (double) w * (3 * w - 1)) / sl;
-        fprintf(stdout,
-                "Size corrections found: corpus size: %lu tokens in %lu sentences, avg. sentence size: %f, avg. window size: %f\n",
-                total, sentences, sl, avg_window_size);
-        fclose(fp);
-      } else {
-       // std::cout << "size file " << size_fname << " not found\n";
-      }
-    } else {
-      std::cout << "cannot determine size file " << size_fname << "\n";
-    }
-  }
-
-  std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
-    DB *db;
-    Options options;
-    options.env->SetBackgroundThreads(4);
-    options.create_if_missing = true;
-    options.merge_operator = std::make_shared<CountMergeOperator>();
-    options.max_successive_merges = 0;
-    //		options.prefix_extractor.reset(NewFixedPrefixTransform(8));
-    options.IncreaseParallelism();
-    options.OptimizeLevelStyleCompaction();
-    options.prefix_extractor.reset(NewFixedPrefixTransform(3));
-    ostringstream dbname, vocabname;
-    dbname << name << ".rocksdb";
-    auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
-    if (!s.ok()) {
-      std::cerr << s.ToString() << std::endl;
-      assert(false);
-    }
-    vocabname << name << ".vocab";
-    readVocab(vocabname.str());
-    return std::shared_ptr<DB>(db);
-  }
-
-  std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
-    DB *db;
-    Options options;
-
-
-    options.env->SetBackgroundThreads(4);
-    options.create_if_missing = true;
-    options.merge_operator = std::make_shared<CountMergeOperator>();
-    options.max_successive_merges = 0;
-    //		options.prefix_extractor.reset(NewFixedPrefixTransform(8));
-    options.IncreaseParallelism();
-    options.OptimizeLevelStyleCompaction();
-    // options.max_write_buffer_number = 48;
-    // options.max_background_jobs = 48;
-    // options.allow_concurrent_memtable_write=true;
-    //		options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
-    // options.enable_write_thread_adaptive_yield = 1;
-    // options.allow_concurrent_memtable_write = 1;
-    // options.memtable_factory.reset(new rocksdb::SkipListFactory);
-    // options.write_buffer_size = 1 << 22;
-    // options.allow_mmap_reads = true;
-    // options.allow_mmap_writes = true;
-    // options.max_background_compactions = 40;
-    // BlockBasedTableOptions table_options;
-    // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
-    // options.bloom_locality = 1;
-    // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
-    // table_options.block_cache = cache;
-    // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
-    Status s;
-    //  DestroyDB(dbname, Options());
-    s = DB::Open(options, dbname, &db);
-    if (!s.ok()) {
-      std::cerr << s.ToString() << std::endl;
-      assert(false);
-    }
-    total = 1000;
-    return std::shared_ptr<DB>(db);
-  }
-
-  CollocatorIterator *rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
-    ReadOptions options;
-    options.prefix_same_as_start = true;
-    char prefixc[sizeof(uint64_t)];
-    EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
-    Iterator *it = db_->NewIterator(options);
-    CollocatorIterator *cit = new CollocatorIterator(it);
-    if (w2 > 0)
-      cit->Seek(std::string(prefixc, 6));
-    else
-      cit->Seek(std::string(prefixc, 3));
-    cit->setPrefix(prefixc);
-    return cit;
-  }
-
-  void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
-    auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
-    for (; it->isValid(); it->Next()) {
-      uint64_t value = it->intValue();
-      uint64_t key = it->intKey();
-      std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value
-                << std::endl;
-    }
-    std::cout << "ready dumping\n";
-  }
-
-  bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
-
-  bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
-
-  bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
-
-  bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) { return lhs.logdice > rhs.logdice; }
-
-  bool sortByLogDiceAF(const Collocator &lhs, const Collocator &rhs) { return lhs.ldaf > rhs.ldaf; }
-
-
-  void rocksdb::CollocatorDB::applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
-                                              const uint64_t sum, const int usedPositions, int true_window_size,
-                                              rocksdb::Collocator *result) {
-    uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
-    double o = sum,
-        r1 = f1 * true_window_size,
-        c1 = f2,
-        e = r1 * c1 / total,
-        pmi = log2(o / e),
-        md = log2(o * o / e),
-        lfmd = log2(o * o * o / e),
-        llr = ca_ll(f1, f2, sum, total, true_window_size);
-    double ld = ca_logdice(f1, f2, sum, total, true_window_size);
-
-    int bestWindow = usedPositions;
-    double bestAF = ld;
-    double currentAF;
-    //          if(f1<75000000)
-    //#pragma omp parallel for reduction(max:bestAF)
-    // #pragma omp target teams distribute parallel for reduction(max:bestAF) map(tofrom:bestAF,currentAF,bestWindow,usedPositions)
-    for (int bitmask = 1; bitmask < (1 << (2 * WINDOW_SIZE)); bitmask++) {
-      if ((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0) continue;
-      uint64_t currentWindowSum = 0;
-      // #pragma omp target teams distribute parallel for reduction(+:currentWindowSum) map(tofrom:bitmask,usedPositions)
-      for (int pos = 0; pos < 2 * WINDOW_SIZE; pos++) {
-        if (((1 << pos) & bitmask & usedPositions) != 0)
-          currentWindowSum += sumWindow[pos];
-      }
-      currentAF = ca_logdice(f1, f2, currentWindowSum, total, __builtin_popcount(bitmask));
-      if (currentAF > bestAF) {
-        bestAF = currentAF;
-        bestWindow = bitmask;
-      }
-    }
-
-    *result = {w2,
-               f2,
-               sum,
-               pmi,
-               pmi / (-log2(o / total / true_window_size)),
-               llr,
-               lfmd,
-               md,
-               sumWindow[WINDOW_SIZE],
-               sumWindow[WINDOW_SIZE - 1],
-               ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1),
-               ca_pmi(f1, f2, sumWindow[WINDOW_SIZE - 1], total, 1),
-               ca_dice(f1, f2, sum, total, true_window_size),
-               ld,
-               bestAF,
-               usedPositions,
-               bestWindow
-    };
-
-  }
-
-  std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2) {
-    std::vector<Collocator> collocators;
-    uint64_t w2, last_w2 = 0xffffffffffffffff;
-    uint64_t maxv = 0, sum = 0;
-    uint64_t *sumWindow = (uint64_t *) malloc(sizeof(uint64_t) * 2 * WINDOW_SIZE);
-    memset(sumWindow, 0, sizeof(uint64_t) * 2 * WINDOW_SIZE);
-    int true_window_size = 1;
-    int usedPositions = 0;
-
-    if (w1 > _vocab.size()) {
-      std::cout << w1 << "> vocabulary size " << _vocab.size() << "\n";
-      w1 -= _vocab.size();
-    }
-#ifdef DEBUG
-    std::cout << "Searching for collocates of " << _vocab[w1].word << "\n";
-#endif
-    // #pragma omp parallel num_threads(40)
-    // #pragma omp single
-    for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, min_w2, 0)); it->isValid(); it->Next()) {
-      uint64_t value = it->intValue(),
-          key = it->intKey();
-      if ((w2 = W2(key)) > max_w2)
-        continue;
-      if (last_w2 == 0xffffffffffffffff) last_w2 = w2;
-      if (w2 != last_w2) {
-        if (sum >= FREQUENCY_THRESHOLD) {
-          collocators.push_back({});
-          rocksdb::Collocator *result = &(collocators[collocators.size() - 1]);
-          // #pragma omp task firstprivate(last_w2, sumWindow, sum, usedPositions, true_window_size) shared(w1, result) if(sum > 1000000)
-          {
-            // uint64_t *nsw = (uint64_t *)malloc(sizeof(uint64_t) * 2 *WINDOW_SIZE);
-            // memcpy(nsw, sumWindow, sizeof(uint64_t) * 2 *WINDOW_SIZE);
-            applyCAMeasures(w1, last_w2, sumWindow, sum, usedPositions, true_window_size, result);
-            // free(nsw);
-          }
-        }
-        memset(sumWindow, 0, 2 * WINDOW_SIZE * sizeof(uint64_t));
-        usedPositions = 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
-        sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
-        last_w2 = w2;
-        maxv = value;
-        sum = value;
-        true_window_size = 1;
-        if (min_w2 == max_w2 && w2 != min_w2)
-          break;
-      } else {
-        sum += value;
-        if (value > maxv)
-          maxv = value;
-        usedPositions |= 1 << (-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0));
-        sumWindow[-DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0)] = value;
-        true_window_size++;
-      }
-    }
-
-    // #pragma omp taskwait
-    sort(collocators.begin(), collocators.end(), sortByLogDiceAF);
-
-#ifdef DEBUG
-    int i=0;
-    for (Collocator c : collocators) {
-      if(i++>10) break;
-      std::cout << "w1:" << _vocab[w1].word << ", w2: *" << _vocab[c.w2].word << "*"
-                << "\t f(w1):" << _vocab[w1].freq
-                << "\t f(w2):" << _vocab[c.w2].freq
-                << "\t f(w1, w2):" << c.raw
-                << "\t pmi:" << c.pmi
-                << "\t npmi:" << c.npmi
-                << "\t llr:" << c.llr
-                << "\t md:" << c.md
-                << "\t lfmd:" << c.lfmd
-                << "\t total:" << total
-                << std::endl;
-    }
-#endif
-
-    return collocators;
-  }
-
-
-  std::vector<Collocator> rocksdb::CollocatorDB::get_collocation_scores(uint32_t w1, uint32_t w2) {
-    return get_collocators(w1, w2, w2);
-  }
-
-  std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
-    return get_collocators(w1, 0, UINT32_MAX);
-  }
-
-  void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
-    std::vector<Collocator> collocators;
-    std::stringstream stream;
-    uint64_t w2, last_w2 = 0xffffffffffffffff;
-    uint64_t maxv = 0, total_w1 = 0;
-    bool first = true;
-    for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
-      uint64_t value = it->intValue(),
-          key = it->intKey();
-      w2 = W2(key);
-      total_w1 += value;
-      if (last_w2 == 0xffffffffffffffff) last_w2 = w2;
-      if (w2 != last_w2) {
-        if (maxv >= min_cooccur) {
-          double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
-          if (first)
-            first = false;
-          else
-            stream << " ";
-          stream << w2 << " " << llr;
-        }
-        last_w2 = w2;
-        maxv = value;
-      } else {
-        if (value > maxv)
-          maxv = value;
-      }
-    }
-    if (first)
-      stream << "1 0.0";
-    stream << "\n";
-    std::cout << stream.str();
-  }
-
-  rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
-
-  rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
-
-  rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
-
+class Collocator { // association scores of one collocate (w2) of a node word
+public:
+  uint32_t w2; // vocabulary id of the collocate
+  uint64_t f2; // frequency of w2 taken from the vocabulary
+  uint64_t raw; // co-occurrence count summed over the window positions
+  double pmi; // pointwise mutual information, log2(o / e)
+  double npmi; // pmi divided by -log2(o / total / window size)
+  double llr; // log-likelihood ratio (ca_ll)
+  double lfmd; // log2(o^3 / e)
+  double md; // log2(o^2 / e)
+  uint64_t left_raw; // count stored for distance -1
+  uint64_t right_raw; // count stored for distance +1
+  double left_pmi; // ca_pmi of left_raw with window size 1
+  double right_pmi; // ca_pmi of right_raw with window size 1
+  double dice; // Dice coefficient (ca_dice)
+  double logdice; // logDice over all used positions (ca_logdice)
+  double ldaf; // best logDice over subsets of the used positions
+  int window; // bitmask of window positions that carried data
+  int af_window; // bitmask of the subset that produced ldaf
 };
 
-string rocksdb::CollocatorDB::getWord(uint32_t w1) {
-  return _vocab[w1].word;
+size_t num_merge_operator_calls; // incremented by CountMergeOperator::Merge
+
+void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
+
+size_t num_partial_merge_calls; // NOTE(review): only reset here; no increment is visible in this part of the file
+
+void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
+
+inline void EncodeFixed64(char *buf, uint64_t value) {
+  // Stores value in buf[0..7] with the least significant byte first; buf
+  // must have room for sizeof(uint64_t) bytes.
+  if (IS_BIG_ENDIAN) {
+    // Host order is reversed relative to the stored layout, so peel the
+    // bytes off one at a time.
+    for (unsigned idx = 0; idx < sizeof(value); ++idx) {
+      buf[idx] = (value >> (8 * idx)) & 0xff;
+    }
+  } else {
+    // Little-endian host: the object representation is already correct.
+    memcpy(buf, &value, sizeof(value));
+  }
 }
 
-uint64_t rocksdb::CollocatorDB::getWordId(const char *word) const {
+inline uint32_t DecodeFixed32(const char *ptr) { // LSB-first 32-bit load
+  if (IS_BIG_ENDIAN) {
+    uint32_t decoded = 0;
+    for (unsigned idx = 0; idx < sizeof(decoded); ++idx) {
+      const auto octet = static_cast<unsigned char>(ptr[idx]);
+      decoded |= static_cast<uint32_t>(octet) << (8 * idx);
+    }
+    return decoded;
+  }
+  uint32_t raw; // little-endian host: copy the stored bytes verbatim
+  memcpy(&raw, ptr, sizeof(raw));
+  return raw;
+}
+
+inline uint64_t DecodeFixed64(const char *ptr) {
+  // Reads eight bytes at ptr as a least-significant-byte-first uint64_t.
+  if (IS_BIG_ENDIAN) {
+    // Combine two 32-bit halves; the low half is stored first.
+    const uint64_t lower = DecodeFixed32(ptr);
+    const uint64_t upper = DecodeFixed32(ptr + 4);
+    return lower | (upper << 32);
+  }
+  uint64_t raw;
+  memcpy(&raw, ptr, sizeof(raw));
+  return raw;
+}
+
+static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12,
+                            uint64_t total, double window_size) {
+  // PMI = log2(observed / expected); sentinel -1.0 below the threshold.
+  if (f12 < FREQUENCY_THRESHOLD)
+    return -1.0;
+  const double expected = f1 * window_size * f2 / total;
+  return log2(f12 / expected);
+}
+
+// Normalised PMI after Bouma, Gerlof (2009): <a
+// href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
+// Normalized (pointwise) mutual information in collocation extraction</a>. In
+// Proceedings of GSCL. Returns -1.0 if f12 is below FREQUENCY_THRESHOLD.
+static double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total,
+                      double window_size) {
+  if (f12 < FREQUENCY_THRESHOLD)
+    return -1.0;
+  const double observed = f12;
+  const double expected = f1 * window_size * f2 / total;
+  return log2(observed / expected) / (-log2(observed / total / window_size));
+}
+
+// Mutual dependency, log2(o^2 / e), with o = f12 and
+// e = f1 * window_size * f2 / total. See Thanopoulos, A., Fakotakis, N.,
+// Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
+// In: International Conference on Language Resources and Evaluation
+// (LREC-2002). (2002) 620–625.
+static double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total,
+                    double window_size) {
+  const double observed = f12;
+  const double row_total = f1 * window_size;
+  const double expected = row_total * f2 / total;
+  // o * o / e groups as (o * o) / e.
+  return log2(observed * observed / expected);
+}
+
+static double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total,
+                      double window_size) {
+  if (f12 == 0) // no joint occurrences: report 0
+    return 0;
+  const double observed = f12, expected = f1 * window_size * f2 / total;
+  return log2(observed * observed * observed / expected);
+}
+
+// Log-likelihood ratio of the 2x2 contingency table. Evert, Stefan (2004):
+// The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD
+// dissertation, IMS, University of Stuttgart. Published in 2005, URN
+// urn:nbn:de:bsz:93-opus-23714. http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
+static double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n,
+                    uint64_t window_size) {
+  const double r1 = (double)w1 * window_size, r2 = (double)n - r1;
+  const double c1 = w2, c2 = n - c1;
+  const double o11 = w12, o12 = r1 - o11, o21 = c1 - w12, o22 = r2 - o21;
+  const auto cell = [](double o, double e) {
+    return o > 0 ? o * log(o / e) : 0; // non-positive cells contribute nothing
+  };
+  return 2 * (cell(o11, r1 * c1 / n) + cell(o12, r1 * c2 / n) +
+              cell(o21, r2 * c1 / n) + cell(o22, r2 * c2 / n));
+}
+
+static double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n,
+                      uint64_t window_size) {
+  const double denominator = w2 + (double)w1 * window_size; // c1 + r1
+  return 2 * w12 / denominator;
+}
+
+// logDice = 14 + log2(Dice). Rychlý, Pavel (2008): <a
+// href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A
+// lexicographer-friendly association score.</a> In Proceedings of Recent
+// Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
+static double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n,
+                         uint64_t window_size) {
+  const double denominator = w2 + (double)w1 * window_size; // c1 + r1
+  return 14 + log2(2 * w12 / denominator);
+}
+
+class CountMergeOperator : public AssociativeMergeOperator {
+public:
+  CountMergeOperator()
+      : mergeOperator_(MergeOperators::CreateUInt64AddOperator()) {}
+
+  // Counts the call, then either adopts `value` (no prior value) or lets the
+  // wrapped operator combine `existing_value` with `value`.
+  bool Merge(const Slice &key, const Slice *existing_value, const Slice &value,
+             std::string *new_value, Logger *logger) const override {
+    assert(new_value->empty());
+    ++num_merge_operator_calls;
+    if (existing_value != nullptr) {
+      return mergeOperator_->PartialMerge(key, *existing_value, value,
+                                          new_value, logger);
+    }
+    new_value->assign(value.data(), value.size());
+    return true;
+  }
+
+  const char *Name() const override { return "UInt64AddOperator"; }
+
+private:
+  // Operator obtained from MergeOperators::CreateUInt64AddOperator().
+  std::shared_ptr<MergeOperator> mergeOperator_;
+};
+
+// Iterator wrapper that remembers the encoded key it was positioned on and
+// reports itself valid only while the current key starts with the first
+// three bytes of that key. It owns the wrapped iterator.
+class CollocatorIterator : public Iterator {
+  char prefixc[sizeof(uint64_t)]{}; // encoded key set via setPrefix()
+  Iterator *base_iterator_;         // owned, released in the destructor
+
+public:
+  explicit CollocatorIterator(Iterator *base_iterator)
+      : base_iterator_(base_iterator) {}
+
+  // The wrapped iterator is heap-allocated for this object and nothing
+  // else deletes it, so release it here instead of leaking it.
+  ~CollocatorIterator() override { delete base_iterator_; }
+
+  void setPrefix(char *prefix) { memcpy(prefixc, prefix, sizeof(uint64_t)); }
+
+  // Positioning calls are forwarded unchanged to the wrapped iterator.
+  void SeekToFirst() override { base_iterator_->SeekToFirst(); }
+  void SeekToLast() override { base_iterator_->SeekToLast(); }
+  void Seek(const rocksdb::Slice &s) override { base_iterator_->Seek(s); }
+  void SeekForPrev(const rocksdb::Slice &s) override {
+    base_iterator_->SeekForPrev(s);
+  }
+  void Prev() override { base_iterator_->Prev(); }
+  void Next() override { base_iterator_->Next(); }
+
+  Slice key() const override;
+  Slice value() const override;
+  Status status() const override;
+
+  bool Valid() const override; // wrapped iterator valid and key has prefix
+  bool isValid();              // same check as Valid()
+
+  uint64_t intValue(); // current value decoded with DecodeFixed64
+  uint64_t intKey();   // current key decoded with DecodeFixed64
+};
+
+//  rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
+
+bool CollocatorIterator::Valid() const { // live iterator still on our prefix
+  return base_iterator_->Valid() && key().starts_with(Slice(prefixc, 3));
+}
+
+bool CollocatorIterator::isValid() {
+  // Non-const twin of Valid(); the qualified call pins this class's check.
+  return CollocatorIterator::Valid();
+}
+
+uint64_t CollocatorIterator::intKey() { // current key as a 64-bit integer
+  return DecodeFixed64(CollocatorIterator::key().data());
+}
+
+uint64_t CollocatorIterator::intValue() { // current value as a 64-bit integer
+  return DecodeFixed64(CollocatorIterator::value().data());
+}
+
+class VocabEntry { // one vocabulary record as stored by readVocab()
+public:
+  string word; // token string read from the vocabulary file
+  uint64_t freq; // frequency read next to the token
+};
+
+class CollocatorDB {
+  WriteOptions merge_option_; // write options used for Merge calls
+  char _one[sizeof(uint64_t)]{}; // encoded constant 1, filled in the constructor
+  Slice _one_slice; // view onto _one, the operand merged by inc()
+  vector<VocabEntry> _vocab; // filled by readVocab()
+  uint64_t total = 0; // corpus size passed as N to the association measures
+  uint64_t sentences = 0; // sentence count read from the .size file
+  float avg_window_size = 8.0;
+
+protected:
+  std::shared_ptr<DB> db_;
+
+  WriteOptions put_option_;
+  ReadOptions get_option_;
+  WriteOptions delete_option_;
+
+  uint64_t default_{}; // value reported by get() for missing keys
+
+  std::shared_ptr<DB> OpenDb(const char *dbname);
+
+  std::shared_ptr<DB> OpenDbForRead(const char *dbname);
+
+public:
+  virtual ~CollocatorDB() = default;
+  void readVocab(const string& fname);
+  string getWord(uint32_t w1);
+
+  uint64_t getWordId(const char *word) const;
+
+  CollocatorDB(const char *db_name, bool read_only);
+
+  // Public key/value interface of CollocatorDB.
+  // set(), remove(), get(key, value) and add() return false
+  // if the underlying rocksdb operation failed.
+
+  // mapped to a rocksdb Put
+  bool set(const std::string &key, uint64_t value) {
+    // store the fixed-width encoding of the integer as the value bytes
+    char buf[sizeof(value)];
+    EncodeFixed64(buf, value);
+    Slice slice(buf, sizeof(value));
+    auto s = db_->Put(put_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  DB *getDb() { return db_.get(); }
+
+  // mapped to a rocksdb Delete
+  bool remove(const std::string &key) {
+    auto s = db_->Delete(delete_option_, key);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  // mapped to a rocksdb Get
+  bool get(const std::string &key, uint64_t *value) {
+    std::string str;
+    auto s = db_->Get(get_option_, key, &str);
+
+    if (s.IsNotFound()) {
+      // return default value if not found;
+      *value = default_;
+      return true;
+    } else if (s.ok()) {
+      // deserialization
+      if (str.size() != sizeof(uint64_t)) {
+        std::cerr << "value corruption\n";
+        return false;
+      }
+      *value = DecodeFixed64(&str[0]);
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
+    char encoded_key[sizeof(uint64_t)];
+    EncodeFixed64(encoded_key, encodeCollocation(w1, w2, dist));
+    uint64_t value = default_;
+    get(std::string(encoded_key, 8), &value); // on failure value keeps default_
+    return value;
+  }
+
+  virtual void inc(const std::string &key) {
+    db_->Merge(merge_option_, key, _one_slice); // returned status is not checked
+  }
+
+  void inc(const uint64_t key) {
+    char encoded_key[sizeof(uint64_t)];
+    EncodeFixed64(encoded_key, key);
+    db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice); // status not checked
+  }
+
+  virtual void inc(uint32_t w1, uint32_t w2, uint8_t dist);
+
+  void dump(uint32_t w1, uint32_t w2, int8_t dist) const;
+
+  vector<Collocator> get_collocators(uint32_t w1);
+
+  vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2); // NOTE(review): no definition visible in this part of the file
+
+  vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
+
+  vector<Collocator> get_collocators(uint32_t w1, uint32_t min_w2,
+                                     uint32_t max_w2);
+
+  void applyCAMeasures(uint32_t w1, uint32_t w2,
+                       uint64_t *sumWindow, uint64_t sum,
+                       int usedPositions, int true_window_size,
+                       Collocator *result) const;
+
+  void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
+
+  string collocators2json(uint32_t w1, const vector<Collocator>& collocators);
+
+  // mapped to a rocksdb Merge operation
+  virtual bool add(const std::string &key, uint64_t value) {
+    char encoded[sizeof(uint64_t)];
+    EncodeFixed64(encoded, value);
+    Slice slice(encoded, sizeof(uint64_t));
+    auto s = db_->Merge(merge_option_, key, slice);
+
+    if (s.ok()) {
+      return true;
+    } else {
+      std::cerr << s.ToString() << std::endl;
+      return false;
+    }
+  }
+
+  CollocatorIterator *SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) const; // returns a new-allocated iterator
+};
+
+CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
+  // Opens "<db_name>.rocksdb" read-only (which also loads the vocabulary) or
+  // opens db_name itself for writing, then caches the encoded constant 1
+  // that inc() merges into a counter.
+  //		merge_option_.sync = true;
+  // OpenDbForRead only reads the name, so it is passed directly; the former
+  // strdup() copy was never freed.
+  db_ = read_only ? OpenDbForRead(db_name) : OpenDb(db_name);
+  assert(db_);
+  EncodeFixed64(_one, 1);
+  _one_slice = Slice(_one, sizeof(uint64_t));
+}
+
+void CollocatorDB::inc(const uint32_t w1, const uint32_t w2,
+                       const uint8_t dist) {
+  this->inc(static_cast<uint64_t>(encodeCollocation(w1, w2, dist)));
+}
+
+void CollocatorDB::readVocab(const string& fname) {
+  // Loads whitespace-separated "<word> <frequency>" pairs from fname into
+  // _vocab, adding each frequency to total; exits if fname cannot be opened.
+  // A "<prefix>.size" file, if present, replaces total and avg_window_size.
+  char strbuf[2048]; // "%2047s" below keeps every token inside this buffer
+  uint64_t freq;
+  FILE *fin = fopen(fname.c_str(), "rb");
+  if (fin == nullptr) {
+    cout << "Vocabulary file " << fname << " not found\n";
+    exit(1);
+  }
+  while (fscanf(fin, "%2047s %lu", strbuf, &freq) == 2) {
+    _vocab.push_back({strbuf, freq});
+    total += freq;
+  }
+  fclose(fin);
+  // Build the size-file name as a std::string: no fixed buffer to overflow.
+  const size_t suffix_pos = fname.find(".vocab");
+  if (suffix_pos == string::npos) {
+    std::cout << "cannot determine size file " << fname << "\n";
+    return;
+  }
+  const string size_fname = fname.substr(0, suffix_pos) + ".size";
+  FILE *fp = fopen(size_fname.c_str(), "r");
+  if (fp == nullptr)
+    return; // the size file is optional
+  uint64_t n_sentences = 0, n_tokens = 0;
+  // Apply the correction only if both numbers parse and the divisor is > 0.
+  if (fscanf(fp, "%lu %lu", &n_sentences, &n_tokens) == 2 && n_sentences > 0) {
+    sentences = n_sentences;
+    total = n_tokens;
+    const float sl = (float)total / (float)sentences;
+    const float w = WINDOW_SIZE;
+    avg_window_size =
+        ((sl > 2 * w ? (sl - 2 * w) * 2 * w : 0) + (double)w * (3 * w - 1)) /
+        sl;
+    fprintf(stdout,
+            "Size corrections found: corpus size: %lu tokens in %lu "
+            "sentences, avg. sentence size: %f, avg. window size: %f\n",
+            total, sentences, sl, avg_window_size);
+  }
+  fclose(fp);
+}
+
+std::shared_ptr<DB> CollocatorDB::OpenDbForRead(const char *name) {
+  // Opens "<name>.rocksdb" read-only and loads "<name>.vocab" via readVocab().
+  DB *db = nullptr; // stays null if the open fails (was left uninitialised)
+  Options options;
+  options.env->SetBackgroundThreads(4);
+  options.create_if_missing = true;
+  options.merge_operator = std::make_shared<CountMergeOperator>();
+  options.max_successive_merges = 0;
+  options.IncreaseParallelism();
+  options.OptimizeLevelStyleCompaction();
+  options.prefix_extractor.reset(NewFixedPrefixTransform(3));
+  ostringstream dbname, vocabname;
+  dbname << name << ".rocksdb";
+  auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
+  if (!s.ok()) {
+    std::cerr << s.ToString() << std::endl;
+    assert(false);
+  }
+  vocabname << name << ".vocab";
+  readVocab(vocabname.str());
+  return std::shared_ptr<DB>(db);
+}
+
+std::shared_ptr<DB> CollocatorDB::OpenDb(const char *dbname) {
+  // Opens the database at dbname for writing, creating it if it is missing.
+  DB *db = nullptr; // stays null if the open fails (was left uninitialised)
+  Options options;
+  options.env->SetBackgroundThreads(4);
+  options.create_if_missing = true;
+  options.merge_operator = std::make_shared<CountMergeOperator>();
+  options.max_successive_merges = 0;
+  //		options.prefix_extractor.reset(NewFixedPrefixTransform(8));
+  options.IncreaseParallelism();
+  options.OptimizeLevelStyleCompaction();
+  // options.max_write_buffer_number = 48;
+  // options.max_background_jobs = 48;
+  // options.allow_concurrent_memtable_write=true;
+  //		options.memtable_factory.reset(NewHashLinkListRepFactory(200000));
+  // options.enable_write_thread_adaptive_yield = 1;
+  // options.allow_concurrent_memtable_write = 1;
+  // options.memtable_factory.reset(new SkipListFactory);
+  // options.write_buffer_size = 1 << 22;
+  // options.allow_mmap_reads = true;
+  // options.allow_mmap_writes = true;
+  // options.max_background_compactions = 40;
+  // BlockBasedTableOptions table_options;
+  // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
+  // options.bloom_locality = 1;
+  // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
+  // table_options.block_cache = cache;
+  // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
+  Status s;
+  //  DestroyDB(dbname, Options());
+  s = DB::Open(options, dbname, &db);
+  if (!s.ok()) {
+    std::cerr << s.ToString() << std::endl;
+    assert(false);
+  }
+  total = 1000; // NOTE(review): fixed value in write mode, no vocabulary is read here; confirm intent
+  return std::shared_ptr<DB>(db);
+}
+
+CollocatorIterator *
+CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) const {
+  // Encode the full key once; only its leading bytes are used for seeking.
+  char encoded[sizeof(uint64_t)];
+  EncodeFixed64(encoded, encodeCollocation(w1, w2, dist));
+
+  ReadOptions read_opts;
+  read_opts.prefix_same_as_start = true;
+  auto *wrapped = new CollocatorIterator(db_->NewIterator(read_opts));
+  // Seek target: 6 leading bytes of the key if w2 > 0, otherwise 3.
+  const size_t seek_len = (w2 > 0) ? 6 : 3;
+  wrapped->Seek(std::string(encoded, seek_len));
+  wrapped->setPrefix(encoded);
+  return wrapped; // new-allocated; the callers in this file wrap it in unique_ptr
+}
+
+void CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) const {
+  std::unique_ptr<CollocatorIterator> cursor(SeekIterator(w1, w2, dist));
+  while (cursor->isValid()) { // one output line per stored entry
+    const uint64_t packed = cursor->intKey();
+    std::cout << "w1:" << W1(packed) << ", w2:" << W2(packed)
+              << ", dist:" << (int32_t)DIST(packed)
+              << " - count:" << cursor->intValue() << std::endl;
+    cursor->Next();
+  }
+  std::cout << "ready dumping\n";
+}
+
+bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { // true if lhs has the larger npmi
+  return lhs.npmi > rhs.npmi;
+}
+
+bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { // true if lhs has the larger lfmd
+  return lhs.lfmd > rhs.lfmd;
+}
+
+bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { // true if lhs has the larger llr
+  return lhs.llr > rhs.llr;
+}
+
+bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) { // true if lhs has the larger logdice
+  return lhs.logdice > rhs.logdice;
+}
+
+bool sortByLogDiceAF(const Collocator &lhs, const Collocator &rhs) { // true if lhs has the larger ldaf; used by get_collocators
+  return lhs.ldaf > rhs.ldaf;
+}
+
+void CollocatorDB::applyCAMeasures(
+    const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
+    const uint64_t sum, const int usedPositions, int true_window_size,
+    Collocator *result) const { // fills *result with all measures for the pair (w1, w2)
+  uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
+  double o = sum, r1 = f1 * true_window_size, c1 = f2, e = r1 * c1 / total,
+         pmi = log2(o / e), md = log2(o * o / e), lfmd = log2(o * o * o / e),
+         llr = ca_ll(f1, f2, sum, total, true_window_size);
+  double ld = ca_logdice(f1, f2, sum, total, true_window_size);
+  // Start from the full set of used positions; the loop below may improve it.
+  int bestWindow = usedPositions;
+  double bestAF = ld;
+  // Try every non-empty subset of the used positions and keep the best logDice.
+  // #pragma omp parallel for reduction(max:bestAF)
+  // #pragma omp target teams distribute parallel for reduction(max:bestAF)
+  // map(tofrom:bestAF,currentAF,bestWindow,usedPositions)
+  for (int bitmask = 1; bitmask < (1 << (2 * WINDOW_SIZE)); bitmask++) {
+    if ((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0)
+      continue; // skip masks that contain a position without data
+    uint64_t currentWindowSum = 0;
+    // #pragma omp target teams distribute parallel for
+    // reduction(+:currentWindowSum) map(tofrom:bitmask,usedPositions)
+    for (int pos = 0; pos < 2 * WINDOW_SIZE; pos++) {
+      if (((1 << pos) & bitmask & usedPositions) != 0)
+        currentWindowSum += sumWindow[pos];
+    }
+    double currentAF = ca_logdice(f1, f2, currentWindowSum, total,
+                                  __builtin_popcount(bitmask)); // window size = number of selected positions
+    if (currentAF > bestAF) {
+      bestAF = currentAF;
+      bestWindow = bitmask;
+    }
+  }
+
+  *result = {w2, // w2
+             f2, // f2
+             sum, // raw
+             pmi, // pmi
+             pmi / (-log2(o / total / true_window_size)), // npmi
+             llr, // llr
+             lfmd, // lfmd
+             md, // md
+             sumWindow[WINDOW_SIZE], // left_raw
+             sumWindow[WINDOW_SIZE - 1], // right_raw
+             ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1), // left_pmi
+             ca_pmi(f1, f2, sumWindow[WINDOW_SIZE - 1], total, 1), // right_pmi
+             ca_dice(f1, f2, sum, total, true_window_size), // dice
+             ld, // logdice
+             bestAF, // ldaf
+             usedPositions, // window
+             bestWindow}; // af_window
+}
+
+std::vector<Collocator>
+CollocatorDB::get_collocators(uint32_t w1, uint32_t min_w2,
+                              uint32_t max_w2) {
+  // Iterates over the stored (w1, w2, dist) counts starting at min_w2,
+  // groups consecutive entries that share w2 and scores each completed group
+  // whose summed count reaches FREQUENCY_THRESHOLD. Entries with
+  // w2 > max_w2 are skipped. The result is sorted by descending ldaf.
+  std::vector<Collocator> collocators;
+  uint64_t w2, last_w2 = 0xffffffffffffffff; // sentinel: no group seen yet
+  uint64_t sum = 0;
+  // Per-position counts live on the stack: the former malloc'ed buffer was
+  // never freed, so every call leaked it.
+  uint64_t sumWindow[2 * WINDOW_SIZE] = {};
+  int true_window_size = 1;
+  int usedPositions = 0;
+
+  if (w1 > _vocab.size()) {
+    std::cout << w1 << "> vocabulary size " << _vocab.size() << "\n";
+    w1 -= _vocab.size();
+  }
+#ifdef DEBUG
+  std::cout << "Searching for collocates of " << _vocab[w1].word << "\n";
+#endif
+
+  // #pragma omp parallel num_threads(40)
+  // #pragma omp single
+  for (auto it =
+           std::unique_ptr<CollocatorIterator>(SeekIterator(w1, min_w2, 0));
+       it->isValid(); it->Next()) {
+    const uint64_t value = it->intValue(), key = it->intKey();
+    if ((w2 = W2(key)) > max_w2)
+      continue;
+    // Slot of this distance in sumWindow and in the usedPositions bitmask.
+    const int slot = -DIST(key) + WINDOW_SIZE - (DIST(key) < 0 ? 1 : 0);
+    if (last_w2 == 0xffffffffffffffff)
+      last_w2 = w2;
+    if (w2 != last_w2) {
+      // A different collocate starts: score the group that just ended.
+      if (sum >= FREQUENCY_THRESHOLD) {
+        collocators.push_back({});
+        // #pragma omp task firstprivate(last_w2, sumWindow, sum, usedPositions,
+        // true_window_size) shared(w1, result) if(sum > 1000000)
+        applyCAMeasures(w1, last_w2, sumWindow, sum, usedPositions,
+                        true_window_size, &collocators.back());
+      }
+      memset(sumWindow, 0, sizeof(sumWindow));
+      usedPositions = 1 << slot;
+      sumWindow[slot] = value;
+      last_w2 = w2;
+      sum = value;
+      true_window_size = 1;
+      if (min_w2 == max_w2 && w2 != min_w2)
+        break;
+    } else {
+      sum += value;
+      usedPositions |= 1 << slot;
+      sumWindow[slot] = value;
+      true_window_size++;
+    }
+  }
+  // NOTE(review): the group still open when the loop ends is never passed to
+  // applyCAMeasures, so the last collocate is not reported. Kept unchanged;
+  // confirm whether that is intended.
+
+  // #pragma omp taskwait
+  sort(collocators.begin(), collocators.end(), sortByLogDiceAF);
+
+#ifdef DEBUG
+  int i = 0;
+  for (const Collocator &c : collocators) {
+    if (i++ > 10)
+      break;
+    std::cout << "w1:" << _vocab[w1].word << ", w2: *" << _vocab[c.w2].word
+              << "*"
+              << "\t f(w1):" << _vocab[w1].freq
+              << "\t f(w2):" << _vocab[c.w2].freq << "\t f(w1, w2):" << c.raw
+              << "\t pmi:" << c.pmi << "\t npmi:" << c.npmi
+              << "\t llr:" << c.llr << "\t md:" << c.md << "\t lfmd:" << c.lfmd
+              << "\t total:" << total << std::endl;
+  }
+#endif
+
+  return collocators;
+}
+
+std::vector<Collocator>
+CollocatorDB::get_collocation_scores(uint32_t w1, uint32_t w2) {
+  return get_collocators(w1, w2, w2); // w2 serves as both min_w2 and max_w2
+}
+
+std::vector<Collocator> CollocatorDB::get_collocators(uint32_t w1) {
+  return get_collocators(w1, 0, UINT32_MAX); // min_w2 = 0, max_w2 = UINT32_MAX
+}
+
+void CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
+  // Prints one line of space-separated "<w2> <llr>" pairs for collocates of
+  // w1 whose largest single count is >= min_cooccur, or "1 0.0" if none.
+  std::stringstream stream;
+  uint64_t w2, last_w2 = 0xffffffffffffffff; // sentinel: no group seen yet
+  uint64_t maxv = 0;
+  bool first = true;
+  for (auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0));
+       it->isValid(); it->Next()) {
+    const uint64_t value = it->intValue(), key = it->intKey();
+    w2 = W2(key);
+    if (last_w2 == 0xffffffffffffffff)
+      last_w2 = w2;
+    if (w2 != last_w2) {
+      if (maxv >= min_cooccur) {
+        const double llr =
+            ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
+        // The score belongs to the group that just ended, so label it with
+        // last_w2; the id of the following collocate was printed before.
+        stream << (first ? "" : " ") << last_w2 << " " << llr;
+        first = false;
+      }
+      last_w2 = w2;
+      maxv = value;
+    } else if (value > maxv) {
+      maxv = value;
+    }
+  }
+  // NOTE(review): the group still open when the loop ends is never printed;
+  // kept unchanged, confirm whether that is intended.
+  if (first)
+    stream << "1 0.0";
+  stream << "\n";
+  std::cout << stream.str();
+}
+
+Slice CollocatorIterator::key() const { // forwards to the wrapped iterator
+  return base_iterator_->key();
+}
+
+Slice CollocatorIterator::value() const { // forwards to the wrapped iterator
+  return base_iterator_->value();
+}
+
+Status CollocatorIterator::status() const { // forwards to the wrapped iterator
+  return base_iterator_->status();
+}
+
+}; // namespace rocksdb
+
+string CollocatorDB::getWord(uint32_t w1) { return _vocab[w1].word; } // NOTE(review): w1 is not range-checked against _vocab.size()
+
+uint64_t CollocatorDB::getWordId(const char *word) const {
   for (uint64_t i = 0; i < _vocab.size(); i++) {
     if (strcmp(_vocab[i].word.c_str(), word) == 0)
       return i;
@@ -804,16 +803,17 @@
   return 0;
 }
 
-string rocksdb::CollocatorDB::collocators2json(uint32_t w1, vector<Collocator> collocators) {
+string CollocatorDB::collocators2json(uint32_t w1,
+                                               const vector<Collocator>& collocators) {
   ostringstream s;
   int i = 0;
-  s << " { \"f1\": " << _vocab[w1].freq << "," <<
-    "\"w1\":\"" << string(_vocab[w1].word) << "\", " <<
-    "\"N\": " << total << ", " <<
-    "\"collocates\": [";
+  s << " { \"f1\": " << _vocab[w1].freq << "," << R"("w1":")"
+    << string(_vocab[w1].word) << "\", " << "\"N\": " << total << ", "
+    << "\"collocates\": [";
   bool first = true;
   for (Collocator c : collocators) {
-    if (strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0) continue;
+    if (strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0)
+      continue;
     if (i++ > 200)
       break;
     if (!first)
@@ -821,98 +821,95 @@
     else
       first = false;
     s << "{"
-         "\"word\":\"" << (string(_vocab[c.w2].word).compare("<num>") == 0 ? string("###") : string(_vocab[c.w2].word))
-      << "\"," <<
-      "\"f2\":" << c.f2 << "," <<
-      "\"f\":" << c.raw << "," <<
-      "\"npmi\":" << c.npmi << "," <<
-      "\"pmi\":" << c.pmi << "," <<
-      "\"llr\":" << c.llr << "," <<
-      "\"lfmd\":" << c.lfmd << "," <<
-      "\"md\":" << c.md << "," <<
-      "\"dice\":" << c.dice << "," <<
-      "\"ld\":" << c.logdice << "," <<
-      "\"ln_count\":" << c.left_raw << "," <<
-      "\"rn_count\":" << c.right_raw << "," <<
-      "\"ln_pmi\":" << c.left_pmi << "," <<
-      "\"rn_pmi\":" << c.right_pmi << "," <<
-      "\"ldaf\":" << c.ldaf << "," <<
-      "\"win\":" << c.window << "," <<
-      "\"afwin\":" << c.af_window <<
-      "}";
+         "\"word\":\""
+      << (string(_vocab[c.w2].word) == "<num>"
+              ? string("###")
+              : string(_vocab[c.w2].word))
+      << "\"," << "\"f2\":" << c.f2 << "," << "\"f\":" << c.raw << ","
+      << "\"npmi\":" << c.npmi << "," << "\"pmi\":" << c.pmi << ","
+      << "\"llr\":" << c.llr << "," << "\"lfmd\":" << c.lfmd << ","
+      << "\"md\":" << c.md << "," << "\"dice\":" << c.dice << ","
+      << "\"ld\":" << c.logdice << "," << "\"ln_count\":" << c.left_raw << ","
+      << "\"rn_count\":" << c.right_raw << "," << "\"ln_pmi\":" << c.left_pmi
+      << "," << "\"rn_pmi\":" << c.right_pmi << "," << "\"ldaf\":" << c.ldaf
+      << "," << "\"win\":" << c.window << "," << "\"afwin\":" << c.af_window
+      << "}";
   }
   s << "]}\n";
   //  std::cout << s.str();
   return s.str();
 }
 
-typedef rocksdb::CollocatorDB COLLOCATORS;
+typedef CollocatorDB COLLOCATORS;
 
 extern "C" {
 #ifdef __clang__
 #pragma clang diagnostic push
 #pragma ide diagnostic ignored "OCUnusedGlobalDeclarationInspection"
 #endif
-  DLL_EXPORT COLLOCATORS *open_collocatordb_for_write(char *dbname) {
-		return new rocksdb::CollocatorDB(dbname, false);
-	}
+DLL_EXPORT COLLOCATORS *open_collocatordb_for_write(char *dbname) {
+  return new CollocatorDB(dbname, false);
+}
 
-	DLL_EXPORT COLLOCATORS *open_collocatordb(char *dbname) {
-		return new rocksdb::CollocatorDB(dbname, true);
-	}
-	
-	DLL_EXPORT void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
-		db->inc(w1, w2, dist);
-	}
+DLL_EXPORT COLLOCATORS *open_collocatordb(char *dbname) {
+  return new CollocatorDB(dbname, true);
+}
 
-  DLL_EXPORT void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
-    db->dump(w1, w2, dist);
-  }
+DLL_EXPORT void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2,
+                               int8_t dist) {
+  db->inc(w1, w2, dist);
+}
 
-  DLL_EXPORT COLLOCATORS *get_collocators(COLLOCATORS *db, uint32_t w1) {
-    std::vector<Collocator> c = db->get_collocators(w1);
-    if (c.empty())
-      return NULL;
-    uint64_t size = c.size() + sizeof c[0];
-    COLLOCATORS *p = (COLLOCATORS *) malloc(size);
-    memcpy(p, c.data(), size);
-    return p;
-  }
+DLL_EXPORT void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2,
+                                 int8_t dist) {
+  db->dump(w1, w2, dist);
+}
 
-  DLL_EXPORT COLLOCATORS *get_collocation_scores(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
-    std::vector<Collocator> c = db->get_collocation_scores(w1, w2);
-    if (c.empty())
-      return NULL;
-    uint64_t size = c.size() + sizeof c[0];
-    COLLOCATORS *p = (COLLOCATORS *) malloc(size);
-    memcpy(p, c.data(), size);
-    return p;
-  }
+DLL_EXPORT COLLOCATORS *get_collocators(COLLOCATORS *db, uint32_t w1) {
+  std::vector<Collocator> c = db->get_collocators(w1);
+  if (c.empty())
+    return nullptr;
+  uint64_t size = c.size() + sizeof c[0];
+  auto *p = (COLLOCATORS *)malloc(size);
+  memcpy(p, c.data(), size);
+  return p;
+}
 
-  DLL_EXPORT char *get_word(COLLOCATORS *db, uint32_t w) {
-    return strdup(db->getWord(w).c_str());
-  }
+DLL_EXPORT COLLOCATORS *get_collocation_scores(COLLOCATORS *db, uint32_t w1,
+                                               uint32_t w2) {
+  std::vector<Collocator> c = db->get_collocation_scores(w1, w2);
+  if (c.empty())
+    return nullptr;
+  uint64_t size = c.size() + sizeof c[0];
+  auto *p = (COLLOCATORS *)malloc(size);
+  memcpy(p, c.data(), size);
+  return p;
+}
 
-  DLL_EXPORT uint64_t get_word_id(COLLOCATORS *db, char *word) {
-    return db->getWordId(word);
-  }
+DLL_EXPORT char *get_word(COLLOCATORS *db, uint32_t w) {
+  return strdup(db->getWord(w).c_str());
+}
 
-  DLL_EXPORT void read_vocab(COLLOCATORS *db, char *fname) {
-    std::string fName(fname);
-    db->readVocab(fName);
-  }
+DLL_EXPORT uint64_t get_word_id(COLLOCATORS *db, char *word) {
+  return db->getWordId(word);
+}
 
-  DLL_EXPORT const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
-    return strdup(db->collocators2json(w1, db->get_collocators(w1)).c_str());
-  }
+DLL_EXPORT void read_vocab(COLLOCATORS *db, char *fname) {
+  std::string fName(fname);
+  db->readVocab(fName);
+}
 
-  DLL_EXPORT const char *get_collocation_scores_as_json(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
-    return strdup(db->collocators2json(w1, db->get_collocation_scores(w1, w2)).c_str());
-  }
+DLL_EXPORT const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
+  return strdup(db->collocators2json(w1, db->get_collocators(w1)).c_str());
+}
 
-  DLL_EXPORT const char *get_version() {
-    return PROJECT_VERSION;
-  }
+DLL_EXPORT const char *
+get_collocation_scores_as_json(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
+  return strdup(
+      db->collocators2json(w1, db->get_collocation_scores(w1, w2)).c_str());
+}
+
+DLL_EXPORT const char *get_version() { return PROJECT_VERSION; }
 
 #ifdef __clang__
 #pragma clang diagnostic push