blob: 7aca8a5b6421d721381c08f99b8f5186dc89ecc4 [file] [log] [blame]
Marc Kupietz28cc53e2017-12-23 17:24:55 +01001#include <typeinfo>
// Symbol-visibility helpers.
// The visibility attribute only accepts "default", "hidden", "protected" or
// "internal"; "default" is the value that keeps a symbol exported from a
// shared object. IMPORT intentionally expands to nothing.
#define EXPORT __attribute__((visibility("default")))
#define IMPORT
Marc Kupietz28cc53e2017-12-23 17:24:55 +01004#include <assert.h>
Marc Kupietz37359b12018-01-09 21:11:37 +01005#include <inttypes.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +01006#include <memory>
7#include <iostream>
Marc Kupietzc8ddf452018-01-07 21:33:12 +01008#include <algorithm>
9#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010010#include <stdint.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010011#include <string>
12#include <sstream> // for ostringstream
13#include <math.h>
Marc Kupietzd31254c2018-01-20 21:29:30 +010014#include <rocksdb/cache.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010015#include "rocksdb/comparator.h"
16#include "rocksdb/db.h"
17#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010018#include "rocksdb/table.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010019#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010020#include <rocksdb/slice_transform.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010021#include "rocksdb/utilities/db_ttl.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010022#include "rocksdb/filter_policy.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010023#include "merge_operators.h"
24
// Divisor applied to summed co-occurrence counts in get_collocators_avg().
#define AVG_WINDOW_SIZE 7

// True when the first byte of a uint16_t in memory is its most significant one.
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)

// Collocation key layout inside one uint64_t:
//   bits  0..23  w1   (first word id)
//   bits 24..55  w2   (second word id; W2() reads back the low 24 bits)
//   bits 56..63  dist (signed distance, stored as one byte)
// Every macro argument is parenthesized so that compound expressions such as
// W1(a | b) are evaluated as a whole before masking/shifting.
#define encodeCollocation(w1, w2, dist) ((((uint64_t)(dist)) << 56) | (((uint64_t)(w2)) << 24) | ((uint64_t)(w1)))
#define W1(key) ((uint64_t)((key) & 0xffffff))
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010032
// C-style vocabulary record: a frequency count and a raw pointer to the word.
// The pointer is not owned or released by anything in this record itself.
struct vocab_entry {
  uint64_t freq;
  char *word;
};
37
38// typedef struct Collocator {
39// uint64_t w2;
40// uint64_t sum;
41// };
42
Marc Kupietz28cc53e2017-12-23 17:24:55 +010043using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010044using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010045
Marc Kupietz4b799e92018-01-02 11:04:56 +010046namespace rocksdb {
// One collocate of a query word together with its association scores.
// Kept as a plain aggregate: the producers brace-initialize it positionally,
// so the member order below must not change.
struct Collocator {
  uint64_t w2;   // id of the collocate word
  uint64_t sum;  // co-occurrence count used for the scores
  double pmi;    // pointwise mutual information
  double npmi;   // pmi normalized by the negated log of the joint probability
  double llr;    // value returned by calculateLLR()
  double md;     // mutual dependency
  double lfmd;   // md plus the log of the joint probability
  double fpmi;   // pmi weighted by the joint probability
};
58
// Incremented by CountMergeOperator::Merge on every call.
size_t num_merge_operator_calls;

// Sets the full-merge call counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = 0;
}

// Counter reserved for partial merges; nothing in this file increments it.
size_t num_partial_merge_calls;

// Sets the partial-merge call counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = 0;
}
Marc Kupietz28cc53e2017-12-23 17:24:55 +010064
65
// Writes `value` into buf[0..7] in little-endian byte order.
//
// buf must point to at least 8 writable bytes. The bytes are produced with
// shifts, so the result is the same on every host and needs neither a runtime
// endianness probe nor memcpy.
inline void EncodeFixed64(char* buf, uint64_t value) {
  for (unsigned i = 0; i < sizeof(value); ++i) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}
80
// Reads a little-endian 32-bit unsigned integer from ptr[0..3].
//
// ptr must point to at least 4 readable bytes. Each byte is widened through
// unsigned char first so that bytes >= 0x80 are not sign-extended.
inline uint32_t DecodeFixed32(const char* ptr) {
  uint32_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
94
// Reads a little-endian 64-bit unsigned integer from ptr[0..7].
//
// ptr must point to at least 8 readable bytes. This is the inverse of
// EncodeFixed64 and is independent of the host byte order.
inline uint64_t DecodeFixed64(const char* ptr) {
  uint64_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
107
108
109 class CountMergeOperator : public AssociativeMergeOperator {
110 public:
111 CountMergeOperator() {
112 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100113 }
114
Marc Kupietz4b799e92018-01-02 11:04:56 +0100115 virtual bool Merge(const Slice& key,
116 const Slice* existing_value,
117 const Slice& value,
118 std::string* new_value,
119 Logger* logger) const override {
120 assert(new_value->empty());
121 ++num_merge_operator_calls;
122 if (existing_value == nullptr) {
123 new_value->assign(value.data(), value.size());
124 return true;
125 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100126
Marc Kupietz4b799e92018-01-02 11:04:56 +0100127 return mergeOperator_->PartialMerge(
128 key,
129 *existing_value,
130 value,
131 new_value,
132 logger);
133 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100134
Marc Kupietz4b799e92018-01-02 11:04:56 +0100135 virtual const char* Name() const override {
136 return "UInt64AddOperator";
137 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100138
Marc Kupietz4b799e92018-01-02 11:04:56 +0100139 private:
140 std::shared_ptr<MergeOperator> mergeOperator_;
141 };
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100142
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100143
Marc Kupietz4b799e92018-01-02 11:04:56 +0100144 class CollocatorIterator : public Iterator {
145 private:
146 char prefixc[sizeof(uint64_t)];
147 Iterator *base_iterator_;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100148
149
Marc Kupietz4b799e92018-01-02 11:04:56 +0100150 public:
151 CollocatorIterator(Iterator* base_iterator)
152 : base_iterator_(base_iterator)
153 {}
154
Marc Kupietz4b799e92018-01-02 11:04:56 +0100155 void setPrefix(char *prefix) {
156 memcpy(prefixc, prefix, sizeof(uint64_t));
157 }
158
159 virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
160 virtual void SeekToLast() { base_iterator_->SeekToLast(); }
161 virtual void Seek(const rocksdb::Slice& s) { base_iterator_->Seek(s); }
162 virtual void Prev() { base_iterator_->Prev(); }
163 virtual void Next() { base_iterator_->Next(); }
164 virtual Slice key() const;
165 virtual Slice value() const;
166 virtual Status status() const;
167 virtual bool Valid() const;
168 bool isValid();
169 uint64_t intValue();
170 uint64_t intKey();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100171
Marc Kupietz4b799e92018-01-02 11:04:56 +0100172 };
Marc Kupietz18375e12017-12-24 10:11:18 +0100173
Marc Kupietz4b799e92018-01-02 11:04:56 +0100174 // rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100175
Marc Kupietz4b799e92018-01-02 11:04:56 +0100176 bool rocksdb::CollocatorIterator::Valid() const {
Marc Kupietz18375e12017-12-24 10:11:18 +0100177 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100178 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100179
Marc Kupietz4b799e92018-01-02 11:04:56 +0100180 bool rocksdb::CollocatorIterator::isValid() {
181 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietzd31254c2018-01-20 21:29:30 +0100182 // return key().starts_with(std::string(prefixc,3));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100183 }
Marc Kupietz18375e12017-12-24 10:11:18 +0100184
Marc Kupietz4b799e92018-01-02 11:04:56 +0100185 uint64_t rocksdb::CollocatorIterator::intKey() {
186 return DecodeFixed64(base_iterator_->key().data());
187 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100188
Marc Kupietz4b799e92018-01-02 11:04:56 +0100189 uint64_t rocksdb::CollocatorIterator::intValue() {
190 return DecodeFixed64(base_iterator_->value().data());
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100191 }
192
// One vocabulary line: the word form and its corpus frequency.
// Stays an aggregate with this member order because read_vocab() builds
// entries positionally as {word, freq}.
struct VocabEntry {
  std::string word;
  uint64_t freq;
};
198
Marc Kupietz6aec7682018-01-10 09:47:48 +0100199 class CollocatorDB {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100200 private:
201 WriteOptions merge_option_; // for merge
202 char _one[sizeof(uint64_t)];
203 Slice _one_slice;
Marc Kupietz37359b12018-01-09 21:11:37 +0100204 vector<VocabEntry> _vocab;
205 uint64_t total;
206
Marc Kupietz4b799e92018-01-02 11:04:56 +0100207 protected:
208 std::shared_ptr<DB> db_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100209
Marc Kupietz4b799e92018-01-02 11:04:56 +0100210 WriteOptions put_option_;
211 ReadOptions get_option_;
212 WriteOptions delete_option_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100213
Marc Kupietz4b799e92018-01-02 11:04:56 +0100214 uint64_t default_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100215
Marc Kupietz4b799e92018-01-02 11:04:56 +0100216 std::shared_ptr<DB> OpenDb(const char *dbname);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100217 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
Marc Kupietz37359b12018-01-09 21:11:37 +0100218 void read_vocab(string fname);
219
Marc Kupietz4b799e92018-01-02 11:04:56 +0100220 public:
Marc Kupietz6aec7682018-01-10 09:47:48 +0100221 CollocatorDB(const char *db_name, bool read_only);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100222
Marc Kupietz6aec7682018-01-10 09:47:48 +0100223 // public interface of CollocatorDB.
Marc Kupietz4b799e92018-01-02 11:04:56 +0100224 // All four functions return false
225 // if the underlying level db operation failed.
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100226
Marc Kupietz4b799e92018-01-02 11:04:56 +0100227 // mapped to a levedb Put
228 bool set(const std::string& key, uint64_t value) {
229 // just treat the internal rep of int64 as the string
230 char buf[sizeof(value)];
231 EncodeFixed64(buf, value);
232 Slice slice(buf, sizeof(value));
233 auto s = db_->Put(put_option_, key, slice);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100234
Marc Kupietz4b799e92018-01-02 11:04:56 +0100235 if (s.ok()) {
236 return true;
237 } else {
238 std::cerr << s.ToString() << std::endl;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100239 return false;
240 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100241 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100242
243 DB *getDb() {
244 return db_.get();
245 }
246
247 // mapped to a rocksdb Delete
248 bool remove(const std::string& key) {
249 auto s = db_->Delete(delete_option_, key);
250
251 if (s.ok()) {
252 return true;
253 } else {
254 std::cerr << s.ToString() << std::endl;
255 return false;
256 }
257 }
258
259 // mapped to a rocksdb Get
260 bool get(const std::string& key, uint64_t* value) {
261 std::string str;
262 auto s = db_->Get(get_option_, key, &str);
263
264 if (s.IsNotFound()) {
265 // return default value if not found;
266 *value = default_;
267 return true;
268 } else if (s.ok()) {
269 // deserialization
270 if (str.size() != sizeof(uint64_t)) {
271 std::cerr << "value corruption\n";
272 return false;
273 }
274 *value = DecodeFixed64(&str[0]);
275 return true;
276 } else {
277 std::cerr << s.ToString() << std::endl;
278 return false;
279 }
280 }
281
282
283 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
284 char encoded_key[sizeof(uint64_t)];
285 EncodeFixed64(encoded_key, encodeCollocation(w1,w2,dist));
286 uint64_t value = default_;
287 get(std::string(encoded_key, 8), &value);
288 return value;
289 }
290
291 virtual void inc(const std::string& key) {
292 db_->Merge(merge_option_, key, _one_slice);
293 }
294
295 void inc(const uint64_t key) {
296 char encoded_key[sizeof(uint64_t)];
297 EncodeFixed64(encoded_key, key);
298 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
299 }
300
301 virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100302 void dump(uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietz37359b12018-01-09 21:11:37 +0100303 vector<Collocator> get_collocators(uint32_t w1);
Marc Kupietzd31254c2018-01-20 21:29:30 +0100304 vector<Collocator> get_collocators_avg(uint32_t w1);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100305 string collocators2json(vector<Collocator> collocators);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100306
Marc Kupietz4b799e92018-01-02 11:04:56 +0100307 // mapped to a rocksdb Merge operation
308 virtual bool add(const std::string& key, uint64_t value) {
309 char encoded[sizeof(uint64_t)];
310 EncodeFixed64(encoded, value);
311 Slice slice(encoded, sizeof(uint64_t));
312 auto s = db_->Merge(merge_option_, key, slice);
313
314 if (s.ok()) {
315 return true;
316 } else {
317 std::cerr << s.ToString() << std::endl;
318 return false;
319 }
320 }
321
322 CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
323 };
324
Marc Kupietz6aec7682018-01-10 09:47:48 +0100325 rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100326 // merge_option_.sync = true;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100327 if(read_only)
328 db_ = OpenDbForRead(db_name);
329 else
330 db_ = OpenDb(db_name);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100331 assert(db_);
332 uint64_t one = 1;
333 EncodeFixed64(_one, one);
334 _one_slice = Slice(_one, sizeof(uint64_t));
335 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100336
Marc Kupietz6aec7682018-01-10 09:47:48 +0100337 void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100338 inc(encodeCollocation(w1, w2, dist));
339 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100340
Marc Kupietz6aec7682018-01-10 09:47:48 +0100341 void rocksdb::CollocatorDB::read_vocab(string fname) {
Marc Kupietz37359b12018-01-09 21:11:37 +0100342 char strbuf[2048];
343 uint64_t freq;
344 FILE *fin = fopen(fname.c_str(), "rb");
345 if (fin == NULL) {
346 cout << "Vocabulary file " << fname <<" not found\n";
347 exit(1);
348 }
349 uint64_t i = 0;
350 while(!feof(fin)) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100351 fscanf(fin, "%s %lu", strbuf, &freq);
Marc Kupietz37359b12018-01-09 21:11:37 +0100352 _vocab.push_back({strbuf, freq});
353 total += freq;
354 i++;
355 }
356 fclose(fin);
357 }
358
Marc Kupietz6aec7682018-01-10 09:47:48 +0100359 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
Marc Kupietz6bb27762018-01-09 17:53:01 +0100360 DB* db;
361 Options options;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100362 options.env->SetBackgroundThreads(4);
363 options.create_if_missing = true;
364 options.merge_operator = std::make_shared<CountMergeOperator>();
365 options.max_successive_merges = 0;
366 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
367 options.IncreaseParallelism();
368 options.OptimizeLevelStyleCompaction();
369 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
Marc Kupietz37359b12018-01-09 21:11:37 +0100370 ostringstream dbname, vocabname;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100371 dbname << name << ".rocksdb";
372 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
373 if (!s.ok()) {
374 std::cerr << s.ToString() << std::endl;
375 assert(false);
376 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100377 vocabname << name << ".vocab";
378 read_vocab(vocabname.str());
Marc Kupietz6bb27762018-01-09 17:53:01 +0100379 return std::shared_ptr<DB>(db);
380 }
381
Marc Kupietz6aec7682018-01-10 09:47:48 +0100382 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100383 DB* db;
384 Options options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100385
386
387 options.env->SetBackgroundThreads(4);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100388 options.create_if_missing = true;
389 options.merge_operator = std::make_shared<CountMergeOperator>();
390 options.max_successive_merges = 0;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100391 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
392 options.IncreaseParallelism();
393 options.OptimizeLevelStyleCompaction();
394 // options.max_write_buffer_number = 48;
395 // options.max_background_jobs = 48;
396 // options.allow_concurrent_memtable_write=true;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100397 // options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
398 // options.enable_write_thread_adaptive_yield = 1;
399 // options.allow_concurrent_memtable_write = 1;
400 // options.memtable_factory.reset(new rocksdb::SkipListFactory);
401 // options.write_buffer_size = 1 << 22;
402 // options.allow_mmap_reads = true;
403 // options.allow_mmap_writes = true;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100404 // options.max_background_compactions = 40;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100405 // BlockBasedTableOptions table_options;
406 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
407 // options.bloom_locality = 1;
408 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
409 // table_options.block_cache = cache;
410 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100411 Status s;
412 // DestroyDB(dbname, Options());
413 s = DB::Open(options, dbname, &db);
414 if (!s.ok()) {
415 std::cerr << s.ToString() << std::endl;
416 assert(false);
417 }
418 return std::shared_ptr<DB>(db);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100419 }
420
Marc Kupietz6aec7682018-01-10 09:47:48 +0100421 CollocatorIterator* rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
Marc Kupietz18375e12017-12-24 10:11:18 +0100422 ReadOptions options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100423 options.prefix_same_as_start = true;
Marc Kupietz18375e12017-12-24 10:11:18 +0100424 char prefixc[sizeof(uint64_t)];
425 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
426 Iterator *it = db_->NewIterator(options);
427 CollocatorIterator *cit = new CollocatorIterator(it);
428 cit->Seek(std::string(prefixc,3));// it->Valid() && it->key().starts_with(std::string(prefixc,3)); it->Next()) {
429 cit->setPrefix(prefixc);
430 return cit;
431 }
432
Marc Kupietz6aec7682018-01-10 09:47:48 +0100433 void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100434 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
435 for (; it->isValid(); it->Next()) {
436 uint64_t value = it->intValue();
437 uint64_t key = it->intKey();
438 std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value << std::endl;
439 }
440 std::cout << "ready dumping\n";
441 }
442
// Log-likelihood ratio for the 2x2 contingency table of two words.
//
//   f_X_    frequency of the first word
//   uintN   corpus size
//   f_X_Y_  joint frequency of both words
//   f_Y_    frequency of the second word
//
// Returns 0 when the joint count, the "neither" cell or N is not positive,
// when f_X_ is 0, or when f_X_/N exceeds f_X_Y_/f_Y_; otherwise returns the
// LLR value.
double calculateLLR(uint64_t f_X_, uint64_t uintN, uint64_t f_X_Y_, uint64_t f_Y_) {
  // Cells of the contingency table.
  const double n_total = (double)uintN;
  const double both = (double)f_X_Y_;
  const double only_x = (double)f_X_ - both;
  const double only_y = (double)f_Y_ - both;
  const double neither = n_total - both - only_x - only_y;

  const bool usable = (both > 0.) && (neither > 0.) && (n_total > 0.);
  if (!usable) {
    return 0;
  }

  // With f_X_ == 0 the original coefficient (-1)*f_X_Y_ is evaluated in
  // unsigned arithmetic and is therefore positive, which yields 0.
  if (f_X_ == 0) {
    return 0;
  }

  const double f_e_ = (double)f_X_ / n_total;
  const double f_o_ = (double)f_X_Y_ / (double)f_Y_;
  const double minusDiffCoeff = (f_e_ - f_o_) / (f_e_ + f_o_);
  if (minusDiffCoeff > 0) {
    return 0;
  }

  // x*log(x) terms that are defined as 0 for non-positive cells.
  const double x_term = (only_x > 0.) ? only_x * log(only_x) : 0.0;
  const double y_term = (only_y > 0.) ? only_y * log(only_y) : 0.0;

  /* log likelihood ratio */
  return 2*( both*log(both)
            +x_term
            +y_term
            +neither*log(neither)
            -(both+only_x)*log(both+only_x)
            -(both+only_y)*log(both+only_y)
            -(only_x+neither)*log(only_x+neither)
            -(only_y+neither)*log(only_y+neither)
            +n_total*log(n_total)
           );
}
483
484
485 bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
486 bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
Marc Kupietzd31254c2018-01-20 21:29:30 +0100487 bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100488
Marc Kupietzd31254c2018-01-20 21:29:30 +0100489 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators_avg(uint32_t w1) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100490 std::vector<Collocator> collocators;
491 uint64_t w2, last_w2 = 0xffffffffffffffff;
492 uint64_t sum = 0, total_w1 = 0;
493 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
494 uint64_t value = it->intValue(),
495 key = it->intKey();
496 w2 = W2(key);
497 total_w1 += value;
498 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
499 if (w2 != last_w2) {
500 double pmi = log2( total * ((double) sum) /
Marc Kupietz37359b12018-01-09 21:11:37 +0100501 (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq) * ((double)_vocab[last_w2].freq) ));
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100502 // Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics. In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
Marc Kupietz37359b12018-01-09 21:11:37 +0100503 // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) / (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
504 double md = log2((double)sum * sum / ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * _vocab[w1].freq * _vocab[last_w2].freq));
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100505 collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(((double) sum / AVG_WINDOW_SIZE / total))), /* normalize to [-1,1] */
Marc Kupietz37359b12018-01-09 21:11:37 +0100506 calculateLLR(_vocab[w1].freq, total, sum, _vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100507 last_w2 = w2;
508 sum = value;
509 } else {
510 sum += value;
511 }
512 }
513
514 sort(collocators.begin(), collocators.end(), sortByLfmd);
515
516 int i=0;
517 for (Collocator c : collocators) {
518 if(i++>10) break;
Marc Kupietz37359b12018-01-09 21:11:37 +0100519 std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
520 << "\t f(w1):" << _vocab[w1].freq
521 << "\t f(w2):" << _vocab[c.w2].freq
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100522 << "\t f(w1, x):" << total_w1
523 << "\t f(w1, w2):" << c.sum
524 << "\t pmi:" << c.pmi
525 << "\t npmi:" << c.npmi
526 << "\t llr:" << c.llr
527 << "\t md:" << c.md
528 << "\t lfmd:" << c.lfmd
529 << "\t fpmi:" << c.fpmi
530 << "\t total:" << total
531 << std::endl;
532 }
533 return collocators;
534 }
535
Marc Kupietzd31254c2018-01-20 21:29:30 +0100536 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
537 std::vector<Collocator> collocators;
538 uint64_t w2, last_w2 = 0xffffffffffffffff;
539 uint64_t max = 0, total_w1 = 0;
540 const double window_size = 1;
541
542 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
543 uint64_t value = it->intValue(),
544 key = it->intKey();
545 w2 = W2(key);
546 total_w1 += value;
547 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
548 if (w2 != last_w2) {
549 double pmi = log2( total * ((double) max) /
550 (window_size * ((double)_vocab[w1].freq) * ((double)_vocab[last_w2].freq) ));
551 // Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics. In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
552 // double md = log2(pow((double)max * window_size / total, 2) / (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
553 double md = log2((double)max * max / ((double) total * window_size * window_size * _vocab[w1].freq * _vocab[last_w2].freq));
554 collocators.push_back ( {last_w2, max, pmi, pmi / (-log2(((double) max / window_size / total))), /* normalize to [-1,1] */
555 calculateLLR(_vocab[w1].freq, total, max, _vocab[last_w2].freq), md, md + log2((double)max / window_size / total), pmi*max/total/window_size} );
556 last_w2 = w2;
557 max = value;
558 } else {
559 if(value > max)
560 max = value;
561 }
562 }
563
564 sort(collocators.begin(), collocators.end(), sortByLfmd);
565
566 int i=0;
567 for (Collocator c : collocators) {
568 if(i++>10) break;
569 std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
570 << "\t f(w1):" << _vocab[w1].freq
571 << "\t f(w2):" << _vocab[c.w2].freq
572 << "\t f(w1, x):" << total_w1
573 << "\t f(w1, w2):" << c.sum
574 << "\t pmi:" << c.pmi
575 << "\t npmi:" << c.npmi
576 << "\t llr:" << c.llr
577 << "\t md:" << c.md
578 << "\t lfmd:" << c.lfmd
579 << "\t fpmi:" << c.fpmi
580 << "\t total:" << total
581 << std::endl;
582 }
583 return collocators;
584 }
585
Marc Kupietz4b799e92018-01-02 11:04:56 +0100586 rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
587 rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
588 rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
589
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100590};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100591
Marc Kupietz6aec7682018-01-10 09:47:48 +0100592string rocksdb::CollocatorDB::collocators2json(vector<Collocator> collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100593 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100594 int i = 0;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100595 s << "[";
596 bool first = true;
597 for (Collocator c : collocators) {
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100598 if (i++ > 200)
599 break;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100600 if(!first)
601 s << ",\n";
602 else
603 first = false;
604 s << "{"
605 "\"word\":\"" << string(_vocab[c.w2].word) << "\"," <<
606 "\"rank\":" << c.w2 << "," <<
607 "\"npmi\":" << c.npmi << "," <<
608 "\"llr\":" << c.llr << "," <<
609 "\"lfmd\":" << c.lfmd << "," <<
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100610 "\"fpmi\":" << c.fpmi <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100611 "}";
612 }
613 s << "]\n";
614 return s.str();
615}
616
Marc Kupietz6aec7682018-01-10 09:47:48 +0100617typedef rocksdb::CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100618
619extern "C" {
Marc Kupietz6aec7682018-01-10 09:47:48 +0100620 COLLOCATORS *open_collocatordb_for_write(char *dbname) {
621 return new rocksdb::CollocatorDB(dbname, false);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100622 }
623
Marc Kupietz6aec7682018-01-10 09:47:48 +0100624 COLLOCATORS *open_collocatordb(char *dbname) {
625 return new rocksdb::CollocatorDB(dbname, true);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100626 }
627
Marc Kupietz6aec7682018-01-10 09:47:48 +0100628 void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100629 db->inc(w1, w2, dist);
630 }
631
632 void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
633 db->dump(w1, w2, dist);
634 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100635
Marc Kupietz37359b12018-01-09 21:11:37 +0100636 void get_collocators(COLLOCATORS *db, uint32_t w1) {
637 db->get_collocators(w1);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100638 }
639
Marc Kupietz37359b12018-01-09 21:11:37 +0100640 const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
641 return strdup(db->collocators2json(db->get_collocators(w1)).c_str());
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100642 }
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100643}