blob: 56fc53a4eaa3bffc837fab60940e462caefc6a73 [file] [log] [blame]
#include <assert.h>
#include <math.h>
#include <stdint.h>
#include <string.h>  // memcpy, strdup

#include <algorithm>
#include <cmath>  // std::isfinite
#include <iostream>
#include <memory>
#include <sstream> // for ostringstream
#include <string>
#include <typeinfo>
#include <vector>

#define EXPORT __attribute__((visibility("visible")))
#define IMPORT

#include "rocksdb/cache.h"
#include "rocksdb/comparator.h"
#include "rocksdb/db.h"
#include "rocksdb/env.h"
#include "rocksdb/table.h"
#include <rocksdb/merge_operator.h>
#include <rocksdb/slice_transform.h>
#include "rocksdb/utilities/db_ttl.h"
#include "rocksdb/filter_policy.h"
#include "merge_operators.h"
23
// Divisor applied to co-occurrence counts by the association measures in
// Collocators::get_collocators().
#define AVG_WINDOW_SIZE 7

// Returns true when the first byte in memory of a 16-bit value is its most
// significant byte. The probe is inspected through an unsigned char pointer,
// which is always a permitted access (unlike reading a string literal through
// a uint16_t pointer).
static inline bool collocatordb_host_is_big_endian() {
  const uint16_t probe = 0x00ff;
  return *reinterpret_cast<const unsigned char *>(&probe) == 0x00;
}
#define IS_BIG_ENDIAN (collocatordb_host_is_big_endian())

// Packs a collocation into one 64-bit key: dist is shifted to bit 56, w2 to
// bit 24 and w1 is OR-ed into the low bits. The arguments are not masked, so
// callers must keep w1 and w2 within 24 bits. Every macro argument is
// parenthesised so that compound expressions expand as a unit.
#define encodeCollocation(w1, w2, dist) \
  ((((uint64_t)(dist)) << 56) | (((uint64_t)(w2)) << 24) | (w1))
// Low 24 bits of a packed key.
#define W1(key) ((uint64_t)((key) & 0xffffff))
// The 24 bits starting at bit 24 of a packed key.
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
// Top 8 bits of a packed key, reinterpreted as a signed 8-bit value.
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010031
/* One vocabulary record: a 64-bit frequency value plus a C-string pointer. */
typedef struct vocab_entry {
  uint64_t freq;  /* read as the word's frequency by get_collocators() */
  char *word;     /* printed / serialised as the word's text */
} vocab_entry;
36
37// typedef struct Collocator {
38// uint64_t w2;
39// uint64_t sum;
40// };
41
Marc Kupietz28cc53e2017-12-23 17:24:55 +010042using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010043using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010044
Marc Kupietz4b799e92018-01-02 11:04:56 +010045namespace rocksdb {
// Plain record describing one collocate of a word. Instances are
// brace-initialised in member order by Collocators::get_collocators(), so the
// order of the fields below must not change.
struct Collocator {
  uint64_t w2;   // id of the collocate
  uint64_t sum;  // summed co-occurrence count for this collocate
  double pmi;    // association scores computed in get_collocators():
  double npmi;
  double llr;    // value returned by calculateLLR()
  double md;
  double lfmd;
  double fpmi;
};
57
// Counter incremented by CountMergeOperator::Merge(); starts at zero.
size_t num_merge_operator_calls = 0;

// Sets num_merge_operator_calls back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = 0;
}

// Second counter with its own reset function; starts at zero.
size_t num_partial_merge_calls = 0;

// Sets num_partial_merge_calls back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = 0;
}
Marc Kupietz28cc53e2017-12-23 17:24:55 +010063
64
Marc Kupietz4b799e92018-01-02 11:04:56 +010065 inline void EncodeFixed64(char* buf, uint64_t value) {
66 if (! IS_BIG_ENDIAN) {
67 memcpy(buf, &value, sizeof(value));
68 } else {
69 buf[0] = value & 0xff;
70 buf[1] = (value >> 8) & 0xff;
71 buf[2] = (value >> 16) & 0xff;
72 buf[3] = (value >> 24) & 0xff;
73 buf[4] = (value >> 32) & 0xff;
74 buf[5] = (value >> 40) & 0xff;
75 buf[6] = (value >> 48) & 0xff;
76 buf[7] = (value >> 56) & 0xff;
77 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +010078 }
79
Marc Kupietz4b799e92018-01-02 11:04:56 +010080 inline uint32_t DecodeFixed32(const char* ptr) {
81 if (! IS_BIG_ENDIAN) {
82 // Load the raw bytes
83 uint32_t result;
84 memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
85 return result;
86 } else {
87 return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
88 | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
89 | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
90 | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
91 }
92 }
93
94 inline uint64_t DecodeFixed64(const char* ptr) {
95 if (! IS_BIG_ENDIAN) {
96 // Load the raw bytes
97 uint64_t result;
98 memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
99 return result;
100 } else {
101 uint64_t lo = DecodeFixed32(ptr);
102 uint64_t hi = DecodeFixed32(ptr + 4);
103 return (hi << 32) | lo;
104 }
105 }
106
107
108 class CountMergeOperator : public AssociativeMergeOperator {
109 public:
110 CountMergeOperator() {
111 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100112 }
113
Marc Kupietz4b799e92018-01-02 11:04:56 +0100114 virtual bool Merge(const Slice& key,
115 const Slice* existing_value,
116 const Slice& value,
117 std::string* new_value,
118 Logger* logger) const override {
119 assert(new_value->empty());
120 ++num_merge_operator_calls;
121 if (existing_value == nullptr) {
122 new_value->assign(value.data(), value.size());
123 return true;
124 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100125
Marc Kupietz4b799e92018-01-02 11:04:56 +0100126 return mergeOperator_->PartialMerge(
127 key,
128 *existing_value,
129 value,
130 new_value,
131 logger);
132 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100133
Marc Kupietz4b799e92018-01-02 11:04:56 +0100134 virtual const char* Name() const override {
135 return "UInt64AddOperator";
136 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100137
Marc Kupietz4b799e92018-01-02 11:04:56 +0100138 private:
139 std::shared_ptr<MergeOperator> mergeOperator_;
140 };
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100141
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100142
Marc Kupietz4b799e92018-01-02 11:04:56 +0100143 class CollocatorIterator : public Iterator {
144 private:
145 char prefixc[sizeof(uint64_t)];
146 Iterator *base_iterator_;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100147
148
Marc Kupietz4b799e92018-01-02 11:04:56 +0100149 public:
150 CollocatorIterator(Iterator* base_iterator)
151 : base_iterator_(base_iterator)
152 {}
153
154 ~CollocatorIterator();
155
156 void setPrefix(char *prefix) {
157 memcpy(prefixc, prefix, sizeof(uint64_t));
158 }
159
160 virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
161 virtual void SeekToLast() { base_iterator_->SeekToLast(); }
162 virtual void Seek(const rocksdb::Slice& s) { base_iterator_->Seek(s); }
163 virtual void Prev() { base_iterator_->Prev(); }
164 virtual void Next() { base_iterator_->Next(); }
165 virtual Slice key() const;
166 virtual Slice value() const;
167 virtual Status status() const;
168 virtual bool Valid() const;
169 bool isValid();
170 uint64_t intValue();
171 uint64_t intKey();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100172
Marc Kupietz4b799e92018-01-02 11:04:56 +0100173 };
Marc Kupietz18375e12017-12-24 10:11:18 +0100174
Marc Kupietz4b799e92018-01-02 11:04:56 +0100175 // rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100176
Marc Kupietz4b799e92018-01-02 11:04:56 +0100177 bool rocksdb::CollocatorIterator::Valid() const {
Marc Kupietz18375e12017-12-24 10:11:18 +0100178 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100179 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100180
Marc Kupietz4b799e92018-01-02 11:04:56 +0100181 bool rocksdb::CollocatorIterator::isValid() {
182 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
183 }
Marc Kupietz18375e12017-12-24 10:11:18 +0100184
Marc Kupietz4b799e92018-01-02 11:04:56 +0100185 uint64_t rocksdb::CollocatorIterator::intKey() {
186 return DecodeFixed64(base_iterator_->key().data());
187 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100188
Marc Kupietz4b799e92018-01-02 11:04:56 +0100189 uint64_t rocksdb::CollocatorIterator::intValue() {
190 return DecodeFixed64(base_iterator_->value().data());
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100191 }
192
Marc Kupietz4b799e92018-01-02 11:04:56 +0100193 class Collocators {
194 private:
195 WriteOptions merge_option_; // for merge
196 char _one[sizeof(uint64_t)];
197 Slice _one_slice;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100198 vocab_entry *_vocab = NULL;
199
Marc Kupietz4b799e92018-01-02 11:04:56 +0100200 protected:
201 std::shared_ptr<DB> db_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100202
Marc Kupietz4b799e92018-01-02 11:04:56 +0100203 WriteOptions put_option_;
204 ReadOptions get_option_;
205 WriteOptions delete_option_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100206
Marc Kupietz4b799e92018-01-02 11:04:56 +0100207 uint64_t default_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100208
Marc Kupietz4b799e92018-01-02 11:04:56 +0100209 std::shared_ptr<DB> OpenDb(const char *dbname);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100210 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100211
Marc Kupietz4b799e92018-01-02 11:04:56 +0100212 public:
Marc Kupietz6bb27762018-01-09 17:53:01 +0100213 Collocators(const char *db_name, bool read_only);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100214 ~Collocators();
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100215
Marc Kupietz4b799e92018-01-02 11:04:56 +0100216 // public interface of Collocators.
217 // All four functions return false
218 // if the underlying level db operation failed.
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100219
Marc Kupietz4b799e92018-01-02 11:04:56 +0100220 // mapped to a levedb Put
221 bool set(const std::string& key, uint64_t value) {
222 // just treat the internal rep of int64 as the string
223 char buf[sizeof(value)];
224 EncodeFixed64(buf, value);
225 Slice slice(buf, sizeof(value));
226 auto s = db_->Put(put_option_, key, slice);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100227
Marc Kupietz4b799e92018-01-02 11:04:56 +0100228 if (s.ok()) {
229 return true;
230 } else {
231 std::cerr << s.ToString() << std::endl;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100232 return false;
233 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100234 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100235
236 DB *getDb() {
237 return db_.get();
238 }
239
240 // mapped to a rocksdb Delete
241 bool remove(const std::string& key) {
242 auto s = db_->Delete(delete_option_, key);
243
244 if (s.ok()) {
245 return true;
246 } else {
247 std::cerr << s.ToString() << std::endl;
248 return false;
249 }
250 }
251
252 // mapped to a rocksdb Get
253 bool get(const std::string& key, uint64_t* value) {
254 std::string str;
255 auto s = db_->Get(get_option_, key, &str);
256
257 if (s.IsNotFound()) {
258 // return default value if not found;
259 *value = default_;
260 return true;
261 } else if (s.ok()) {
262 // deserialization
263 if (str.size() != sizeof(uint64_t)) {
264 std::cerr << "value corruption\n";
265 return false;
266 }
267 *value = DecodeFixed64(&str[0]);
268 return true;
269 } else {
270 std::cerr << s.ToString() << std::endl;
271 return false;
272 }
273 }
274
275
276 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
277 char encoded_key[sizeof(uint64_t)];
278 EncodeFixed64(encoded_key, encodeCollocation(w1,w2,dist));
279 uint64_t value = default_;
280 get(std::string(encoded_key, 8), &value);
281 return value;
282 }
283
284 virtual void inc(const std::string& key) {
285 db_->Merge(merge_option_, key, _one_slice);
286 }
287
288 void inc(const uint64_t key) {
289 char encoded_key[sizeof(uint64_t)];
290 EncodeFixed64(encoded_key, key);
291 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
292 }
293
294 virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100295 void dump(uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100296 vector<Collocator> get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total);
297 string collocators2json(vector<Collocator> collocators);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100298
Marc Kupietz4b799e92018-01-02 11:04:56 +0100299 // mapped to a rocksdb Merge operation
300 virtual bool add(const std::string& key, uint64_t value) {
301 char encoded[sizeof(uint64_t)];
302 EncodeFixed64(encoded, value);
303 Slice slice(encoded, sizeof(uint64_t));
304 auto s = db_->Merge(merge_option_, key, slice);
305
306 if (s.ok()) {
307 return true;
308 } else {
309 std::cerr << s.ToString() << std::endl;
310 return false;
311 }
312 }
313
314 CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
315 };
316
Marc Kupietz6bb27762018-01-09 17:53:01 +0100317 rocksdb::Collocators::Collocators(const char *db_name, bool read_only = false) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100318 // merge_option_.sync = true;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100319 if(read_only)
320 db_ = OpenDbForRead(db_name);
321 else
322 db_ = OpenDb(db_name);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100323 assert(db_);
324 uint64_t one = 1;
325 EncodeFixed64(_one, one);
326 _one_slice = Slice(_one, sizeof(uint64_t));
327 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100328
Marc Kupietz4b799e92018-01-02 11:04:56 +0100329 rocksdb::CollocatorIterator::~CollocatorIterator() {
330 std::cout << "destroying itera\n";
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100331 }
332
Marc Kupietz4b799e92018-01-02 11:04:56 +0100333 rocksdb::Collocators::~Collocators() {
334 std::cout << "destroying coll\n";
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100335 }
336
Marc Kupietz4b799e92018-01-02 11:04:56 +0100337 void rocksdb::Collocators::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100338 inc(encodeCollocation(w1, w2, dist));
339 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100340
Marc Kupietz6bb27762018-01-09 17:53:01 +0100341 std::shared_ptr<DB> rocksdb::Collocators::OpenDbForRead(const char *name) {
342 DB* db;
343 Options options;
344 ostringstream dbname;
345 dbname << name << ".rocksdb";
346 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
347 if (!s.ok()) {
348 std::cerr << s.ToString() << std::endl;
349 assert(false);
350 }
351 return std::shared_ptr<DB>(db);
352 }
353
Marc Kupietz4b799e92018-01-02 11:04:56 +0100354 std::shared_ptr<DB> rocksdb::Collocators::OpenDb(const char *dbname) {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100355 DB* db;
356 Options options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100357
358
359 options.env->SetBackgroundThreads(4);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100360 options.create_if_missing = true;
361 options.merge_operator = std::make_shared<CountMergeOperator>();
362 options.max_successive_merges = 0;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100363 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
364 options.IncreaseParallelism(70);
365 // options.OptimizeLevelStyleCompaction();
366 options.max_write_buffer_number = 48;
367 options.max_background_jobs = 48;
368 options.allow_concurrent_memtable_write=true;
369 // options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
370 // options.enable_write_thread_adaptive_yield = 1;
371 // options.allow_concurrent_memtable_write = 1;
372 // options.memtable_factory.reset(new rocksdb::SkipListFactory);
373 // options.write_buffer_size = 1 << 22;
374 // options.allow_mmap_reads = true;
375 // options.allow_mmap_writes = true;
376 options.max_background_compactions = 40;
377 // BlockBasedTableOptions table_options;
378 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
379 // options.bloom_locality = 1;
380 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
381 // table_options.block_cache = cache;
382 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100383 Status s;
384 // DestroyDB(dbname, Options());
385 s = DB::Open(options, dbname, &db);
386 if (!s.ok()) {
387 std::cerr << s.ToString() << std::endl;
388 assert(false);
389 }
390 return std::shared_ptr<DB>(db);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100391 }
392
Marc Kupietz4b799e92018-01-02 11:04:56 +0100393 CollocatorIterator* rocksdb::Collocators::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
Marc Kupietz18375e12017-12-24 10:11:18 +0100394 ReadOptions options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100395 options.prefix_same_as_start = true;
Marc Kupietz18375e12017-12-24 10:11:18 +0100396 char prefixc[sizeof(uint64_t)];
397 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
398 Iterator *it = db_->NewIterator(options);
399 CollocatorIterator *cit = new CollocatorIterator(it);
400 cit->Seek(std::string(prefixc,3));// it->Valid() && it->key().starts_with(std::string(prefixc,3)); it->Next()) {
401 cit->setPrefix(prefixc);
402 return cit;
403 }
404
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100405 void rocksdb::Collocators::dump(uint32_t w1, uint32_t w2, int8_t dist) {
406 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
407 for (; it->isValid(); it->Next()) {
408 uint64_t value = it->intValue();
409 uint64_t key = it->intKey();
410 std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value << std::endl;
411 }
412 std::cout << "ready dumping\n";
413 }
414
// Log-likelihood ratio computed from four counts:
//   f_X_    count of the first item
//   uintN   total count
//   f_X_Y_  joint count of both items
//   f_Y_    count of the second item
// With A = f_X_Y_, B = f_X_ - A, C = f_Y_ - A and D = N - A - B - C, the
// result is 0 unless A, D and N are all positive. It is also 0 when
// f_X_ / N is greater than f_X_Y_ / f_Y_, and when f_X_ is 0.
double calculateLLR(uint64_t f_X_, uint64_t uintN, uint64_t f_X_Y_, uint64_t f_Y_) {
  const double N = (double)uintN;
  const double A = (double)f_X_Y_;
  const double B = (double)f_X_ - A;
  const double C = (double)f_Y_ - A;
  const double D = N - A - B - C;

  if (!((A > 0.) && (D > 0.) && (N > 0.))) {
    return 0.0;
  }

  // A zero f_X_ together with a positive joint count is inconsistent input.
  // The previous unsigned "(-1) * f_X_Y_" expression wrapped to a large
  // positive value here, which always produced 0; that outcome is kept.
  if (f_X_ == 0) {
    return 0.0;
  }

  const double f_e_ = (double)f_X_ / N;
  const double f_o_ = (double)f_X_Y_ / (double)f_Y_;
  const double minusDiffCoeff = (f_e_ - f_o_) / (f_e_ + f_o_);
  if (minusDiffCoeff > 0) {
    return 0.0;
  }

  // x*log(x) terms for B and C contribute 0 when the count is not positive.
  const double BlogB = (B > 0.) ? B*log(B) : 0.0;
  const double ClogC = (C > 0.) ? C*log(C) : 0.0;

  /* log likelihood ratio */
  return 2*( A*log(A)
            +BlogB
            +ClogC
            +D*log(D)
            -(A+B)*log(A+B)
            -(A+C)*log(A+C)
            -(B+D)*log(B+D)
            -(C+D)*log(C+D)
            +N*log(N)
            );
}
455
456
457 bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
458 bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
459
460 std::vector<Collocator> rocksdb::Collocators::get_collocators(uint32_t w1, vocab_entry *vocab, uint64_t total) {
461 _vocab = vocab;
462 std::vector<Collocator> collocators;
463 uint64_t w2, last_w2 = 0xffffffffffffffff;
464 uint64_t sum = 0, total_w1 = 0;
465 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
466 uint64_t value = it->intValue(),
467 key = it->intKey();
468 w2 = W2(key);
469 total_w1 += value;
470 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
471 if (w2 != last_w2) {
472 double pmi = log2( total * ((double) sum) /
473 (AVG_WINDOW_SIZE * ((double)vocab[w1].freq) * ((double)vocab[last_w2].freq) ));
474 // Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics. In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
475 // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) / (AVG_WINDOW_SIZE * ((double)vocab[w1].freq/total) * ((double)vocab[last_w2].freq/total)));
476 double md = log2((double)sum * sum / ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * vocab[w1].freq * vocab[last_w2].freq));
477 collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(((double) sum / AVG_WINDOW_SIZE / total))), /* normalize to [-1,1] */
478 calculateLLR(vocab[w1].freq, total, sum, vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
479 last_w2 = w2;
480 sum = value;
481 } else {
482 sum += value;
483 }
484 }
485
486 sort(collocators.begin(), collocators.end(), sortByLfmd);
487
488 int i=0;
489 for (Collocator c : collocators) {
490 if(i++>10) break;
491 std::cout << "w1:" << vocab[w1].word << ", w2:" << vocab[c.w2].word
492 << "\t f(w1):" << vocab[w1].freq
493 << "\t f(w2):" << vocab[c.w2].freq
494 << "\t f(w1, x):" << total_w1
495 << "\t f(w1, w2):" << c.sum
496 << "\t pmi:" << c.pmi
497 << "\t npmi:" << c.npmi
498 << "\t llr:" << c.llr
499 << "\t md:" << c.md
500 << "\t lfmd:" << c.lfmd
501 << "\t fpmi:" << c.fpmi
502 << "\t total:" << total
503 << std::endl;
504 }
505 return collocators;
506 }
507
Marc Kupietz4b799e92018-01-02 11:04:56 +0100508 rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
509 rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
510 rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
511
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100512};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100513
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100514string rocksdb::Collocators::collocators2json(vector<Collocator> collocators) {
515 ostringstream s;
516 s << "[";
517 bool first = true;
518 for (Collocator c : collocators) {
519 if(!first)
520 s << ",\n";
521 else
522 first = false;
523 s << "{"
524 "\"word\":\"" << string(_vocab[c.w2].word) << "\"," <<
525 "\"rank\":" << c.w2 << "," <<
526 "\"npmi\":" << c.npmi << "," <<
527 "\"llr\":" << c.llr << "," <<
528 "\"lfmd\":" << c.lfmd << "," <<
529 "\"mi\":" << c.fpmi <<
530 "}";
531 }
532 s << "]\n";
533 return s.str();
534}
535
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100536typedef rocksdb::Collocators COLLOCATORS;
537
538extern "C" {
539 COLLOCATORS *open_collocators(char *dbname) {
540 return new rocksdb::Collocators(dbname);
541 }
542
Marc Kupietz6bb27762018-01-09 17:53:01 +0100543 COLLOCATORS *open_collocators_for_read(char *dbname) {
544 return new rocksdb::Collocators(dbname, true);
545 }
546
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100547 void inc_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
548 db->inc(w1, w2, dist);
549 }
550
551 void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
552 db->dump(w1, w2, dist);
553 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100554
555 void get_collocators(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
556 db->get_collocators(w1, vocab, total);
557 }
558
559 const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1, vocab_entry *vocab, uint64_t total) {
560 return strdup(db->collocators2json(db->get_collocators(w1, vocab, total)).c_str());
561 }
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100562}