blob: bad99c38c53886126923013367862d1bb9307030 [file] [log] [blame]
Marc Kupietz28cc53e2017-12-23 17:24:55 +01001#include <typeinfo>
// Symbol-visibility helpers for the C API.
// GCC/Clang accept only "default", "hidden", "internal" or "protected" as
// visibility values; "default" is the one that exports a symbol.
#define EXPORT __attribute__((visibility("default")))
#define IMPORT
Marc Kupietz28cc53e2017-12-23 17:24:55 +01004#include <assert.h>
Marc Kupietz37359b12018-01-09 21:11:37 +01005#include <inttypes.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +01006#include <memory>
7#include <iostream>
Marc Kupietzc8ddf452018-01-07 21:33:12 +01008#include <algorithm>
9#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010010#include <stdint.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010011#include <string>
12#include <sstream> // for ostringstream
13#include <math.h>
Marc Kupietzd31254c2018-01-20 21:29:30 +010014#include <rocksdb/cache.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010015#include "rocksdb/comparator.h"
16#include "rocksdb/db.h"
17#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010018#include "rocksdb/table.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010019#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010020#include <rocksdb/slice_transform.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010021#include "rocksdb/utilities/db_ttl.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010022#include "rocksdb/filter_policy.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010023#include "merge_operators.h"
24
// Window width used as the normalisation constant in get_collocators_avg().
#define AVG_WINDOW_SIZE 7

// True when the host stores the most significant byte first.
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)

// Packs a collocation into one 64-bit key: w1 in the low bits, w2 shifted up
// by 24 bits and dist in the top byte. Every argument and every macro result
// is parenthesised so that compound expressions (a | b, c ? x : y, ...) can be
// passed safely and the results can be used inside larger expressions.
#define encodeCollocation(w1, w2, dist) (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (uint64_t)(w1))
// Extract the 24-bit w1 / w2 fields and the signed 8-bit distance from a key.
#define W1(key) ((uint64_t)((key) & 0xffffff))
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010032
// Plain C-style vocabulary record: a frequency count and a pointer to the word.
// The struct does not manage the memory behind `word`.
struct vocab_entry {
  uint64_t freq;
  char *word;
};
37
38// typedef struct Collocator {
39// uint64_t w2;
40// uint64_t sum;
41// };
42
Marc Kupietz28cc53e2017-12-23 17:24:55 +010043using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010044using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010045
Marc Kupietz4b799e92018-01-02 11:04:56 +010046namespace rocksdb {
// One collocate of a node word together with its association scores.
// This is a plain aggregate: callers brace-initialise it positionally, so the
// member order below must not change.
struct Collocator {
  uint64_t w2;        // id of the collocate
  uint64_t raw;       // co-occurrence count the scores were computed from
  double pmi;         // pointwise mutual information
  double npmi;        // normalised pmi
  double llr;         // log-likelihood ratio
  double lfmd;        // log-frequency biased mutual dependency
  double fpmi;        // frequency-weighted pmi
  double left_lfmd;   // lfmd from the count at distance -1
  double right_lfmd;  // lfmd from the count at distance +1
  double left_npmi;   // npmi from the count at distance -1
  double right_npmi;  // npmi from the count at distance +1
};
61
// Number of full merges performed; incremented by CountMergeOperator::Merge.
size_t num_merge_operator_calls = 0;

// Sets the full-merge counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = 0;
}

// Counter for partial merges.
size_t num_partial_merge_calls = 0;

// Sets the partial-merge counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = 0;
}
Marc Kupietz28cc53e2017-12-23 17:24:55 +010067
68
// Writes `value` into buf[0..7] in little-endian byte order (least significant
// byte first). The shifts give the same bytes on every host, so no endianness
// probe and no memcpy are needed.
// `buf` must have room for 8 bytes.
inline void EncodeFixed64(char* buf, uint64_t value) {
  for (unsigned i = 0; i < sizeof(value); ++i) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}
83
// Reads a little-endian 32-bit unsigned integer from ptr[0..3].
// Each byte is widened through unsigned char so that bytes >= 0x80 are not
// sign-extended. The result does not depend on host byte order.
inline uint32_t DecodeFixed32(const char* ptr) {
  uint32_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
97
// Reads a little-endian 64-bit unsigned integer from ptr[0..7]; the inverse of
// EncodeFixed64. The result does not depend on host byte order.
// `ptr` must reference at least 8 readable bytes.
inline uint64_t DecodeFixed64(const char* ptr) {
  uint64_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
110
// Pointwise mutual information:
//   log2( total * f12 / (window_size * f1 * f2) )
// f1, f2 are the two word frequencies, f12 the co-occurrence count and total
// the corpus size. No guard for f12 == 0 (the logarithm is then -infinity).
static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  const double observed = total * ((double) f12);
  const double expected = window_size * ((double) f1) * ((double) f2);
  return log2(observed / expected);
}
114
// Normalised pointwise mutual information.
// Bouma, Gerlof (2009): Normalized (pointwise) mutual information in
// collocation extraction. In Proceedings of GSCL.
// https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf
//
// Returns -1.0 when f12 is zero; otherwise pmi divided by
// -log2(f12 / window_size / total).
static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  if (f12 != 0) {
    const double pmi = log2( total * ((double) f12) / (window_size * ((double) f1) * ((double)f2) ));
    const double normaliser = -log2(((double) f12 / window_size / total));
    return pmi / normaliser;
  }
  return -1.0;
}
123
// Mutual dependency.
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
// In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
// double md = log2(pow((double)max * window_size / total, 2) / (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
//
// Computes log2( f12^2 / (total * window_size^2 * f1 * f2) ).
static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  const double joint_squared = (double)f12 * f12;
  const double scale = (double) total * window_size * window_size * f1 * f2;
  return log2(joint_squared / scale);
}
130
// Log-frequency biased mutual dependency: the mutual dependency term
// log2( f12^2 / (total * window_size^2 * f1 * f2) ) plus
// log2( f12 / window_size / total ). Returns 0 when f12 is zero.
static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  if (f12 != 0) {
    const double md = log2((double)f12 * f12 / ((double) total * window_size * window_size * f1 * f2));
    const double log_joint = log2((double) f12 / window_size / total);
    return md + log_joint;
  }
  return 0;
}
137
Marc Kupietz4b799e92018-01-02 11:04:56 +0100138
139 class CountMergeOperator : public AssociativeMergeOperator {
140 public:
141 CountMergeOperator() {
142 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100143 }
144
Marc Kupietz4b799e92018-01-02 11:04:56 +0100145 virtual bool Merge(const Slice& key,
146 const Slice* existing_value,
147 const Slice& value,
148 std::string* new_value,
149 Logger* logger) const override {
150 assert(new_value->empty());
151 ++num_merge_operator_calls;
152 if (existing_value == nullptr) {
153 new_value->assign(value.data(), value.size());
154 return true;
155 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100156
Marc Kupietz4b799e92018-01-02 11:04:56 +0100157 return mergeOperator_->PartialMerge(
158 key,
159 *existing_value,
160 value,
161 new_value,
162 logger);
163 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100164
Marc Kupietz4b799e92018-01-02 11:04:56 +0100165 virtual const char* Name() const override {
166 return "UInt64AddOperator";
167 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100168
Marc Kupietz4b799e92018-01-02 11:04:56 +0100169 private:
170 std::shared_ptr<MergeOperator> mergeOperator_;
171 };
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100172
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100173
Marc Kupietz4b799e92018-01-02 11:04:56 +0100174 class CollocatorIterator : public Iterator {
175 private:
176 char prefixc[sizeof(uint64_t)];
177 Iterator *base_iterator_;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100178
179
Marc Kupietz4b799e92018-01-02 11:04:56 +0100180 public:
181 CollocatorIterator(Iterator* base_iterator)
182 : base_iterator_(base_iterator)
183 {}
184
Marc Kupietz4b799e92018-01-02 11:04:56 +0100185 void setPrefix(char *prefix) {
186 memcpy(prefixc, prefix, sizeof(uint64_t));
187 }
188
189 virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
190 virtual void SeekToLast() { base_iterator_->SeekToLast(); }
191 virtual void Seek(const rocksdb::Slice& s) { base_iterator_->Seek(s); }
192 virtual void Prev() { base_iterator_->Prev(); }
193 virtual void Next() { base_iterator_->Next(); }
194 virtual Slice key() const;
195 virtual Slice value() const;
196 virtual Status status() const;
197 virtual bool Valid() const;
198 bool isValid();
199 uint64_t intValue();
200 uint64_t intKey();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100201
Marc Kupietz4b799e92018-01-02 11:04:56 +0100202 };
Marc Kupietz18375e12017-12-24 10:11:18 +0100203
Marc Kupietz4b799e92018-01-02 11:04:56 +0100204 // rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100205
Marc Kupietz4b799e92018-01-02 11:04:56 +0100206 bool rocksdb::CollocatorIterator::Valid() const {
Marc Kupietz18375e12017-12-24 10:11:18 +0100207 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100208 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100209
Marc Kupietz4b799e92018-01-02 11:04:56 +0100210 bool rocksdb::CollocatorIterator::isValid() {
211 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietzd31254c2018-01-20 21:29:30 +0100212 // return key().starts_with(std::string(prefixc,3));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100213 }
Marc Kupietz18375e12017-12-24 10:11:18 +0100214
Marc Kupietz4b799e92018-01-02 11:04:56 +0100215 uint64_t rocksdb::CollocatorIterator::intKey() {
216 return DecodeFixed64(base_iterator_->key().data());
217 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100218
Marc Kupietz4b799e92018-01-02 11:04:56 +0100219 uint64_t rocksdb::CollocatorIterator::intValue() {
220 return DecodeFixed64(base_iterator_->value().data());
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100221 }
222
// Vocabulary record: a word and its corpus frequency.
// Kept as a plain aggregate because read_vocab() brace-initialises it
// positionally as {word, freq}.
struct VocabEntry {
  std::string word;
  uint64_t freq;
};
228
Marc Kupietz6aec7682018-01-10 09:47:48 +0100229 class CollocatorDB {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100230 private:
231 WriteOptions merge_option_; // for merge
232 char _one[sizeof(uint64_t)];
233 Slice _one_slice;
Marc Kupietz37359b12018-01-09 21:11:37 +0100234 vector<VocabEntry> _vocab;
235 uint64_t total;
236
Marc Kupietz4b799e92018-01-02 11:04:56 +0100237 protected:
238 std::shared_ptr<DB> db_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100239
Marc Kupietz4b799e92018-01-02 11:04:56 +0100240 WriteOptions put_option_;
241 ReadOptions get_option_;
242 WriteOptions delete_option_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100243
Marc Kupietz4b799e92018-01-02 11:04:56 +0100244 uint64_t default_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100245
Marc Kupietz4b799e92018-01-02 11:04:56 +0100246 std::shared_ptr<DB> OpenDb(const char *dbname);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100247 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
Marc Kupietz37359b12018-01-09 21:11:37 +0100248 void read_vocab(string fname);
249
Marc Kupietz4b799e92018-01-02 11:04:56 +0100250 public:
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200251 string getWord(uint32_t w1);
Marc Kupietz6aec7682018-01-10 09:47:48 +0100252 CollocatorDB(const char *db_name, bool read_only);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100253
Marc Kupietz6aec7682018-01-10 09:47:48 +0100254 // public interface of CollocatorDB.
Marc Kupietz4b799e92018-01-02 11:04:56 +0100255 // All four functions return false
256 // if the underlying level db operation failed.
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100257
Marc Kupietz4b799e92018-01-02 11:04:56 +0100258 // mapped to a levedb Put
259 bool set(const std::string& key, uint64_t value) {
260 // just treat the internal rep of int64 as the string
261 char buf[sizeof(value)];
262 EncodeFixed64(buf, value);
263 Slice slice(buf, sizeof(value));
264 auto s = db_->Put(put_option_, key, slice);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100265
Marc Kupietz4b799e92018-01-02 11:04:56 +0100266 if (s.ok()) {
267 return true;
268 } else {
269 std::cerr << s.ToString() << std::endl;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100270 return false;
271 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100272 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100273
274 DB *getDb() {
275 return db_.get();
276 }
277
278 // mapped to a rocksdb Delete
279 bool remove(const std::string& key) {
280 auto s = db_->Delete(delete_option_, key);
281
282 if (s.ok()) {
283 return true;
284 } else {
285 std::cerr << s.ToString() << std::endl;
286 return false;
287 }
288 }
289
290 // mapped to a rocksdb Get
291 bool get(const std::string& key, uint64_t* value) {
292 std::string str;
293 auto s = db_->Get(get_option_, key, &str);
294
295 if (s.IsNotFound()) {
296 // return default value if not found;
297 *value = default_;
298 return true;
299 } else if (s.ok()) {
300 // deserialization
301 if (str.size() != sizeof(uint64_t)) {
302 std::cerr << "value corruption\n";
303 return false;
304 }
305 *value = DecodeFixed64(&str[0]);
306 return true;
307 } else {
308 std::cerr << s.ToString() << std::endl;
309 return false;
310 }
311 }
312
313
314 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
315 char encoded_key[sizeof(uint64_t)];
316 EncodeFixed64(encoded_key, encodeCollocation(w1,w2,dist));
317 uint64_t value = default_;
318 get(std::string(encoded_key, 8), &value);
319 return value;
320 }
321
322 virtual void inc(const std::string& key) {
323 db_->Merge(merge_option_, key, _one_slice);
324 }
325
326 void inc(const uint64_t key) {
327 char encoded_key[sizeof(uint64_t)];
328 EncodeFixed64(encoded_key, key);
329 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
330 }
331
332 virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100333 void dump(uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietz37359b12018-01-09 21:11:37 +0100334 vector<Collocator> get_collocators(uint32_t w1);
Marc Kupietzbd966192018-10-13 14:14:37 +0200335 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200336 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
Marc Kupietzd31254c2018-01-20 21:29:30 +0100337 vector<Collocator> get_collocators_avg(uint32_t w1);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100338 string collocators2json(vector<Collocator> collocators);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100339
Marc Kupietz4b799e92018-01-02 11:04:56 +0100340 // mapped to a rocksdb Merge operation
341 virtual bool add(const std::string& key, uint64_t value) {
342 char encoded[sizeof(uint64_t)];
343 EncodeFixed64(encoded, value);
344 Slice slice(encoded, sizeof(uint64_t));
345 auto s = db_->Merge(merge_option_, key, slice);
346
347 if (s.ok()) {
348 return true;
349 } else {
350 std::cerr << s.ToString() << std::endl;
351 return false;
352 }
353 }
354
355 CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
356 };
357
Marc Kupietz6aec7682018-01-10 09:47:48 +0100358 rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100359 // merge_option_.sync = true;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100360 if(read_only)
361 db_ = OpenDbForRead(db_name);
362 else
363 db_ = OpenDb(db_name);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100364 assert(db_);
365 uint64_t one = 1;
366 EncodeFixed64(_one, one);
367 _one_slice = Slice(_one, sizeof(uint64_t));
368 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100369
Marc Kupietz6aec7682018-01-10 09:47:48 +0100370 void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100371 inc(encodeCollocation(w1, w2, dist));
372 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100373
Marc Kupietz6aec7682018-01-10 09:47:48 +0100374 void rocksdb::CollocatorDB::read_vocab(string fname) {
Marc Kupietz37359b12018-01-09 21:11:37 +0100375 char strbuf[2048];
376 uint64_t freq;
377 FILE *fin = fopen(fname.c_str(), "rb");
378 if (fin == NULL) {
379 cout << "Vocabulary file " << fname <<" not found\n";
380 exit(1);
381 }
382 uint64_t i = 0;
383 while(!feof(fin)) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100384 fscanf(fin, "%s %lu", strbuf, &freq);
Marc Kupietz37359b12018-01-09 21:11:37 +0100385 _vocab.push_back({strbuf, freq});
386 total += freq;
387 i++;
388 }
389 fclose(fin);
390 }
391
Marc Kupietz6aec7682018-01-10 09:47:48 +0100392 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
Marc Kupietz6bb27762018-01-09 17:53:01 +0100393 DB* db;
394 Options options;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100395 options.env->SetBackgroundThreads(4);
396 options.create_if_missing = true;
397 options.merge_operator = std::make_shared<CountMergeOperator>();
398 options.max_successive_merges = 0;
399 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
400 options.IncreaseParallelism();
401 options.OptimizeLevelStyleCompaction();
402 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
Marc Kupietz37359b12018-01-09 21:11:37 +0100403 ostringstream dbname, vocabname;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100404 dbname << name << ".rocksdb";
405 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
406 if (!s.ok()) {
407 std::cerr << s.ToString() << std::endl;
408 assert(false);
409 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100410 vocabname << name << ".vocab";
411 read_vocab(vocabname.str());
Marc Kupietz6bb27762018-01-09 17:53:01 +0100412 return std::shared_ptr<DB>(db);
413 }
414
Marc Kupietz6aec7682018-01-10 09:47:48 +0100415 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100416 DB* db;
417 Options options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100418
419
420 options.env->SetBackgroundThreads(4);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100421 options.create_if_missing = true;
422 options.merge_operator = std::make_shared<CountMergeOperator>();
423 options.max_successive_merges = 0;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100424 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
425 options.IncreaseParallelism();
426 options.OptimizeLevelStyleCompaction();
427 // options.max_write_buffer_number = 48;
428 // options.max_background_jobs = 48;
429 // options.allow_concurrent_memtable_write=true;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100430 // options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
431 // options.enable_write_thread_adaptive_yield = 1;
432 // options.allow_concurrent_memtable_write = 1;
433 // options.memtable_factory.reset(new rocksdb::SkipListFactory);
434 // options.write_buffer_size = 1 << 22;
435 // options.allow_mmap_reads = true;
436 // options.allow_mmap_writes = true;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100437 // options.max_background_compactions = 40;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100438 // BlockBasedTableOptions table_options;
439 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
440 // options.bloom_locality = 1;
441 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
442 // table_options.block_cache = cache;
443 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100444 Status s;
445 // DestroyDB(dbname, Options());
446 s = DB::Open(options, dbname, &db);
447 if (!s.ok()) {
448 std::cerr << s.ToString() << std::endl;
449 assert(false);
450 }
451 return std::shared_ptr<DB>(db);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100452 }
453
Marc Kupietz6aec7682018-01-10 09:47:48 +0100454 CollocatorIterator* rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
Marc Kupietz18375e12017-12-24 10:11:18 +0100455 ReadOptions options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100456 options.prefix_same_as_start = true;
Marc Kupietz18375e12017-12-24 10:11:18 +0100457 char prefixc[sizeof(uint64_t)];
458 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
459 Iterator *it = db_->NewIterator(options);
460 CollocatorIterator *cit = new CollocatorIterator(it);
461 cit->Seek(std::string(prefixc,3));// it->Valid() && it->key().starts_with(std::string(prefixc,3)); it->Next()) {
462 cit->setPrefix(prefixc);
463 return cit;
464 }
465
Marc Kupietz6aec7682018-01-10 09:47:48 +0100466 void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100467 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
468 for (; it->isValid(); it->Next()) {
469 uint64_t value = it->intValue();
470 uint64_t key = it->intKey();
471 std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value << std::endl;
472 }
473 std::cout << "ready dumping\n";
474 }
475
// Log-likelihood ratio of a 2x2 contingency table.
//   f_X_   frequency of the first word
//   uintN  corpus size
//   f_X_Y_ co-occurrence count
//   f_Y_   frequency of the second word
// Returns 0 when the table is degenerate (A, D or N not positive) or when the
// pair co-occurs less often than expected (positive difference coefficient);
// otherwise the LLR value.
double calculateLLR(uint64_t f_X_, uint64_t uintN, uint64_t f_X_Y_, uint64_t f_Y_) {
  // Contingency cells: A = both words, B = X only, C = Y only, D = neither.
  const double N = (double)uintN;
  const double A = (double)f_X_Y_;
  const double B = (double)f_X_ - A;
  const double C = (double)f_Y_ - A;
  const double D = (double)N - A - B - C;

  if (!((A > 0.) && (D > 0.) && (N > 0.))) {
    return 0.0;
  }

  // x*log(x) is taken as 0 for non-positive cells.
  const double BlogB = (B > 0.) ? B*log(B) : 0.0;
  const double ClogC = (C > 0.) ? C*log(C) : 0.0;

  // (expected - observed) / (expected + observed) relative frequency.
  double minusDiffCoeff;
  if (f_X_ == 0) {
    minusDiffCoeff = (double)((-1)*f_X_Y_);
  } else if (f_X_Y_ == 0) {
    minusDiffCoeff = (double)((+1)*f_X_);
  } else {
    const double f_e_ = (double)f_X_ / (double)N;
    const double f_o_ = (double)f_X_Y_ / (double)f_Y_;
    minusDiffCoeff = (f_e_-f_o_)/(f_e_+f_o_);
  }
  if (minusDiffCoeff > 0) {
    return 0;
  }

  /* log likelihood ratio */
  return 2*( A*log(A)
            +BlogB
            +ClogC
            +D*log(D)
            -(A+B)*log(A+B)
            -(A+C)*log(A+C)
            -(B+D)*log(B+D)
            -(C+D)*log(C+D)
            +N*log(N)
            );
}
516
517
518 bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
519 bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
Marc Kupietzd31254c2018-01-20 21:29:30 +0100520 bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100521
Marc Kupietzd31254c2018-01-20 21:29:30 +0100522 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators_avg(uint32_t w1) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100523 std::vector<Collocator> collocators;
524 uint64_t w2, last_w2 = 0xffffffffffffffff;
525 uint64_t sum = 0, total_w1 = 0;
526 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
527 uint64_t value = it->intValue(),
528 key = it->intKey();
529 w2 = W2(key);
530 total_w1 += value;
531 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
532 if (w2 != last_w2) {
533 double pmi = log2( total * ((double) sum) /
Marc Kupietz37359b12018-01-09 21:11:37 +0100534 (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq) * ((double)_vocab[last_w2].freq) ));
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100535 // Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics. In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
Marc Kupietz37359b12018-01-09 21:11:37 +0100536 // double md = log2(pow((double)sum * AVG_WINDOW_SIZE / total, 2) / (AVG_WINDOW_SIZE * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
537 double md = log2((double)sum * sum / ((double) total * AVG_WINDOW_SIZE * AVG_WINDOW_SIZE * _vocab[w1].freq * _vocab[last_w2].freq));
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100538 collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(((double) sum / AVG_WINDOW_SIZE / total))), /* normalize to [-1,1] */
Marc Kupietz37359b12018-01-09 21:11:37 +0100539 calculateLLR(_vocab[w1].freq, total, sum, _vocab[last_w2].freq), md, md + log2((double)sum / AVG_WINDOW_SIZE / total), pmi*sum/total/AVG_WINDOW_SIZE} );
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100540 last_w2 = w2;
541 sum = value;
542 } else {
543 sum += value;
544 }
545 }
546
Marc Kupietzea616ca2018-06-05 11:12:43 +0200547 sort(collocators.begin(), collocators.end(), sortByNpmi);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100548
549 int i=0;
550 for (Collocator c : collocators) {
551 if(i++>10) break;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100552 std::cout << "dont call me w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
Marc Kupietz37359b12018-01-09 21:11:37 +0100553 << "\t f(w1):" << _vocab[w1].freq
554 << "\t f(w2):" << _vocab[c.w2].freq
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100555 << "\t f(w1, x):" << total_w1
Marc Kupietz51f93792018-01-25 08:51:01 +0100556 << "\t f(w1, w2):" << c.raw
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100557 << "\t pmi:" << c.pmi
558 << "\t npmi:" << c.npmi
559 << "\t llr:" << c.llr
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100560 << "\t lfmd:" << c.lfmd
561 << "\t fpmi:" << c.fpmi
562 << "\t total:" << total
563 << std::endl;
564 }
565 return collocators;
566 }
567
Marc Kupietzbd966192018-10-13 14:14:37 +0200568
569 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100570 std::vector<Collocator> collocators;
571 uint64_t w2, last_w2 = 0xffffffffffffffff;
Marc Kupietzbd966192018-10-13 14:14:37 +0200572 uint64_t maxv = 0, left = 0, right = 0;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100573 const double window_size = 1;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100574 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
575 uint64_t value = it->intValue(),
576 key = it->intKey();
Marc Kupietzbd966192018-10-13 14:14:37 +0200577 if((w2 = W2(key)) > max_w2)
578 continue;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100579 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
580 if (w2 != last_w2) {
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100581 double pmi = ca_pmi(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, window_size);
582 double lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, window_size);
583 double left_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
584 double right_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
585 double left_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
586 double right_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
587 collocators.push_back ( {last_w2, maxv, pmi, pmi / (-log2(((double) maxv / window_size / total))), /* normalize to [-1,1] */
588 calculateLLR(_vocab[w1].freq, total, maxv, _vocab[last_w2].freq), lfmd, pmi*maxv/total/window_size,
589 left_lfmd,
590 right_lfmd,
591 left_npmi,
592 right_npmi}
593 );
Marc Kupietzd31254c2018-01-20 21:29:30 +0100594 last_w2 = w2;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100595 maxv = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100596 } else {
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100597 if(value > maxv)
598 maxv = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100599 }
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100600 if(DIST(key) == -1)
601 left = value;
602 else if(DIST(key) == 1)
603 right = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100604 }
605
606 sort(collocators.begin(), collocators.end(), sortByLfmd);
607
Marc Kupietz0779a202018-06-05 11:13:35 +0200608 /*
Marc Kupietzd31254c2018-01-20 21:29:30 +0100609 int i=0;
610 for (Collocator c : collocators) {
611 if(i++>10) break;
612 std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
613 << "\t f(w1):" << _vocab[w1].freq
614 << "\t f(w2):" << _vocab[c.w2].freq
615 << "\t f(w1, x):" << total_w1
Marc Kupietz51f93792018-01-25 08:51:01 +0100616 << "\t f(w1, w2):" << c.raw
Marc Kupietzd31254c2018-01-20 21:29:30 +0100617 << "\t pmi:" << c.pmi
618 << "\t npmi:" << c.npmi
619 << "\t llr:" << c.llr
Marc Kupietzd31254c2018-01-20 21:29:30 +0100620 << "\t lfmd:" << c.lfmd
621 << "\t fpmi:" << c.fpmi
622 << "\t total:" << total
623 << std::endl;
624 }
Marc Kupietz0779a202018-06-05 11:13:35 +0200625 */
Marc Kupietzd31254c2018-01-20 21:29:30 +0100626 return collocators;
627 }
628
Marc Kupietzbd966192018-10-13 14:14:37 +0200629 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
630 return get_collocators(w1, UINT32_MAX);
631 }
632
Marc Kupietz3400aa52018-06-05 10:28:55 +0200633 void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
634 std::vector<Collocator> collocators;
635 std::stringstream stream;
636 uint64_t w2, last_w2 = 0xffffffffffffffff;
637 uint64_t maxv = 0, total_w1 = 0;
638 bool first = true;
639 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
640 uint64_t value = it->intValue(),
641 key = it->intKey();
642 w2 = W2(key);
643 total_w1 += value;
644 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
645 if (w2 != last_w2) {
646 if(maxv >= min_cooccur) {
647 double llr = calculateLLR(_vocab[w1].freq, total, maxv, _vocab[last_w2].freq);
648 if(first)
649 first = false;
650 else
651 stream << " ";
652 stream << w2 << " " << llr;
653 }
654 last_w2 = w2;
655 maxv = value;
656 } else {
657 if(value > maxv)
658 maxv = value;
659 }
660 }
661 if(first)
662 stream << "1 0.0";
663 stream << "\n";
664 std::cout << stream.str();
665 }
666
Marc Kupietz4b799e92018-01-02 11:04:56 +0100667 rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
668 rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
669 rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
670
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100671};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100672
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200673string rocksdb::CollocatorDB::getWord(uint32_t w1) {
674 return _vocab[w1].word;
675}
676
Marc Kupietz6aec7682018-01-10 09:47:48 +0100677string rocksdb::CollocatorDB::collocators2json(vector<Collocator> collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100678 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100679 int i = 0;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100680 s << "[";
681 bool first = true;
682 for (Collocator c : collocators) {
Marc Kupietzb999ec52018-06-05 11:20:46 +0200683 if(strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0) continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100684 if (i++ > 200)
685 break;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100686 if(!first)
687 s << ",\n";
688 else
689 first = false;
690 s << "{"
691 "\"word\":\"" << string(_vocab[c.w2].word) << "\"," <<
692 "\"rank\":" << c.w2 << "," <<
Marc Kupietz51f93792018-01-25 08:51:01 +0100693 "\"f\":" << c.raw << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100694 "\"npmi\":" << c.npmi << "," <<
695 "\"llr\":" << c.llr << "," <<
696 "\"lfmd\":" << c.lfmd << "," <<
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100697 "\"fpmi\":" << c.fpmi << "," <<
698 "\"llfmd\":" << c.left_lfmd << "," <<
699 "\"rlfmd\":" << c.right_lfmd << "," <<
700 "\"lnpmi\":" << c.left_npmi << "," <<
701 "\"rnpmi\":" << c.right_npmi <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100702 "}";
703 }
704 s << "]\n";
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100705 // cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100706 return s.str();
707}
708
Marc Kupietz6aec7682018-01-10 09:47:48 +0100709typedef rocksdb::CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100710
711extern "C" {
Marc Kupietz6aec7682018-01-10 09:47:48 +0100712 COLLOCATORS *open_collocatordb_for_write(char *dbname) {
713 return new rocksdb::CollocatorDB(dbname, false);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100714 }
715
Marc Kupietz6aec7682018-01-10 09:47:48 +0100716 COLLOCATORS *open_collocatordb(char *dbname) {
717 return new rocksdb::CollocatorDB(dbname, true);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100718 }
719
Marc Kupietz6aec7682018-01-10 09:47:48 +0100720 void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100721 db->inc(w1, w2, dist);
722 }
723
724 void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
725 db->dump(w1, w2, dist);
726 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100727
Marc Kupietz37359b12018-01-09 21:11:37 +0100728 void get_collocators(COLLOCATORS *db, uint32_t w1) {
729 db->get_collocators(w1);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100730 }
731
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200732 const char *get_word(COLLOCATORS *db, uint32_t w) {
733 return db->getWord(w).c_str();
734 }
735
Marc Kupietz37359b12018-01-09 21:11:37 +0100736 const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
737 return strdup(db->collocators2json(db->get_collocators(w1)).c_str());
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100738 }
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100739}