blob: 074a0b54d5e2e99a623a57b15bcf99605b5f2484 [file] [log] [blame]
Marc Kupietz28cc53e2017-12-23 17:24:55 +01001#include <typeinfo>
// Symbol-visibility helpers. GCC/Clang only accept "default", "hidden",
// "internal" or "protected" as visibility values; "default" is the one that
// makes a symbol visible outside the shared object.
#define EXPORT __attribute__((visibility("default")))
#define IMPORT
Marc Kupietz28cc53e2017-12-23 17:24:55 +01004#include <assert.h>
Marc Kupietz37359b12018-01-09 21:11:37 +01005#include <inttypes.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +01006#include <memory>
7#include <iostream>
Marc Kupietzc8ddf452018-01-07 21:33:12 +01008#include <algorithm>
9#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010010#include <stdint.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010011#include <string>
12#include <sstream> // for ostringstream
13#include <math.h>
Marc Kupietzd31254c2018-01-20 21:29:30 +010014#include <rocksdb/cache.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010015#include "rocksdb/comparator.h"
16#include "rocksdb/db.h"
17#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010018#include "rocksdb/table.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010019#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010020#include <rocksdb/slice_transform.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010021#include "rocksdb/utilities/db_ttl.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010022#include "rocksdb/filter_policy.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010023#include "merge_operators.h"
24
// Context window (in tokens, per side) used when deriving the average window size.
#define WINDOW_SIZE 5.0
// Minimum summed co-occurrence count a collocate needs to be reported.
#define FREQUENCY_THRESHOLD 5
// Run-time endianness probe: reads the two bytes "\0\xff" as a uint16_t.
// NOTE(review): this type-puns a string literal; prefer byte-wise code over using it.
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)

// 64-bit collocation key layout: bits 0-23 = w1, bits 24-47 = w2, bits 56-63 = dist.
// All macro arguments are parenthesised so that compound expressions
// (e.g. W1(a | b)) are evaluated as a unit before masking/shifting.
#define encodeCollocation(w1, w2, dist) ((((uint64_t)(dist)) << 56) | (((uint64_t)(w2)) << 24) | ((uint64_t)(w1)))
#define W1(key) ((uint64_t)((key) & 0xffffff))
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
// The distance byte is reinterpreted as signed, so 0xff decodes to -1.
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010032
// C-style vocabulary record: a frequency plus a raw pointer to the word text.
struct vocab_entry {
  uint64_t freq;  // occurrence count of the word
  char *word;     // word text (ownership is not managed by this struct)
};
37
38// typedef struct Collocator {
39// uint64_t w2;
40// uint64_t sum;
41// };
42
Marc Kupietz28cc53e2017-12-23 17:24:55 +010043using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010044using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010045
Marc Kupietz4b799e92018-01-02 11:04:56 +010046namespace rocksdb {
// One collocate of a node word together with its association scores.
// This is a plain aggregate that is brace-initialised positionally, so the
// order of the members below must not be changed.
struct Collocator {
  uint64_t w2;        // id of the collocate
  uint64_t raw;       // raw co-occurrence frequency
  double pmi;         // pointwise mutual information
  double npmi;        // normalized PMI
  double llr;         // log-likelihood ratio
  double lfmd;        // log-frequency biased mutual dependency
  double md;          // mutual dependency
  double left_lfmd;   // lfmd restricted to the left neighbour position
  double right_lfmd;  // lfmd restricted to the right neighbour position
  double left_npmi;   // npmi restricted to the left neighbour position
  double right_npmi;  // npmi restricted to the right neighbour position
  double dice;        // Dice coefficient
  double logdice;     // logDice score
};
63
// Diagnostic counters for merge activity (zero-initialised as namespace-scope objects).
size_t num_merge_operator_calls;
size_t num_partial_merge_calls;

// Sets the full-merge call counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = 0;
}

// Sets the partial-merge call counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = 0;
}
Marc Kupietz28cc53e2017-12-23 17:24:55 +010069
70
// Writes `value` into buf[0..7] in little-endian byte order (least significant
// byte first). Shifting out one byte at a time yields the same bytes on every
// host, so no run-time endianness probe is needed.
// buf must point to at least 8 writable bytes.
inline void EncodeFixed64(char* buf, uint64_t value) {
  for (unsigned i = 0; i < sizeof(value); ++i) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}
85
// Reads a little-endian 32-bit unsigned integer from ptr[0..3].
// Each byte is widened through unsigned char first so that bytes >= 0x80 are
// not sign-extended. Works identically on little- and big-endian hosts.
inline uint32_t DecodeFixed32(const char* ptr) {
  uint32_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
99
// Reads a little-endian 64-bit unsigned integer from ptr[0..7].
// Each byte is widened through unsigned char first so that bytes >= 0x80 are
// not sign-extended. Works identically on little- and big-endian hosts.
inline uint64_t DecodeFixed64(const char* ptr) {
  uint64_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
112
// Pointwise mutual information:
//   log2( total * f12 / (window_size * f1 * f2) )
// f1, f2: individual frequencies; f12: joint frequency; total: corpus size.
static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  const double numerator = total * ((double) f12);
  const double denominator = window_size * ((double) f1) * ((double) f2);
  return log2(numerator / denominator);
}
116
// Normalized pointwise mutual information: the PMI divided by
// -log2(f12 / window_size / total). Returns -1.0 when the pair was never observed.
// Bouma, Gerlof (2009): <a href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
// Normalized (pointwise) mutual information in collocation extraction</a>. In Proceedings of GSCL.
static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  if (f12 != 0) {
    const double pmi = log2( total * ((double) f12) / (window_size * ((double) f1) * ((double)f2) ));
    const double normalizer = -log2(((double) f12 / window_size / total));
    return pmi / normalizer;
  }
  return -1.0;
}
125
// Mutual dependency: log2( f12^2 / (total * window_size^2 * f1 * f2) ).
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
// In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
// Earlier formulation, kept for reference:
// double md = log2(pow((double)max * window_size / total, 2) / (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  const double joint_squared = (double)f12 * f12;
  const double scale = (double) total * window_size * window_size * f1 * f2;
  return log2(joint_squared / scale);
}
132
// Log-frequency biased mutual dependency: the mutual-dependency term plus
// log2(f12 / window_size / total). Returns 0 when the pair was never observed.
static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  if (f12 != 0) {
    const double md_term = log2((double)f12 * f12 / ((double) total * window_size * window_size * f1 * f2));
    const double freq_term = log2((double) f12 / window_size / total);
    return md_term + freq_term;
  }
  return 0;
}
139
// Log-likelihood ratio over the 2x2 contingency table of observed (o..) and
// expected (e..) frequencies.
// w1, w2: individual frequencies; w12: joint frequency; n: corpus size.
// window_size is a double (like in the other ca_* scores) so that a fractional
// average window size is not truncated to an integer.
// Cells with a non-positive observed count contribute 0.
// Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
// Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, double window_size) {
  const double
    r1 = (double) w1 * window_size,
    r2 = (double) n - r1,
    c1 = w2,
    c2 = n - c1,
    o11 = w12, o12 = r1 - o11,
    o21 = c1 - w12, o22 = r2 - o21,
    e11 = r1 * c1 / n, e12 = r1 * c2 / n,
    e21 = r2 * c1 / n, e22 = r2 * c2 / n;
  return (2 * ( (o11>0? o11 * log(o11/e11):0) + (o12>0? o12 * log(o12/e12):0) + (o21>0? o21 * log(o21/e21):0) + (o22>0? o22 * log(o22/e22):0)));
}
Marc Kupietz4b799e92018-01-02 11:04:56 +0100154
Marc Kupietz41880452019-01-22 15:29:06 +0100155
// Dice coefficient: 2 * w12 / (w2 + w1 * window_size).
// window_size is a double so that a fractional average window size is not
// truncated. n is accepted for signature parity with the other scores but unused.
static inline double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, double window_size) {
  const double
    r1 = (double) w1 * window_size,
    c1 = w2;
  (void) n;
  return 2.0 * w12 / (c1+r1);
}
162
// logDice score: 14 + log2 of the Dice coefficient, with 0.5 added to each
// count as smoothing.
// window_size is a double so that a fractional average window size is not
// truncated. n is accepted for signature parity with the other scores but unused.
// Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
static inline double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, double window_size) {
  const double
    e = 0.5,
    r1 = (double) w1 * window_size,
    c1 = w2;
  (void) n;
  return 14 + log2(2 * (w12+e) / (c1+e+r1+e));
}
171
Marc Kupietz4b799e92018-01-02 11:04:56 +0100172 class CountMergeOperator : public AssociativeMergeOperator {
173 public:
174 CountMergeOperator() {
175 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100176 }
177
Marc Kupietz4b799e92018-01-02 11:04:56 +0100178 virtual bool Merge(const Slice& key,
179 const Slice* existing_value,
180 const Slice& value,
181 std::string* new_value,
182 Logger* logger) const override {
183 assert(new_value->empty());
184 ++num_merge_operator_calls;
185 if (existing_value == nullptr) {
186 new_value->assign(value.data(), value.size());
187 return true;
188 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100189
Marc Kupietz4b799e92018-01-02 11:04:56 +0100190 return mergeOperator_->PartialMerge(
191 key,
192 *existing_value,
193 value,
194 new_value,
195 logger);
196 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100197
Marc Kupietz4b799e92018-01-02 11:04:56 +0100198 virtual const char* Name() const override {
199 return "UInt64AddOperator";
200 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100201
Marc Kupietz4b799e92018-01-02 11:04:56 +0100202 private:
203 std::shared_ptr<MergeOperator> mergeOperator_;
204 };
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100205
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100206
Marc Kupietz4b799e92018-01-02 11:04:56 +0100207 class CollocatorIterator : public Iterator {
208 private:
209 char prefixc[sizeof(uint64_t)];
210 Iterator *base_iterator_;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100211
212
Marc Kupietz4b799e92018-01-02 11:04:56 +0100213 public:
214 CollocatorIterator(Iterator* base_iterator)
215 : base_iterator_(base_iterator)
216 {}
217
Marc Kupietz4b799e92018-01-02 11:04:56 +0100218 void setPrefix(char *prefix) {
219 memcpy(prefixc, prefix, sizeof(uint64_t));
220 }
221
222 virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
223 virtual void SeekToLast() { base_iterator_->SeekToLast(); }
224 virtual void Seek(const rocksdb::Slice& s) { base_iterator_->Seek(s); }
225 virtual void Prev() { base_iterator_->Prev(); }
226 virtual void Next() { base_iterator_->Next(); }
227 virtual Slice key() const;
228 virtual Slice value() const;
229 virtual Status status() const;
230 virtual bool Valid() const;
231 bool isValid();
232 uint64_t intValue();
233 uint64_t intKey();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100234
Marc Kupietz4b799e92018-01-02 11:04:56 +0100235 };
Marc Kupietz18375e12017-12-24 10:11:18 +0100236
Marc Kupietz4b799e92018-01-02 11:04:56 +0100237 // rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100238
Marc Kupietz4b799e92018-01-02 11:04:56 +0100239 bool rocksdb::CollocatorIterator::Valid() const {
Marc Kupietz18375e12017-12-24 10:11:18 +0100240 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100241 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100242
Marc Kupietz4b799e92018-01-02 11:04:56 +0100243 bool rocksdb::CollocatorIterator::isValid() {
244 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietzd31254c2018-01-20 21:29:30 +0100245 // return key().starts_with(std::string(prefixc,3));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100246 }
Marc Kupietz18375e12017-12-24 10:11:18 +0100247
Marc Kupietz4b799e92018-01-02 11:04:56 +0100248 uint64_t rocksdb::CollocatorIterator::intKey() {
249 return DecodeFixed64(base_iterator_->key().data());
250 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100251
Marc Kupietz4b799e92018-01-02 11:04:56 +0100252 uint64_t rocksdb::CollocatorIterator::intValue() {
253 return DecodeFixed64(base_iterator_->value().data());
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100254 }
255
// Vocabulary record: word text and its frequency.
// Kept as a plain aggregate with this member order because it is
// brace-initialised positionally as {word, freq}.
struct VocabEntry {
  std::string word;  // surface form
  uint64_t freq;     // occurrence count
};
261
Marc Kupietz6aec7682018-01-10 09:47:48 +0100262 class CollocatorDB {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100263 private:
264 WriteOptions merge_option_; // for merge
265 char _one[sizeof(uint64_t)];
266 Slice _one_slice;
Marc Kupietz37359b12018-01-09 21:11:37 +0100267 vector<VocabEntry> _vocab;
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100268 uint64_t total = 0;
269 uint64_t sentences = 0;
Marc Kupietz8cf7e912019-01-21 17:05:23 +0100270 float avg_window_size = 8.0;
Marc Kupietz37359b12018-01-09 21:11:37 +0100271
Marc Kupietz4b799e92018-01-02 11:04:56 +0100272 protected:
273 std::shared_ptr<DB> db_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100274
Marc Kupietz4b799e92018-01-02 11:04:56 +0100275 WriteOptions put_option_;
276 ReadOptions get_option_;
277 WriteOptions delete_option_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100278
Marc Kupietz4b799e92018-01-02 11:04:56 +0100279 uint64_t default_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100280
Marc Kupietz4b799e92018-01-02 11:04:56 +0100281 std::shared_ptr<DB> OpenDb(const char *dbname);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100282 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
Marc Kupietz37359b12018-01-09 21:11:37 +0100283 void read_vocab(string fname);
284
Marc Kupietz4b799e92018-01-02 11:04:56 +0100285 public:
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200286 string getWord(uint32_t w1);
Marc Kupietz6aec7682018-01-10 09:47:48 +0100287 CollocatorDB(const char *db_name, bool read_only);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100288
Marc Kupietz6aec7682018-01-10 09:47:48 +0100289 // public interface of CollocatorDB.
Marc Kupietz4b799e92018-01-02 11:04:56 +0100290 // All four functions return false
291 // if the underlying level db operation failed.
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100292
Marc Kupietz4b799e92018-01-02 11:04:56 +0100293 // mapped to a levedb Put
294 bool set(const std::string& key, uint64_t value) {
295 // just treat the internal rep of int64 as the string
296 char buf[sizeof(value)];
297 EncodeFixed64(buf, value);
298 Slice slice(buf, sizeof(value));
299 auto s = db_->Put(put_option_, key, slice);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100300
Marc Kupietz4b799e92018-01-02 11:04:56 +0100301 if (s.ok()) {
302 return true;
303 } else {
304 std::cerr << s.ToString() << std::endl;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100305 return false;
306 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100307 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100308
309 DB *getDb() {
310 return db_.get();
311 }
312
313 // mapped to a rocksdb Delete
314 bool remove(const std::string& key) {
315 auto s = db_->Delete(delete_option_, key);
316
317 if (s.ok()) {
318 return true;
319 } else {
320 std::cerr << s.ToString() << std::endl;
321 return false;
322 }
323 }
324
325 // mapped to a rocksdb Get
326 bool get(const std::string& key, uint64_t* value) {
327 std::string str;
328 auto s = db_->Get(get_option_, key, &str);
329
330 if (s.IsNotFound()) {
331 // return default value if not found;
332 *value = default_;
333 return true;
334 } else if (s.ok()) {
335 // deserialization
336 if (str.size() != sizeof(uint64_t)) {
337 std::cerr << "value corruption\n";
338 return false;
339 }
340 *value = DecodeFixed64(&str[0]);
341 return true;
342 } else {
343 std::cerr << s.ToString() << std::endl;
344 return false;
345 }
346 }
347
348
349 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
350 char encoded_key[sizeof(uint64_t)];
351 EncodeFixed64(encoded_key, encodeCollocation(w1,w2,dist));
352 uint64_t value = default_;
353 get(std::string(encoded_key, 8), &value);
354 return value;
355 }
356
357 virtual void inc(const std::string& key) {
358 db_->Merge(merge_option_, key, _one_slice);
359 }
360
361 void inc(const uint64_t key) {
362 char encoded_key[sizeof(uint64_t)];
363 EncodeFixed64(encoded_key, key);
364 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
365 }
366
367 virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100368 void dump(uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietz37359b12018-01-09 21:11:37 +0100369 vector<Collocator> get_collocators(uint32_t w1);
Marc Kupietzbd966192018-10-13 14:14:37 +0200370 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200371 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100372 string collocators2json(vector<Collocator> collocators);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100373
Marc Kupietz4b799e92018-01-02 11:04:56 +0100374 // mapped to a rocksdb Merge operation
375 virtual bool add(const std::string& key, uint64_t value) {
376 char encoded[sizeof(uint64_t)];
377 EncodeFixed64(encoded, value);
378 Slice slice(encoded, sizeof(uint64_t));
379 auto s = db_->Merge(merge_option_, key, slice);
380
381 if (s.ok()) {
382 return true;
383 } else {
384 std::cerr << s.ToString() << std::endl;
385 return false;
386 }
387 }
388
389 CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
390 };
391
Marc Kupietz6aec7682018-01-10 09:47:48 +0100392 rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100393 // merge_option_.sync = true;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100394 if(read_only)
395 db_ = OpenDbForRead(db_name);
396 else
397 db_ = OpenDb(db_name);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100398 assert(db_);
399 uint64_t one = 1;
400 EncodeFixed64(_one, one);
401 _one_slice = Slice(_one, sizeof(uint64_t));
402 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100403
Marc Kupietz6aec7682018-01-10 09:47:48 +0100404 void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100405 inc(encodeCollocation(w1, w2, dist));
406 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100407
Marc Kupietz6aec7682018-01-10 09:47:48 +0100408 void rocksdb::CollocatorDB::read_vocab(string fname) {
Marc Kupietz37359b12018-01-09 21:11:37 +0100409 char strbuf[2048];
410 uint64_t freq;
411 FILE *fin = fopen(fname.c_str(), "rb");
412 if (fin == NULL) {
413 cout << "Vocabulary file " << fname <<" not found\n";
414 exit(1);
415 }
416 uint64_t i = 0;
417 while(!feof(fin)) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100418 fscanf(fin, "%s %lu", strbuf, &freq);
Marc Kupietz37359b12018-01-09 21:11:37 +0100419 _vocab.push_back({strbuf, freq});
420 total += freq;
421 i++;
422 }
423 fclose(fin);
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100424
425 char size_fname[256];
426 strcpy(size_fname, fname.c_str());
427 char *pos = strstr(size_fname, ".vocab");
428 if(pos) {
429 *pos=0;
430 strcat(size_fname, ".size");
431 FILE *fp = fopen(size_fname, "r");
432 if (fp != NULL) {
433 fscanf(fp, "%lu", &sentences);
434 fscanf(fp, "%lu", &total);
435 float sl = (float)total/(float)sentences;
436 float w = WINDOW_SIZE;
437 avg_window_size = ((sl > 2*w? (sl-2*w)*2*w: 0) + (double) w * (3*w -1)) / sl;
438 fprintf(stdout, "Size corrections found: corpus size: %lu tokens in %lu sentences, avg. sentence size: %f, avg. window size: %f\n", total, sentences, sl, avg_window_size);
439 fclose(fp);
440 } else {
441 std::cout << "size file " << size_fname << " not found\n";
442 }
443 } else {
444 std::cout << "cannot determine size file " << size_fname << "\n";
445 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100446 }
447
Marc Kupietz6aec7682018-01-10 09:47:48 +0100448 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
Marc Kupietz6bb27762018-01-09 17:53:01 +0100449 DB* db;
450 Options options;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100451 options.env->SetBackgroundThreads(4);
452 options.create_if_missing = true;
453 options.merge_operator = std::make_shared<CountMergeOperator>();
454 options.max_successive_merges = 0;
455 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
456 options.IncreaseParallelism();
457 options.OptimizeLevelStyleCompaction();
458 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
Marc Kupietz37359b12018-01-09 21:11:37 +0100459 ostringstream dbname, vocabname;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100460 dbname << name << ".rocksdb";
461 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
462 if (!s.ok()) {
463 std::cerr << s.ToString() << std::endl;
464 assert(false);
465 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100466 vocabname << name << ".vocab";
467 read_vocab(vocabname.str());
Marc Kupietz6bb27762018-01-09 17:53:01 +0100468 return std::shared_ptr<DB>(db);
469 }
470
Marc Kupietz6aec7682018-01-10 09:47:48 +0100471 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100472 DB* db;
473 Options options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100474
475
476 options.env->SetBackgroundThreads(4);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100477 options.create_if_missing = true;
478 options.merge_operator = std::make_shared<CountMergeOperator>();
479 options.max_successive_merges = 0;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100480 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
481 options.IncreaseParallelism();
482 options.OptimizeLevelStyleCompaction();
483 // options.max_write_buffer_number = 48;
484 // options.max_background_jobs = 48;
485 // options.allow_concurrent_memtable_write=true;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100486 // options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
487 // options.enable_write_thread_adaptive_yield = 1;
488 // options.allow_concurrent_memtable_write = 1;
489 // options.memtable_factory.reset(new rocksdb::SkipListFactory);
490 // options.write_buffer_size = 1 << 22;
491 // options.allow_mmap_reads = true;
492 // options.allow_mmap_writes = true;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100493 // options.max_background_compactions = 40;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100494 // BlockBasedTableOptions table_options;
495 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
496 // options.bloom_locality = 1;
497 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
498 // table_options.block_cache = cache;
499 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100500 Status s;
501 // DestroyDB(dbname, Options());
502 s = DB::Open(options, dbname, &db);
503 if (!s.ok()) {
504 std::cerr << s.ToString() << std::endl;
505 assert(false);
506 }
507 return std::shared_ptr<DB>(db);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100508 }
509
Marc Kupietz6aec7682018-01-10 09:47:48 +0100510 CollocatorIterator* rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
Marc Kupietz18375e12017-12-24 10:11:18 +0100511 ReadOptions options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100512 options.prefix_same_as_start = true;
Marc Kupietz18375e12017-12-24 10:11:18 +0100513 char prefixc[sizeof(uint64_t)];
514 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
515 Iterator *it = db_->NewIterator(options);
516 CollocatorIterator *cit = new CollocatorIterator(it);
517 cit->Seek(std::string(prefixc,3));// it->Valid() && it->key().starts_with(std::string(prefixc,3)); it->Next()) {
518 cit->setPrefix(prefixc);
519 return cit;
520 }
521
Marc Kupietz6aec7682018-01-10 09:47:48 +0100522 void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100523 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
524 for (; it->isValid(); it->Next()) {
525 uint64_t value = it->intValue();
526 uint64_t key = it->intKey();
527 std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value << std::endl;
528 }
529 std::cout << "ready dumping\n";
530 }
531
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100532 bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
533 bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
Marc Kupietzd31254c2018-01-20 21:29:30 +0100534 bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100535
Marc Kupietzbd966192018-10-13 14:14:37 +0200536 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100537 std::vector<Collocator> collocators;
538 uint64_t w2, last_w2 = 0xffffffffffffffff;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100539 uint64_t maxv = 0, sum = 0, left = 0, right = 0;
540
Marc Kupietzd31254c2018-01-20 21:29:30 +0100541 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
542 uint64_t value = it->intValue(),
543 key = it->intKey();
Marc Kupietzbd966192018-10-13 14:14:37 +0200544 if((w2 = W2(key)) > max_w2)
545 continue;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100546 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
547 if (w2 != last_w2) {
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100548 if(sum >= FREQUENCY_THRESHOLD) {
549 double o = sum,
550 r1 = (double)_vocab[w1].freq * avg_window_size,
551 c1 = (double)_vocab[last_w2].freq,
552 e = r1 * c1 / total,
553 pmi = log2(o/e),
554 md = log2(o*o/e),
555 lfmd = log2(o*o*o/e),
556 llr = ca_ll((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size);
557 double left_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
558 double right_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
559 double left_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
560 double right_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
Marc Kupietz41880452019-01-22 15:29:06 +0100561 collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(o/total/avg_window_size)), /* normalize to [-1,1] */
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100562 llr, lfmd, md,
563 left_lfmd,
564 right_lfmd,
565 left_npmi,
Marc Kupietz41880452019-01-22 15:29:06 +0100566 right_npmi,
567 ca_dice((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size),
568 ca_logdice((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size)
569 }
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100570 );
571 }
Marc Kupietzd31254c2018-01-20 21:29:30 +0100572 last_w2 = w2;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100573 maxv = value;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100574 sum = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100575 } else {
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100576 sum += value;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100577 if(value > maxv)
578 maxv = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100579 }
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100580 if(DIST(key) == -1)
581 left = value;
582 else if(DIST(key) == 1)
583 right = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100584 }
585
586 sort(collocators.begin(), collocators.end(), sortByLfmd);
587
Marc Kupietz0779a202018-06-05 11:13:35 +0200588 /*
Marc Kupietzd31254c2018-01-20 21:29:30 +0100589 int i=0;
590 for (Collocator c : collocators) {
591 if(i++>10) break;
592 std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
593 << "\t f(w1):" << _vocab[w1].freq
594 << "\t f(w2):" << _vocab[c.w2].freq
595 << "\t f(w1, x):" << total_w1
Marc Kupietz51f93792018-01-25 08:51:01 +0100596 << "\t f(w1, w2):" << c.raw
Marc Kupietzd31254c2018-01-20 21:29:30 +0100597 << "\t pmi:" << c.pmi
598 << "\t npmi:" << c.npmi
599 << "\t llr:" << c.llr
Marc Kupietzd31254c2018-01-20 21:29:30 +0100600 << "\t lfmd:" << c.lfmd
601 << "\t fpmi:" << c.fpmi
602 << "\t total:" << total
603 << std::endl;
604 }
Marc Kupietz0779a202018-06-05 11:13:35 +0200605 */
Marc Kupietzd31254c2018-01-20 21:29:30 +0100606 return collocators;
607 }
608
Marc Kupietzbd966192018-10-13 14:14:37 +0200609 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
610 return get_collocators(w1, UINT32_MAX);
611 }
612
Marc Kupietz3400aa52018-06-05 10:28:55 +0200613 void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
614 std::vector<Collocator> collocators;
615 std::stringstream stream;
616 uint64_t w2, last_w2 = 0xffffffffffffffff;
617 uint64_t maxv = 0, total_w1 = 0;
618 bool first = true;
619 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
620 uint64_t value = it->intValue(),
621 key = it->intKey();
622 w2 = W2(key);
623 total_w1 += value;
624 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
625 if (w2 != last_w2) {
626 if(maxv >= min_cooccur) {
Marc Kupietzbbd236e2019-01-21 16:50:19 +0100627 double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200628 if(first)
629 first = false;
630 else
631 stream << " ";
632 stream << w2 << " " << llr;
633 }
634 last_w2 = w2;
635 maxv = value;
636 } else {
637 if(value > maxv)
638 maxv = value;
639 }
640 }
641 if(first)
642 stream << "1 0.0";
643 stream << "\n";
644 std::cout << stream.str();
645 }
646
Marc Kupietz4b799e92018-01-02 11:04:56 +0100647 rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
648 rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
649 rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
650
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100651};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100652
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200653string rocksdb::CollocatorDB::getWord(uint32_t w1) {
654 return _vocab[w1].word;
655}
656
Marc Kupietz6aec7682018-01-10 09:47:48 +0100657string rocksdb::CollocatorDB::collocators2json(vector<Collocator> collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100658 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100659 int i = 0;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100660 s << "[";
661 bool first = true;
662 for (Collocator c : collocators) {
Marc Kupietzb999ec52018-06-05 11:20:46 +0200663 if(strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0) continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100664 if (i++ > 200)
665 break;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100666 if(!first)
667 s << ",\n";
668 else
669 first = false;
670 s << "{"
671 "\"word\":\"" << string(_vocab[c.w2].word) << "\"," <<
672 "\"rank\":" << c.w2 << "," <<
Marc Kupietz51f93792018-01-25 08:51:01 +0100673 "\"f\":" << c.raw << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100674 "\"npmi\":" << c.npmi << "," <<
Marc Kupietz41880452019-01-22 15:29:06 +0100675 "\"pmi\":" << c.pmi << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100676 "\"llr\":" << c.llr << "," <<
677 "\"lfmd\":" << c.lfmd << "," <<
Marc Kupietz41880452019-01-22 15:29:06 +0100678 "\"md\":" << c.md << "," <<
679 "\"dice\":" << c.dice << "," <<
680 "\"ld\":" << c.logdice << "," <<
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100681 "\"llfmd\":" << c.left_lfmd << "," <<
682 "\"rlfmd\":" << c.right_lfmd << "," <<
683 "\"lnpmi\":" << c.left_npmi << "," <<
684 "\"rnpmi\":" << c.right_npmi <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100685 "}";
686 }
687 s << "]\n";
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100688 // cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100689 return s.str();
690}
691
Marc Kupietz6aec7682018-01-10 09:47:48 +0100692typedef rocksdb::CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100693
694extern "C" {
Marc Kupietz6aec7682018-01-10 09:47:48 +0100695 COLLOCATORS *open_collocatordb_for_write(char *dbname) {
696 return new rocksdb::CollocatorDB(dbname, false);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100697 }
698
Marc Kupietz6aec7682018-01-10 09:47:48 +0100699 COLLOCATORS *open_collocatordb(char *dbname) {
700 return new rocksdb::CollocatorDB(dbname, true);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100701 }
702
Marc Kupietz6aec7682018-01-10 09:47:48 +0100703 void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100704 db->inc(w1, w2, dist);
705 }
706
707 void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
708 db->dump(w1, w2, dist);
709 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100710
Marc Kupietz37359b12018-01-09 21:11:37 +0100711 void get_collocators(COLLOCATORS *db, uint32_t w1) {
712 db->get_collocators(w1);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100713 }
714
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200715 const char *get_word(COLLOCATORS *db, uint32_t w) {
716 return db->getWord(w).c_str();
717 }
718
Marc Kupietz37359b12018-01-09 21:11:37 +0100719 const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
720 return strdup(db->collocators2json(db->get_collocators(w1)).c_str());
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100721 }
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100722}