blob: d62ce8177e4aa91b09c66b20dee432f3d7ee106b [file] [log] [blame]
Marc Kupietz28cc53e2017-12-23 17:24:55 +01001#include <typeinfo>
Marc Kupietz4b799e92018-01-02 11:04:56 +01002#define EXPORT __attribute__((visibility("visible")))
3#define IMPORT
Marc Kupietz28cc53e2017-12-23 17:24:55 +01004#include <assert.h>
Marc Kupietz37359b12018-01-09 21:11:37 +01005#include <inttypes.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +01006#include <memory>
7#include <iostream>
Marc Kupietzc8ddf452018-01-07 21:33:12 +01008#include <algorithm>
9#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010010#include <stdint.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010011#include <string>
12#include <sstream> // for ostringstream
13#include <math.h>
Marc Kupietzd31254c2018-01-20 21:29:30 +010014#include <rocksdb/cache.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010015#include "rocksdb/comparator.h"
16#include "rocksdb/db.h"
17#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010018#include "rocksdb/table.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010019#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010020#include <rocksdb/slice_transform.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010021#include "rocksdb/utilities/db_ttl.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010022#include "rocksdb/filter_policy.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010023#include "merge_operators.h"
24
Marc Kupietz75af60f2019-01-22 22:34:29 +010025#define WINDOW_SIZE 5
Marc Kupietz98cbcdc2019-01-21 17:11:27 +010026#define FREQUENCY_THRESHOLD 5
Marc Kupietz28cc53e2017-12-23 17:24:55 +010027#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
28#define encodeCollocation(w1, w2, dist) (((uint64_t)dist << 56) | ((uint64_t)w2 << 24) | w1)
Marc Kupietz18375e12017-12-24 10:11:18 +010029#define W1(key) (uint64_t)(key & 0xffffff)
30#define W2(key) (uint64_t)((key >> 24) & 0xffffff)
31#define DIST(key) (int8_t)((uint64_t)((key >> 56) & 0xff))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010032
33typedef struct {
34 uint64_t freq;
35 char *word;
36} vocab_entry;
37
38// typedef struct Collocator {
39// uint64_t w2;
40// uint64_t sum;
41// };
42
Marc Kupietz28cc53e2017-12-23 17:24:55 +010043using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010044using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010045
Marc Kupietz4b799e92018-01-02 11:04:56 +010046namespace rocksdb {
Marc Kupietz4a5e08a2018-06-05 11:07:11 +020047 class Collocator {
48 public:
Marc Kupietzc8ddf452018-01-07 21:33:12 +010049 uint64_t w2;
Marc Kupietzcc6c4592019-01-23 10:11:23 +010050 uint64_t f2;
Marc Kupietz51f93792018-01-25 08:51:01 +010051 uint64_t raw;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010052 double pmi;
53 double npmi;
54 double llr;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010055 double lfmd;
Marc Kupietz41880452019-01-22 15:29:06 +010056 double md;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +010057 double left_lfmd;
58 double right_lfmd;
59 double left_npmi;
60 double right_npmi;
Marc Kupietz41880452019-01-22 15:29:06 +010061 double dice;
62 double logdice;
Marc Kupietz75af60f2019-01-22 22:34:29 +010063 double af;
64 int window;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010065 };
66
Marc Kupietz28cc53e2017-12-23 17:24:55 +010067 size_t num_merge_operator_calls;
68 void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +010069
Marc Kupietz28cc53e2017-12-23 17:24:55 +010070 size_t num_partial_merge_calls;
71 void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
Marc Kupietz28cc53e2017-12-23 17:24:55 +010072
73
Marc Kupietz4b799e92018-01-02 11:04:56 +010074 inline void EncodeFixed64(char* buf, uint64_t value) {
75 if (! IS_BIG_ENDIAN) {
76 memcpy(buf, &value, sizeof(value));
77 } else {
78 buf[0] = value & 0xff;
79 buf[1] = (value >> 8) & 0xff;
80 buf[2] = (value >> 16) & 0xff;
81 buf[3] = (value >> 24) & 0xff;
82 buf[4] = (value >> 32) & 0xff;
83 buf[5] = (value >> 40) & 0xff;
84 buf[6] = (value >> 48) & 0xff;
85 buf[7] = (value >> 56) & 0xff;
86 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +010087 }
88
Marc Kupietz4b799e92018-01-02 11:04:56 +010089 inline uint32_t DecodeFixed32(const char* ptr) {
90 if (! IS_BIG_ENDIAN) {
91 // Load the raw bytes
92 uint32_t result;
93 memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
94 return result;
95 } else {
96 return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
97 | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
98 | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
99 | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
100 }
101 }
102
103 inline uint64_t DecodeFixed64(const char* ptr) {
104 if (! IS_BIG_ENDIAN) {
105 // Load the raw bytes
106 uint64_t result;
107 memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
108 return result;
109 } else {
110 uint64_t lo = DecodeFixed32(ptr);
111 uint64_t hi = DecodeFixed32(ptr + 4);
112 return (hi << 32) | lo;
113 }
114 }
115
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100116 static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100117 double
118 r1 = f1 * window_size,
119 c1 = f2,
120 e = r1 * c1 / total,
121 o = f12;
122 return log2(o/e);
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100123 }
124
Marc Kupietzce0b8b02018-06-05 11:06:39 +0200125 // Bouma, Gerlof (2009): <a href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
126 // Normalized (pointwise) mutual information in collocation extraction</a>. In Proceedings of GSCL.
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100127 static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100128 double
129 r1 = f1 * window_size,
130 c1 = f2,
131 e = r1 * c1 / total,
132 o = f12;
133 if(f12 < FREQUENCY_THRESHOLD)
Marc Kupietz8caf9912018-06-05 10:51:18 +0200134 return -1.0;
135 else
Marc Kupietz1335dd72019-01-22 15:35:21 +0100136 return log2(o/e) / (-log2(o/total/window_size));
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100137 }
138
139 // Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
140 // In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
141 // double md = log2(pow((double)max * window_size / total, 2) / (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
142 static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100143 double
144 r1 = f1 * window_size,
145 c1 = f2,
146 e = r1 * c1 / total,
147 o = f12;
148 return log2(o*o/e);
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100149 }
150
151 static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100152 double
153 r1 = f1 * window_size,
154 c1 = f2,
155 e = r1 * c1 / total,
156 o = f12;
Marc Kupietz8caf9912018-06-05 10:51:18 +0200157 if(f12 == 0)
158 return 0;
159 else
Marc Kupietz1335dd72019-01-22 15:35:21 +0100160 return log2(o*o*o/e);
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100161 }
162
Marc Kupietzbbd236e2019-01-21 16:50:19 +0100163 // Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
164 // Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
165 static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
166 double
167 r1 = (double) w1 * window_size,
168 r2 = (double) n - r1,
169 c1 = w2,
170 c2 = n - c1,
171 o11 = w12, o12 = r1 - o11,
172 o21 = c1 - w12, o22 = r2 - o21,
173 e11 = r1 * c1 / n, e12 = r1 * c2 / n,
174 e21 = r2 * c1 / n, e22 = r2 * c2 / n;
175 return (2 * ( (o11>0? o11 * log(o11/e11):0) + (o12>0? o12 * log(o12/e12):0) + (o21>0? o21 * log(o21/e21):0) + (o22>0? o22 * log(o22/e22):0)));
176 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100177
Marc Kupietz41880452019-01-22 15:29:06 +0100178
179 static inline double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
180 double
181 r1 = (double) w1 * window_size,
182 c1 = w2;
183 return 2 * w12 / (c1+r1);
184 }
185
186 // Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
187 static inline double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
188 double
189 e = 0.5,
190 r1 = (double) w1 * window_size,
191 c1 = w2;
192 return 14 + log2(2 * (w12+e) / (c1+e+r1+e));
193 }
194
Marc Kupietz4b799e92018-01-02 11:04:56 +0100195 class CountMergeOperator : public AssociativeMergeOperator {
196 public:
197 CountMergeOperator() {
198 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100199 }
200
Marc Kupietz4b799e92018-01-02 11:04:56 +0100201 virtual bool Merge(const Slice& key,
202 const Slice* existing_value,
203 const Slice& value,
204 std::string* new_value,
205 Logger* logger) const override {
206 assert(new_value->empty());
207 ++num_merge_operator_calls;
208 if (existing_value == nullptr) {
209 new_value->assign(value.data(), value.size());
210 return true;
211 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100212
Marc Kupietz4b799e92018-01-02 11:04:56 +0100213 return mergeOperator_->PartialMerge(
214 key,
215 *existing_value,
216 value,
217 new_value,
218 logger);
219 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100220
Marc Kupietz4b799e92018-01-02 11:04:56 +0100221 virtual const char* Name() const override {
222 return "UInt64AddOperator";
223 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100224
Marc Kupietz4b799e92018-01-02 11:04:56 +0100225 private:
226 std::shared_ptr<MergeOperator> mergeOperator_;
227 };
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100228
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100229
Marc Kupietz4b799e92018-01-02 11:04:56 +0100230 class CollocatorIterator : public Iterator {
231 private:
232 char prefixc[sizeof(uint64_t)];
233 Iterator *base_iterator_;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100234
235
Marc Kupietz4b799e92018-01-02 11:04:56 +0100236 public:
237 CollocatorIterator(Iterator* base_iterator)
238 : base_iterator_(base_iterator)
239 {}
240
Marc Kupietz4b799e92018-01-02 11:04:56 +0100241 void setPrefix(char *prefix) {
242 memcpy(prefixc, prefix, sizeof(uint64_t));
243 }
244
245 virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
246 virtual void SeekToLast() { base_iterator_->SeekToLast(); }
247 virtual void Seek(const rocksdb::Slice& s) { base_iterator_->Seek(s); }
248 virtual void Prev() { base_iterator_->Prev(); }
249 virtual void Next() { base_iterator_->Next(); }
250 virtual Slice key() const;
251 virtual Slice value() const;
252 virtual Status status() const;
253 virtual bool Valid() const;
254 bool isValid();
255 uint64_t intValue();
256 uint64_t intKey();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100257
Marc Kupietz4b799e92018-01-02 11:04:56 +0100258 };
Marc Kupietz18375e12017-12-24 10:11:18 +0100259
Marc Kupietz4b799e92018-01-02 11:04:56 +0100260 // rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100261
Marc Kupietz4b799e92018-01-02 11:04:56 +0100262 bool rocksdb::CollocatorIterator::Valid() const {
Marc Kupietz18375e12017-12-24 10:11:18 +0100263 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100264 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100265
Marc Kupietz4b799e92018-01-02 11:04:56 +0100266 bool rocksdb::CollocatorIterator::isValid() {
267 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietzd31254c2018-01-20 21:29:30 +0100268 // return key().starts_with(std::string(prefixc,3));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100269 }
Marc Kupietz18375e12017-12-24 10:11:18 +0100270
Marc Kupietz4b799e92018-01-02 11:04:56 +0100271 uint64_t rocksdb::CollocatorIterator::intKey() {
272 return DecodeFixed64(base_iterator_->key().data());
273 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100274
Marc Kupietz4b799e92018-01-02 11:04:56 +0100275 uint64_t rocksdb::CollocatorIterator::intValue() {
276 return DecodeFixed64(base_iterator_->value().data());
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100277 }
278
Marc Kupietz37359b12018-01-09 21:11:37 +0100279 class VocabEntry {
280 public:
281 string word;
282 uint64_t freq;
283 };
284
Marc Kupietz6aec7682018-01-10 09:47:48 +0100285 class CollocatorDB {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100286 private:
287 WriteOptions merge_option_; // for merge
288 char _one[sizeof(uint64_t)];
289 Slice _one_slice;
Marc Kupietz37359b12018-01-09 21:11:37 +0100290 vector<VocabEntry> _vocab;
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100291 uint64_t total = 0;
292 uint64_t sentences = 0;
Marc Kupietz8cf7e912019-01-21 17:05:23 +0100293 float avg_window_size = 8.0;
Marc Kupietz37359b12018-01-09 21:11:37 +0100294
Marc Kupietz4b799e92018-01-02 11:04:56 +0100295 protected:
296 std::shared_ptr<DB> db_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100297
Marc Kupietz4b799e92018-01-02 11:04:56 +0100298 WriteOptions put_option_;
299 ReadOptions get_option_;
300 WriteOptions delete_option_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100301
Marc Kupietz4b799e92018-01-02 11:04:56 +0100302 uint64_t default_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100303
Marc Kupietz4b799e92018-01-02 11:04:56 +0100304 std::shared_ptr<DB> OpenDb(const char *dbname);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100305 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
Marc Kupietz37359b12018-01-09 21:11:37 +0100306 void read_vocab(string fname);
307
Marc Kupietz4b799e92018-01-02 11:04:56 +0100308 public:
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200309 string getWord(uint32_t w1);
Marc Kupietz6aec7682018-01-10 09:47:48 +0100310 CollocatorDB(const char *db_name, bool read_only);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100311
Marc Kupietz6aec7682018-01-10 09:47:48 +0100312 // public interface of CollocatorDB.
Marc Kupietz4b799e92018-01-02 11:04:56 +0100313 // All four functions return false
314 // if the underlying level db operation failed.
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100315
Marc Kupietz4b799e92018-01-02 11:04:56 +0100316 // mapped to a levedb Put
317 bool set(const std::string& key, uint64_t value) {
318 // just treat the internal rep of int64 as the string
319 char buf[sizeof(value)];
320 EncodeFixed64(buf, value);
321 Slice slice(buf, sizeof(value));
322 auto s = db_->Put(put_option_, key, slice);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100323
Marc Kupietz4b799e92018-01-02 11:04:56 +0100324 if (s.ok()) {
325 return true;
326 } else {
327 std::cerr << s.ToString() << std::endl;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100328 return false;
329 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100330 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100331
332 DB *getDb() {
333 return db_.get();
334 }
335
336 // mapped to a rocksdb Delete
337 bool remove(const std::string& key) {
338 auto s = db_->Delete(delete_option_, key);
339
340 if (s.ok()) {
341 return true;
342 } else {
343 std::cerr << s.ToString() << std::endl;
344 return false;
345 }
346 }
347
348 // mapped to a rocksdb Get
349 bool get(const std::string& key, uint64_t* value) {
350 std::string str;
351 auto s = db_->Get(get_option_, key, &str);
352
353 if (s.IsNotFound()) {
354 // return default value if not found;
355 *value = default_;
356 return true;
357 } else if (s.ok()) {
358 // deserialization
359 if (str.size() != sizeof(uint64_t)) {
360 std::cerr << "value corruption\n";
361 return false;
362 }
363 *value = DecodeFixed64(&str[0]);
364 return true;
365 } else {
366 std::cerr << s.ToString() << std::endl;
367 return false;
368 }
369 }
370
371
372 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
373 char encoded_key[sizeof(uint64_t)];
374 EncodeFixed64(encoded_key, encodeCollocation(w1,w2,dist));
375 uint64_t value = default_;
376 get(std::string(encoded_key, 8), &value);
377 return value;
378 }
379
380 virtual void inc(const std::string& key) {
381 db_->Merge(merge_option_, key, _one_slice);
382 }
383
384 void inc(const uint64_t key) {
385 char encoded_key[sizeof(uint64_t)];
386 EncodeFixed64(encoded_key, key);
387 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
388 }
389
390 virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100391 void dump(uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietz37359b12018-01-09 21:11:37 +0100392 vector<Collocator> get_collocators(uint32_t w1);
Marc Kupietzbd966192018-10-13 14:14:37 +0200393 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200394 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100395 string collocators2json(vector<Collocator> collocators);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100396
Marc Kupietz4b799e92018-01-02 11:04:56 +0100397 // mapped to a rocksdb Merge operation
398 virtual bool add(const std::string& key, uint64_t value) {
399 char encoded[sizeof(uint64_t)];
400 EncodeFixed64(encoded, value);
401 Slice slice(encoded, sizeof(uint64_t));
402 auto s = db_->Merge(merge_option_, key, slice);
403
404 if (s.ok()) {
405 return true;
406 } else {
407 std::cerr << s.ToString() << std::endl;
408 return false;
409 }
410 }
411
412 CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
413 };
414
Marc Kupietz6aec7682018-01-10 09:47:48 +0100415 rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100416 // merge_option_.sync = true;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100417 if(read_only)
418 db_ = OpenDbForRead(db_name);
419 else
420 db_ = OpenDb(db_name);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100421 assert(db_);
422 uint64_t one = 1;
423 EncodeFixed64(_one, one);
424 _one_slice = Slice(_one, sizeof(uint64_t));
425 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100426
Marc Kupietz6aec7682018-01-10 09:47:48 +0100427 void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100428 inc(encodeCollocation(w1, w2, dist));
429 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100430
Marc Kupietz6aec7682018-01-10 09:47:48 +0100431 void rocksdb::CollocatorDB::read_vocab(string fname) {
Marc Kupietz37359b12018-01-09 21:11:37 +0100432 char strbuf[2048];
433 uint64_t freq;
434 FILE *fin = fopen(fname.c_str(), "rb");
435 if (fin == NULL) {
436 cout << "Vocabulary file " << fname <<" not found\n";
437 exit(1);
438 }
439 uint64_t i = 0;
440 while(!feof(fin)) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100441 fscanf(fin, "%s %lu", strbuf, &freq);
Marc Kupietz37359b12018-01-09 21:11:37 +0100442 _vocab.push_back({strbuf, freq});
443 total += freq;
444 i++;
445 }
446 fclose(fin);
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100447
448 char size_fname[256];
449 strcpy(size_fname, fname.c_str());
450 char *pos = strstr(size_fname, ".vocab");
451 if(pos) {
452 *pos=0;
453 strcat(size_fname, ".size");
454 FILE *fp = fopen(size_fname, "r");
455 if (fp != NULL) {
456 fscanf(fp, "%lu", &sentences);
457 fscanf(fp, "%lu", &total);
458 float sl = (float)total/(float)sentences;
459 float w = WINDOW_SIZE;
460 avg_window_size = ((sl > 2*w? (sl-2*w)*2*w: 0) + (double) w * (3*w -1)) / sl;
461 fprintf(stdout, "Size corrections found: corpus size: %lu tokens in %lu sentences, avg. sentence size: %f, avg. window size: %f\n", total, sentences, sl, avg_window_size);
462 fclose(fp);
463 } else {
464 std::cout << "size file " << size_fname << " not found\n";
465 }
466 } else {
467 std::cout << "cannot determine size file " << size_fname << "\n";
468 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100469 }
470
Marc Kupietz6aec7682018-01-10 09:47:48 +0100471 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
Marc Kupietz6bb27762018-01-09 17:53:01 +0100472 DB* db;
473 Options options;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100474 options.env->SetBackgroundThreads(4);
475 options.create_if_missing = true;
476 options.merge_operator = std::make_shared<CountMergeOperator>();
477 options.max_successive_merges = 0;
478 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
479 options.IncreaseParallelism();
480 options.OptimizeLevelStyleCompaction();
481 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
Marc Kupietz37359b12018-01-09 21:11:37 +0100482 ostringstream dbname, vocabname;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100483 dbname << name << ".rocksdb";
484 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
485 if (!s.ok()) {
486 std::cerr << s.ToString() << std::endl;
487 assert(false);
488 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100489 vocabname << name << ".vocab";
490 read_vocab(vocabname.str());
Marc Kupietz6bb27762018-01-09 17:53:01 +0100491 return std::shared_ptr<DB>(db);
492 }
493
Marc Kupietz6aec7682018-01-10 09:47:48 +0100494 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100495 DB* db;
496 Options options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100497
498
499 options.env->SetBackgroundThreads(4);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100500 options.create_if_missing = true;
501 options.merge_operator = std::make_shared<CountMergeOperator>();
502 options.max_successive_merges = 0;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100503 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
504 options.IncreaseParallelism();
505 options.OptimizeLevelStyleCompaction();
506 // options.max_write_buffer_number = 48;
507 // options.max_background_jobs = 48;
508 // options.allow_concurrent_memtable_write=true;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100509 // options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
510 // options.enable_write_thread_adaptive_yield = 1;
511 // options.allow_concurrent_memtable_write = 1;
512 // options.memtable_factory.reset(new rocksdb::SkipListFactory);
513 // options.write_buffer_size = 1 << 22;
514 // options.allow_mmap_reads = true;
515 // options.allow_mmap_writes = true;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100516 // options.max_background_compactions = 40;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100517 // BlockBasedTableOptions table_options;
518 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
519 // options.bloom_locality = 1;
520 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
521 // table_options.block_cache = cache;
522 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100523 Status s;
524 // DestroyDB(dbname, Options());
525 s = DB::Open(options, dbname, &db);
526 if (!s.ok()) {
527 std::cerr << s.ToString() << std::endl;
528 assert(false);
529 }
530 return std::shared_ptr<DB>(db);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100531 }
532
Marc Kupietz6aec7682018-01-10 09:47:48 +0100533 CollocatorIterator* rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
Marc Kupietz18375e12017-12-24 10:11:18 +0100534 ReadOptions options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100535 options.prefix_same_as_start = true;
Marc Kupietz18375e12017-12-24 10:11:18 +0100536 char prefixc[sizeof(uint64_t)];
537 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
538 Iterator *it = db_->NewIterator(options);
539 CollocatorIterator *cit = new CollocatorIterator(it);
540 cit->Seek(std::string(prefixc,3));// it->Valid() && it->key().starts_with(std::string(prefixc,3)); it->Next()) {
541 cit->setPrefix(prefixc);
542 return cit;
543 }
544
Marc Kupietz6aec7682018-01-10 09:47:48 +0100545 void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100546 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
547 for (; it->isValid(); it->Next()) {
548 uint64_t value = it->intValue();
549 uint64_t key = it->intKey();
550 std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value << std::endl;
551 }
552 std::cout << "ready dumping\n";
553 }
554
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100555 bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
556 bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
Marc Kupietzd31254c2018-01-20 21:29:30 +0100557 bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
Marc Kupietz7e3dfde2019-01-22 16:27:33 +0100558 bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) { return lhs.logdice > rhs.logdice; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100559
Marc Kupietz75af60f2019-01-22 22:34:29 +0100560 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
561 std::vector<Collocator> collocators;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100562 uint64_t w2, last_w2 = 0xffffffffffffffff;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100563 uint64_t maxv = 0, sum = 0, left = 0, right = 0;
Marc Kupietz75af60f2019-01-22 22:34:29 +0100564 uint64_t sumWindow[2*WINDOW_SIZE+1] = {};
Marc Kupietzade33222019-01-22 22:52:44 +0100565 int true_window_size = 1;
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100566 int usedPositions=0;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100567
Marc Kupietzd31254c2018-01-20 21:29:30 +0100568 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
569 uint64_t value = it->intValue(),
570 key = it->intKey();
Marc Kupietzbd966192018-10-13 14:14:37 +0200571 if((w2 = W2(key)) > max_w2)
572 continue;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100573 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
574 if (w2 != last_w2) {
Marc Kupietz75af60f2019-01-22 22:34:29 +0100575 if (sum >= FREQUENCY_THRESHOLD) {
576 uint64_t f1 = _vocab[w1].freq, f2 = _vocab[last_w2].freq;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100577 double o = sum,
Marc Kupietzade33222019-01-22 22:52:44 +0100578 r1 = (double)_vocab[w1].freq * true_window_size,
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100579 c1 = (double)_vocab[last_w2].freq,
580 e = r1 * c1 / total,
581 pmi = log2(o/e),
582 md = log2(o*o/e),
583 lfmd = log2(o*o*o/e),
Marc Kupietzade33222019-01-22 22:52:44 +0100584 llr = ca_ll(f1, f2, sum, total, true_window_size);
Marc Kupietz75af60f2019-01-22 22:34:29 +0100585 double left_lfmd = ca_lfmd(f1, f2, left, total, 1);
586 double right_lfmd = ca_lfmd(f1, f2, right, total, 1);
587 double left_npmi = ca_npmi(f1, f2, left, total, 1);
588 double right_npmi = ca_npmi(f1, f2, right, total, 1);
589
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100590 int bestWindow = usedPositions; // (1 << (2*WINDOW_SIZE)) - 1;
591 double bestAF = ca_logdice(f1, f2, sum, total, true_window_size);
Marc Kupietz75af60f2019-01-22 22:34:29 +0100592 double currentAF;
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100593 if(f1<75000000)
Marc Kupietz75af60f2019-01-22 22:34:29 +0100594 for (int bitmask=1; bitmask < (1 << (2*WINDOW_SIZE)); bitmask++) {
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100595 if((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0) continue;
Marc Kupietz75af60f2019-01-22 22:34:29 +0100596 uint64_t currentWindowSum=0;
597 for (int pos=0; pos < 2*WINDOW_SIZE; pos++) {
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100598 if (((1<<pos) & bitmask & usedPositions) != 0)
Marc Kupietz75af60f2019-01-22 22:34:29 +0100599 currentWindowSum+=sumWindow[pos];
600 }
601 currentAF = ca_logdice(f1, f2, currentWindowSum, total, __builtin_popcount(bitmask));
602 if(currentAF > bestAF) {
603 bestAF = currentAF;
604 bestWindow = bitmask;
605 }
606 }
Marc Kupietzcc6c4592019-01-23 10:11:23 +0100607 collocators.push_back ( {last_w2, f2, sum,
608 pmi, pmi / (-log2(o/total/true_window_size)), /* normalize to [-1,1] */
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100609 llr, lfmd, md,
610 left_lfmd,
611 right_lfmd,
612 left_npmi,
Marc Kupietz41880452019-01-22 15:29:06 +0100613 right_npmi,
Marc Kupietzade33222019-01-22 22:52:44 +0100614 ca_dice(f1, f2, sum, total, true_window_size),
615 ca_logdice(f1, f2, sum, total, true_window_size),
Marc Kupietz75af60f2019-01-22 22:34:29 +0100616 bestAF,
617 bestWindow
Marc Kupietz41880452019-01-22 15:29:06 +0100618 }
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100619 );
620 }
Marc Kupietz75af60f2019-01-22 22:34:29 +0100621 memset(sumWindow, 0, 2*WINDOW_SIZE * sizeof(uint64_t));
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100622 usedPositions = 1 << (-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0));
Marc Kupietz75af60f2019-01-22 22:34:29 +0100623 sumWindow[-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0)] = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100624 last_w2 = w2;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100625 maxv = value;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100626 sum = value;
Marc Kupietzade33222019-01-22 22:52:44 +0100627 true_window_size = 1;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100628 } else {
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100629 sum += value;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100630 if(value > maxv)
631 maxv = value;
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100632 usedPositions |= 1 << (-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0));
Marc Kupietz75af60f2019-01-22 22:34:29 +0100633 sumWindow[-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0)] = value;
Marc Kupietzade33222019-01-22 22:52:44 +0100634 true_window_size++;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100635 }
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100636 if(DIST(key) == -1)
637 left = value;
638 else if(DIST(key) == 1)
639 right = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100640 }
641
Marc Kupietzd91e1d42019-01-22 16:18:32 +0100642 sort(collocators.begin(), collocators.end(), sortByLogDice);
Marc Kupietzd31254c2018-01-20 21:29:30 +0100643
Marc Kupietz0779a202018-06-05 11:13:35 +0200644 /*
Marc Kupietzd31254c2018-01-20 21:29:30 +0100645 int i=0;
646 for (Collocator c : collocators) {
647 if(i++>10) break;
648 std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
649 << "\t f(w1):" << _vocab[w1].freq
650 << "\t f(w2):" << _vocab[c.w2].freq
651 << "\t f(w1, x):" << total_w1
Marc Kupietz51f93792018-01-25 08:51:01 +0100652 << "\t f(w1, w2):" << c.raw
Marc Kupietzd31254c2018-01-20 21:29:30 +0100653 << "\t pmi:" << c.pmi
654 << "\t npmi:" << c.npmi
655 << "\t llr:" << c.llr
Marc Kupietzd31254c2018-01-20 21:29:30 +0100656 << "\t lfmd:" << c.lfmd
657 << "\t fpmi:" << c.fpmi
658 << "\t total:" << total
659 << std::endl;
660 }
Marc Kupietz0779a202018-06-05 11:13:35 +0200661 */
Marc Kupietzd31254c2018-01-20 21:29:30 +0100662 return collocators;
663 }
664
Marc Kupietzbd966192018-10-13 14:14:37 +0200665 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
666 return get_collocators(w1, UINT32_MAX);
667 }
668
Marc Kupietz3400aa52018-06-05 10:28:55 +0200669 void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
670 std::vector<Collocator> collocators;
671 std::stringstream stream;
672 uint64_t w2, last_w2 = 0xffffffffffffffff;
673 uint64_t maxv = 0, total_w1 = 0;
674 bool first = true;
675 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
676 uint64_t value = it->intValue(),
677 key = it->intKey();
678 w2 = W2(key);
679 total_w1 += value;
680 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
681 if (w2 != last_w2) {
682 if(maxv >= min_cooccur) {
Marc Kupietzbbd236e2019-01-21 16:50:19 +0100683 double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200684 if(first)
685 first = false;
686 else
687 stream << " ";
688 stream << w2 << " " << llr;
689 }
690 last_w2 = w2;
691 maxv = value;
692 } else {
693 if(value > maxv)
694 maxv = value;
695 }
696 }
697 if(first)
698 stream << "1 0.0";
699 stream << "\n";
700 std::cout << stream.str();
701 }
702
Marc Kupietz4b799e92018-01-02 11:04:56 +0100703 rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
704 rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
705 rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
706
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100707};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100708
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200709string rocksdb::CollocatorDB::getWord(uint32_t w1) {
710 return _vocab[w1].word;
711}
712
Marc Kupietz6aec7682018-01-10 09:47:48 +0100713string rocksdb::CollocatorDB::collocators2json(vector<Collocator> collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100714 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100715 int i = 0;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100716 s << "[";
717 bool first = true;
718 for (Collocator c : collocators) {
Marc Kupietzb999ec52018-06-05 11:20:46 +0200719 if(strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0) continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100720 if (i++ > 200)
721 break;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100722 if(!first)
723 s << ",\n";
724 else
725 first = false;
726 s << "{"
Marc Kupietz7d9558f2019-01-22 16:26:50 +0100727 "\"word\":\"" << (string(_vocab[c.w2].word).compare("<num>") == 0? string("###") : string(_vocab[c.w2].word)) << "\"," <<
Marc Kupietzcc6c4592019-01-23 10:11:23 +0100728 "\"f2\":" << c.f2 << "," <<
Marc Kupietz51f93792018-01-25 08:51:01 +0100729 "\"f\":" << c.raw << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100730 "\"npmi\":" << c.npmi << "," <<
Marc Kupietz41880452019-01-22 15:29:06 +0100731 "\"pmi\":" << c.pmi << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100732 "\"llr\":" << c.llr << "," <<
733 "\"lfmd\":" << c.lfmd << "," <<
Marc Kupietz41880452019-01-22 15:29:06 +0100734 "\"md\":" << c.md << "," <<
735 "\"dice\":" << c.dice << "," <<
736 "\"ld\":" << c.logdice << "," <<
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100737 "\"llfmd\":" << c.left_lfmd << "," <<
738 "\"rlfmd\":" << c.right_lfmd << "," <<
739 "\"lnpmi\":" << c.left_npmi << "," <<
Marc Kupietz75af60f2019-01-22 22:34:29 +0100740 "\"rnpmi\":" << c.right_npmi << "," <<
741 "\"af\":" << c.af << "," <<
742 "\"win\":" << c.window <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100743 "}";
744 }
745 s << "]\n";
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100746 // cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100747 return s.str();
748}
749
Marc Kupietz6aec7682018-01-10 09:47:48 +0100750typedef rocksdb::CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100751
752extern "C" {
Marc Kupietz6aec7682018-01-10 09:47:48 +0100753 COLLOCATORS *open_collocatordb_for_write(char *dbname) {
754 return new rocksdb::CollocatorDB(dbname, false);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100755 }
756
Marc Kupietz6aec7682018-01-10 09:47:48 +0100757 COLLOCATORS *open_collocatordb(char *dbname) {
758 return new rocksdb::CollocatorDB(dbname, true);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100759 }
760
Marc Kupietz6aec7682018-01-10 09:47:48 +0100761 void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100762 db->inc(w1, w2, dist);
763 }
764
765 void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
766 db->dump(w1, w2, dist);
767 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100768
Marc Kupietz37359b12018-01-09 21:11:37 +0100769 void get_collocators(COLLOCATORS *db, uint32_t w1) {
770 db->get_collocators(w1);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100771 }
772
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200773 const char *get_word(COLLOCATORS *db, uint32_t w) {
774 return db->getWord(w).c_str();
775 }
776
Marc Kupietz37359b12018-01-09 21:11:37 +0100777 const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
778 return strdup(db->collocators2json(db->get_collocators(w1)).c_str());
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100779 }
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100780}