blob: 823473947df4eebb0beed7f2cd2f66d42846f9b6 [file] [log] [blame]
Marc Kupietz28cc53e2017-12-23 17:24:55 +01001#include <typeinfo>
Marc Kupietz4b799e92018-01-02 11:04:56 +01002#define EXPORT __attribute__((visibility("visible")))
3#define IMPORT
Marc Kupietz28cc53e2017-12-23 17:24:55 +01004#include <assert.h>
Marc Kupietz37359b12018-01-09 21:11:37 +01005#include <inttypes.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +01006#include <memory>
7#include <iostream>
Marc Kupietzc8ddf452018-01-07 21:33:12 +01008#include <algorithm>
9#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010010#include <stdint.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010011#include <string>
12#include <sstream> // for ostringstream
13#include <math.h>
Marc Kupietzd31254c2018-01-20 21:29:30 +010014#include <rocksdb/cache.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010015#include "rocksdb/comparator.h"
16#include "rocksdb/db.h"
17#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010018#include "rocksdb/table.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010019#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010020#include <rocksdb/slice_transform.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010021#include "rocksdb/utilities/db_ttl.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010022#include "rocksdb/filter_policy.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010023#include "merge_operators.h"
24
Marc Kupietz8cf7e912019-01-21 17:05:23 +010025#define WINDOW_SIZE 5.0
Marc Kupietz98cbcdc2019-01-21 17:11:27 +010026#define FREQUENCY_THRESHOLD 5
Marc Kupietz28cc53e2017-12-23 17:24:55 +010027#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
28#define encodeCollocation(w1, w2, dist) (((uint64_t)dist << 56) | ((uint64_t)w2 << 24) | w1)
Marc Kupietz18375e12017-12-24 10:11:18 +010029#define W1(key) (uint64_t)(key & 0xffffff)
30#define W2(key) (uint64_t)((key >> 24) & 0xffffff)
31#define DIST(key) (int8_t)((uint64_t)((key >> 56) & 0xff))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010032
33typedef struct {
34 uint64_t freq;
35 char *word;
36} vocab_entry;
37
38// typedef struct Collocator {
39// uint64_t w2;
40// uint64_t sum;
41// };
42
Marc Kupietz28cc53e2017-12-23 17:24:55 +010043using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010044using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010045
Marc Kupietz4b799e92018-01-02 11:04:56 +010046namespace rocksdb {
Marc Kupietz4a5e08a2018-06-05 11:07:11 +020047 class Collocator {
48 public:
Marc Kupietzc8ddf452018-01-07 21:33:12 +010049 uint64_t w2;
Marc Kupietz51f93792018-01-25 08:51:01 +010050 uint64_t raw;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010051 double pmi;
52 double npmi;
53 double llr;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010054 double lfmd;
Marc Kupietz41880452019-01-22 15:29:06 +010055 double md;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +010056 double left_lfmd;
57 double right_lfmd;
58 double left_npmi;
59 double right_npmi;
Marc Kupietz41880452019-01-22 15:29:06 +010060 double dice;
61 double logdice;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010062 };
63
Marc Kupietz28cc53e2017-12-23 17:24:55 +010064 size_t num_merge_operator_calls;
65 void resetNumMergeOperatorCalls() { num_merge_operator_calls = 0; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +010066
Marc Kupietz28cc53e2017-12-23 17:24:55 +010067 size_t num_partial_merge_calls;
68 void resetNumPartialMergeCalls() { num_partial_merge_calls = 0; }
Marc Kupietz28cc53e2017-12-23 17:24:55 +010069
70
Marc Kupietz4b799e92018-01-02 11:04:56 +010071 inline void EncodeFixed64(char* buf, uint64_t value) {
72 if (! IS_BIG_ENDIAN) {
73 memcpy(buf, &value, sizeof(value));
74 } else {
75 buf[0] = value & 0xff;
76 buf[1] = (value >> 8) & 0xff;
77 buf[2] = (value >> 16) & 0xff;
78 buf[3] = (value >> 24) & 0xff;
79 buf[4] = (value >> 32) & 0xff;
80 buf[5] = (value >> 40) & 0xff;
81 buf[6] = (value >> 48) & 0xff;
82 buf[7] = (value >> 56) & 0xff;
83 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +010084 }
85
Marc Kupietz4b799e92018-01-02 11:04:56 +010086 inline uint32_t DecodeFixed32(const char* ptr) {
87 if (! IS_BIG_ENDIAN) {
88 // Load the raw bytes
89 uint32_t result;
90 memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
91 return result;
92 } else {
93 return ((static_cast<uint32_t>(static_cast<unsigned char>(ptr[0])))
94 | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[1])) << 8)
95 | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[2])) << 16)
96 | (static_cast<uint32_t>(static_cast<unsigned char>(ptr[3])) << 24));
97 }
98 }
99
100 inline uint64_t DecodeFixed64(const char* ptr) {
101 if (! IS_BIG_ENDIAN) {
102 // Load the raw bytes
103 uint64_t result;
104 memcpy(&result, ptr, sizeof(result)); // gcc optimizes this to a plain load
105 return result;
106 } else {
107 uint64_t lo = DecodeFixed32(ptr);
108 uint64_t hi = DecodeFixed32(ptr + 4);
109 return (hi << 32) | lo;
110 }
111 }
112
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100113 static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100114 double
115 r1 = f1 * window_size,
116 c1 = f2,
117 e = r1 * c1 / total,
118 o = f12;
119 return log2(o/e);
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100120 }
121
Marc Kupietzce0b8b02018-06-05 11:06:39 +0200122 // Bouma, Gerlof (2009): <a href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
123 // Normalized (pointwise) mutual information in collocation extraction</a>. In Proceedings of GSCL.
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100124 static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100125 double
126 r1 = f1 * window_size,
127 c1 = f2,
128 e = r1 * c1 / total,
129 o = f12;
130 if(f12 < FREQUENCY_THRESHOLD)
Marc Kupietz8caf9912018-06-05 10:51:18 +0200131 return -1.0;
132 else
Marc Kupietz1335dd72019-01-22 15:35:21 +0100133 return log2(o/e) / (-log2(o/total/window_size));
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100134 }
135
136 // Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
137 // In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
138 // double md = log2(pow((double)max * window_size / total, 2) / (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
139 static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100140 double
141 r1 = f1 * window_size,
142 c1 = f2,
143 e = r1 * c1 / total,
144 o = f12;
145 return log2(o*o/e);
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100146 }
147
148 static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100149 double
150 r1 = f1 * window_size,
151 c1 = f2,
152 e = r1 * c1 / total,
153 o = f12;
Marc Kupietz8caf9912018-06-05 10:51:18 +0200154 if(f12 == 0)
155 return 0;
156 else
Marc Kupietz1335dd72019-01-22 15:35:21 +0100157 return log2(o*o*o/e);
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100158 }
159
Marc Kupietzbbd236e2019-01-21 16:50:19 +0100160 // Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
161 // Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
162 static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
163 double
164 r1 = (double) w1 * window_size,
165 r2 = (double) n - r1,
166 c1 = w2,
167 c2 = n - c1,
168 o11 = w12, o12 = r1 - o11,
169 o21 = c1 - w12, o22 = r2 - o21,
170 e11 = r1 * c1 / n, e12 = r1 * c2 / n,
171 e21 = r2 * c1 / n, e22 = r2 * c2 / n;
172 return (2 * ( (o11>0? o11 * log(o11/e11):0) + (o12>0? o12 * log(o12/e12):0) + (o21>0? o21 * log(o21/e21):0) + (o22>0? o22 * log(o22/e22):0)));
173 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100174
Marc Kupietz41880452019-01-22 15:29:06 +0100175
176 static inline double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
177 double
178 r1 = (double) w1 * window_size,
179 c1 = w2;
180 return 2 * w12 / (c1+r1);
181 }
182
183 // Rychlý, Pavel (2008): <a href="http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf">A lexicographer-friendly association score.</a> In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
184 static inline double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
185 double
186 e = 0.5,
187 r1 = (double) w1 * window_size,
188 c1 = w2;
189 return 14 + log2(2 * (w12+e) / (c1+e+r1+e));
190 }
191
Marc Kupietz4b799e92018-01-02 11:04:56 +0100192 class CountMergeOperator : public AssociativeMergeOperator {
193 public:
194 CountMergeOperator() {
195 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100196 }
197
Marc Kupietz4b799e92018-01-02 11:04:56 +0100198 virtual bool Merge(const Slice& key,
199 const Slice* existing_value,
200 const Slice& value,
201 std::string* new_value,
202 Logger* logger) const override {
203 assert(new_value->empty());
204 ++num_merge_operator_calls;
205 if (existing_value == nullptr) {
206 new_value->assign(value.data(), value.size());
207 return true;
208 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100209
Marc Kupietz4b799e92018-01-02 11:04:56 +0100210 return mergeOperator_->PartialMerge(
211 key,
212 *existing_value,
213 value,
214 new_value,
215 logger);
216 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100217
Marc Kupietz4b799e92018-01-02 11:04:56 +0100218 virtual const char* Name() const override {
219 return "UInt64AddOperator";
220 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100221
Marc Kupietz4b799e92018-01-02 11:04:56 +0100222 private:
223 std::shared_ptr<MergeOperator> mergeOperator_;
224 };
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100225
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100226
Marc Kupietz4b799e92018-01-02 11:04:56 +0100227 class CollocatorIterator : public Iterator {
228 private:
229 char prefixc[sizeof(uint64_t)];
230 Iterator *base_iterator_;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100231
232
Marc Kupietz4b799e92018-01-02 11:04:56 +0100233 public:
234 CollocatorIterator(Iterator* base_iterator)
235 : base_iterator_(base_iterator)
236 {}
237
Marc Kupietz4b799e92018-01-02 11:04:56 +0100238 void setPrefix(char *prefix) {
239 memcpy(prefixc, prefix, sizeof(uint64_t));
240 }
241
242 virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
243 virtual void SeekToLast() { base_iterator_->SeekToLast(); }
244 virtual void Seek(const rocksdb::Slice& s) { base_iterator_->Seek(s); }
245 virtual void Prev() { base_iterator_->Prev(); }
246 virtual void Next() { base_iterator_->Next(); }
247 virtual Slice key() const;
248 virtual Slice value() const;
249 virtual Status status() const;
250 virtual bool Valid() const;
251 bool isValid();
252 uint64_t intValue();
253 uint64_t intKey();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100254
Marc Kupietz4b799e92018-01-02 11:04:56 +0100255 };
Marc Kupietz18375e12017-12-24 10:11:18 +0100256
Marc Kupietz4b799e92018-01-02 11:04:56 +0100257 // rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100258
Marc Kupietz4b799e92018-01-02 11:04:56 +0100259 bool rocksdb::CollocatorIterator::Valid() const {
Marc Kupietz18375e12017-12-24 10:11:18 +0100260 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100261 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100262
Marc Kupietz4b799e92018-01-02 11:04:56 +0100263 bool rocksdb::CollocatorIterator::isValid() {
264 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietzd31254c2018-01-20 21:29:30 +0100265 // return key().starts_with(std::string(prefixc,3));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100266 }
Marc Kupietz18375e12017-12-24 10:11:18 +0100267
Marc Kupietz4b799e92018-01-02 11:04:56 +0100268 uint64_t rocksdb::CollocatorIterator::intKey() {
269 return DecodeFixed64(base_iterator_->key().data());
270 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100271
Marc Kupietz4b799e92018-01-02 11:04:56 +0100272 uint64_t rocksdb::CollocatorIterator::intValue() {
273 return DecodeFixed64(base_iterator_->value().data());
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100274 }
275
Marc Kupietz37359b12018-01-09 21:11:37 +0100276 class VocabEntry {
277 public:
278 string word;
279 uint64_t freq;
280 };
281
Marc Kupietz6aec7682018-01-10 09:47:48 +0100282 class CollocatorDB {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100283 private:
284 WriteOptions merge_option_; // for merge
285 char _one[sizeof(uint64_t)];
286 Slice _one_slice;
Marc Kupietz37359b12018-01-09 21:11:37 +0100287 vector<VocabEntry> _vocab;
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100288 uint64_t total = 0;
289 uint64_t sentences = 0;
Marc Kupietz8cf7e912019-01-21 17:05:23 +0100290 float avg_window_size = 8.0;
Marc Kupietz37359b12018-01-09 21:11:37 +0100291
Marc Kupietz4b799e92018-01-02 11:04:56 +0100292 protected:
293 std::shared_ptr<DB> db_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100294
Marc Kupietz4b799e92018-01-02 11:04:56 +0100295 WriteOptions put_option_;
296 ReadOptions get_option_;
297 WriteOptions delete_option_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100298
Marc Kupietz4b799e92018-01-02 11:04:56 +0100299 uint64_t default_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100300
Marc Kupietz4b799e92018-01-02 11:04:56 +0100301 std::shared_ptr<DB> OpenDb(const char *dbname);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100302 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
Marc Kupietz37359b12018-01-09 21:11:37 +0100303 void read_vocab(string fname);
304
Marc Kupietz4b799e92018-01-02 11:04:56 +0100305 public:
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200306 string getWord(uint32_t w1);
Marc Kupietz6aec7682018-01-10 09:47:48 +0100307 CollocatorDB(const char *db_name, bool read_only);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100308
Marc Kupietz6aec7682018-01-10 09:47:48 +0100309 // public interface of CollocatorDB.
Marc Kupietz4b799e92018-01-02 11:04:56 +0100310 // All four functions return false
311 // if the underlying level db operation failed.
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100312
Marc Kupietz4b799e92018-01-02 11:04:56 +0100313 // mapped to a levedb Put
314 bool set(const std::string& key, uint64_t value) {
315 // just treat the internal rep of int64 as the string
316 char buf[sizeof(value)];
317 EncodeFixed64(buf, value);
318 Slice slice(buf, sizeof(value));
319 auto s = db_->Put(put_option_, key, slice);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100320
Marc Kupietz4b799e92018-01-02 11:04:56 +0100321 if (s.ok()) {
322 return true;
323 } else {
324 std::cerr << s.ToString() << std::endl;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100325 return false;
326 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100327 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100328
329 DB *getDb() {
330 return db_.get();
331 }
332
333 // mapped to a rocksdb Delete
334 bool remove(const std::string& key) {
335 auto s = db_->Delete(delete_option_, key);
336
337 if (s.ok()) {
338 return true;
339 } else {
340 std::cerr << s.ToString() << std::endl;
341 return false;
342 }
343 }
344
345 // mapped to a rocksdb Get
346 bool get(const std::string& key, uint64_t* value) {
347 std::string str;
348 auto s = db_->Get(get_option_, key, &str);
349
350 if (s.IsNotFound()) {
351 // return default value if not found;
352 *value = default_;
353 return true;
354 } else if (s.ok()) {
355 // deserialization
356 if (str.size() != sizeof(uint64_t)) {
357 std::cerr << "value corruption\n";
358 return false;
359 }
360 *value = DecodeFixed64(&str[0]);
361 return true;
362 } else {
363 std::cerr << s.ToString() << std::endl;
364 return false;
365 }
366 }
367
368
369 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
370 char encoded_key[sizeof(uint64_t)];
371 EncodeFixed64(encoded_key, encodeCollocation(w1,w2,dist));
372 uint64_t value = default_;
373 get(std::string(encoded_key, 8), &value);
374 return value;
375 }
376
377 virtual void inc(const std::string& key) {
378 db_->Merge(merge_option_, key, _one_slice);
379 }
380
381 void inc(const uint64_t key) {
382 char encoded_key[sizeof(uint64_t)];
383 EncodeFixed64(encoded_key, key);
384 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
385 }
386
387 virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100388 void dump(uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietz37359b12018-01-09 21:11:37 +0100389 vector<Collocator> get_collocators(uint32_t w1);
Marc Kupietzbd966192018-10-13 14:14:37 +0200390 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200391 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100392 string collocators2json(vector<Collocator> collocators);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100393
Marc Kupietz4b799e92018-01-02 11:04:56 +0100394 // mapped to a rocksdb Merge operation
395 virtual bool add(const std::string& key, uint64_t value) {
396 char encoded[sizeof(uint64_t)];
397 EncodeFixed64(encoded, value);
398 Slice slice(encoded, sizeof(uint64_t));
399 auto s = db_->Merge(merge_option_, key, slice);
400
401 if (s.ok()) {
402 return true;
403 } else {
404 std::cerr << s.ToString() << std::endl;
405 return false;
406 }
407 }
408
409 CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
410 };
411
Marc Kupietz6aec7682018-01-10 09:47:48 +0100412 rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100413 // merge_option_.sync = true;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100414 if(read_only)
415 db_ = OpenDbForRead(db_name);
416 else
417 db_ = OpenDb(db_name);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100418 assert(db_);
419 uint64_t one = 1;
420 EncodeFixed64(_one, one);
421 _one_slice = Slice(_one, sizeof(uint64_t));
422 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100423
Marc Kupietz6aec7682018-01-10 09:47:48 +0100424 void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100425 inc(encodeCollocation(w1, w2, dist));
426 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100427
Marc Kupietz6aec7682018-01-10 09:47:48 +0100428 void rocksdb::CollocatorDB::read_vocab(string fname) {
Marc Kupietz37359b12018-01-09 21:11:37 +0100429 char strbuf[2048];
430 uint64_t freq;
431 FILE *fin = fopen(fname.c_str(), "rb");
432 if (fin == NULL) {
433 cout << "Vocabulary file " << fname <<" not found\n";
434 exit(1);
435 }
436 uint64_t i = 0;
437 while(!feof(fin)) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100438 fscanf(fin, "%s %lu", strbuf, &freq);
Marc Kupietz37359b12018-01-09 21:11:37 +0100439 _vocab.push_back({strbuf, freq});
440 total += freq;
441 i++;
442 }
443 fclose(fin);
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100444
445 char size_fname[256];
446 strcpy(size_fname, fname.c_str());
447 char *pos = strstr(size_fname, ".vocab");
448 if(pos) {
449 *pos=0;
450 strcat(size_fname, ".size");
451 FILE *fp = fopen(size_fname, "r");
452 if (fp != NULL) {
453 fscanf(fp, "%lu", &sentences);
454 fscanf(fp, "%lu", &total);
455 float sl = (float)total/(float)sentences;
456 float w = WINDOW_SIZE;
457 avg_window_size = ((sl > 2*w? (sl-2*w)*2*w: 0) + (double) w * (3*w -1)) / sl;
458 fprintf(stdout, "Size corrections found: corpus size: %lu tokens in %lu sentences, avg. sentence size: %f, avg. window size: %f\n", total, sentences, sl, avg_window_size);
459 fclose(fp);
460 } else {
461 std::cout << "size file " << size_fname << " not found\n";
462 }
463 } else {
464 std::cout << "cannot determine size file " << size_fname << "\n";
465 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100466 }
467
Marc Kupietz6aec7682018-01-10 09:47:48 +0100468 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
Marc Kupietz6bb27762018-01-09 17:53:01 +0100469 DB* db;
470 Options options;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100471 options.env->SetBackgroundThreads(4);
472 options.create_if_missing = true;
473 options.merge_operator = std::make_shared<CountMergeOperator>();
474 options.max_successive_merges = 0;
475 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
476 options.IncreaseParallelism();
477 options.OptimizeLevelStyleCompaction();
478 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
Marc Kupietz37359b12018-01-09 21:11:37 +0100479 ostringstream dbname, vocabname;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100480 dbname << name << ".rocksdb";
481 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
482 if (!s.ok()) {
483 std::cerr << s.ToString() << std::endl;
484 assert(false);
485 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100486 vocabname << name << ".vocab";
487 read_vocab(vocabname.str());
Marc Kupietz6bb27762018-01-09 17:53:01 +0100488 return std::shared_ptr<DB>(db);
489 }
490
Marc Kupietz6aec7682018-01-10 09:47:48 +0100491 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100492 DB* db;
493 Options options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100494
495
496 options.env->SetBackgroundThreads(4);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100497 options.create_if_missing = true;
498 options.merge_operator = std::make_shared<CountMergeOperator>();
499 options.max_successive_merges = 0;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100500 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
501 options.IncreaseParallelism();
502 options.OptimizeLevelStyleCompaction();
503 // options.max_write_buffer_number = 48;
504 // options.max_background_jobs = 48;
505 // options.allow_concurrent_memtable_write=true;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100506 // options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
507 // options.enable_write_thread_adaptive_yield = 1;
508 // options.allow_concurrent_memtable_write = 1;
509 // options.memtable_factory.reset(new rocksdb::SkipListFactory);
510 // options.write_buffer_size = 1 << 22;
511 // options.allow_mmap_reads = true;
512 // options.allow_mmap_writes = true;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100513 // options.max_background_compactions = 40;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100514 // BlockBasedTableOptions table_options;
515 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
516 // options.bloom_locality = 1;
517 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
518 // table_options.block_cache = cache;
519 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100520 Status s;
521 // DestroyDB(dbname, Options());
522 s = DB::Open(options, dbname, &db);
523 if (!s.ok()) {
524 std::cerr << s.ToString() << std::endl;
525 assert(false);
526 }
527 return std::shared_ptr<DB>(db);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100528 }
529
Marc Kupietz6aec7682018-01-10 09:47:48 +0100530 CollocatorIterator* rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
Marc Kupietz18375e12017-12-24 10:11:18 +0100531 ReadOptions options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100532 options.prefix_same_as_start = true;
Marc Kupietz18375e12017-12-24 10:11:18 +0100533 char prefixc[sizeof(uint64_t)];
534 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
535 Iterator *it = db_->NewIterator(options);
536 CollocatorIterator *cit = new CollocatorIterator(it);
537 cit->Seek(std::string(prefixc,3));// it->Valid() && it->key().starts_with(std::string(prefixc,3)); it->Next()) {
538 cit->setPrefix(prefixc);
539 return cit;
540 }
541
Marc Kupietz6aec7682018-01-10 09:47:48 +0100542 void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100543 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
544 for (; it->isValid(); it->Next()) {
545 uint64_t value = it->intValue();
546 uint64_t key = it->intKey();
547 std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value << std::endl;
548 }
549 std::cout << "ready dumping\n";
550 }
551
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100552 bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
553 bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
Marc Kupietzd31254c2018-01-20 21:29:30 +0100554 bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
Marc Kupietzd91e1d42019-01-22 16:18:32 +0100555 bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) { return lhs.logdoce > rhs.logdice; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100556
Marc Kupietzbd966192018-10-13 14:14:37 +0200557 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100558 std::vector<Collocator> collocators;
559 uint64_t w2, last_w2 = 0xffffffffffffffff;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100560 uint64_t maxv = 0, sum = 0, left = 0, right = 0;
561
Marc Kupietzd31254c2018-01-20 21:29:30 +0100562 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
563 uint64_t value = it->intValue(),
564 key = it->intKey();
Marc Kupietzbd966192018-10-13 14:14:37 +0200565 if((w2 = W2(key)) > max_w2)
566 continue;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100567 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
568 if (w2 != last_w2) {
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100569 if(sum >= FREQUENCY_THRESHOLD) {
570 double o = sum,
571 r1 = (double)_vocab[w1].freq * avg_window_size,
572 c1 = (double)_vocab[last_w2].freq,
573 e = r1 * c1 / total,
574 pmi = log2(o/e),
575 md = log2(o*o/e),
576 lfmd = log2(o*o*o/e),
577 llr = ca_ll((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size);
578 double left_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
579 double right_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
580 double left_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
581 double right_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
Marc Kupietz41880452019-01-22 15:29:06 +0100582 collocators.push_back ( {last_w2, sum, pmi, pmi / (-log2(o/total/avg_window_size)), /* normalize to [-1,1] */
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100583 llr, lfmd, md,
584 left_lfmd,
585 right_lfmd,
586 left_npmi,
Marc Kupietz41880452019-01-22 15:29:06 +0100587 right_npmi,
588 ca_dice((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size),
589 ca_logdice((double)_vocab[w1].freq, (double)_vocab[last_w2].freq, sum, total, avg_window_size)
590 }
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100591 );
592 }
Marc Kupietzd31254c2018-01-20 21:29:30 +0100593 last_w2 = w2;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100594 maxv = value;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100595 sum = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100596 } else {
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100597 sum += value;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100598 if(value > maxv)
599 maxv = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100600 }
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100601 if(DIST(key) == -1)
602 left = value;
603 else if(DIST(key) == 1)
604 right = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100605 }
606
Marc Kupietzd91e1d42019-01-22 16:18:32 +0100607 sort(collocators.begin(), collocators.end(), sortByLogDice);
Marc Kupietzd31254c2018-01-20 21:29:30 +0100608
Marc Kupietz0779a202018-06-05 11:13:35 +0200609 /*
Marc Kupietzd31254c2018-01-20 21:29:30 +0100610 int i=0;
611 for (Collocator c : collocators) {
612 if(i++>10) break;
613 std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
614 << "\t f(w1):" << _vocab[w1].freq
615 << "\t f(w2):" << _vocab[c.w2].freq
616 << "\t f(w1, x):" << total_w1
Marc Kupietz51f93792018-01-25 08:51:01 +0100617 << "\t f(w1, w2):" << c.raw
Marc Kupietzd31254c2018-01-20 21:29:30 +0100618 << "\t pmi:" << c.pmi
619 << "\t npmi:" << c.npmi
620 << "\t llr:" << c.llr
Marc Kupietzd31254c2018-01-20 21:29:30 +0100621 << "\t lfmd:" << c.lfmd
622 << "\t fpmi:" << c.fpmi
623 << "\t total:" << total
624 << std::endl;
625 }
Marc Kupietz0779a202018-06-05 11:13:35 +0200626 */
Marc Kupietzd31254c2018-01-20 21:29:30 +0100627 return collocators;
628 }
629
Marc Kupietzbd966192018-10-13 14:14:37 +0200630 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
631 return get_collocators(w1, UINT32_MAX);
632 }
633
Marc Kupietz3400aa52018-06-05 10:28:55 +0200634 void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
635 std::vector<Collocator> collocators;
636 std::stringstream stream;
637 uint64_t w2, last_w2 = 0xffffffffffffffff;
638 uint64_t maxv = 0, total_w1 = 0;
639 bool first = true;
640 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
641 uint64_t value = it->intValue(),
642 key = it->intKey();
643 w2 = W2(key);
644 total_w1 += value;
645 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
646 if (w2 != last_w2) {
647 if(maxv >= min_cooccur) {
Marc Kupietzbbd236e2019-01-21 16:50:19 +0100648 double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200649 if(first)
650 first = false;
651 else
652 stream << " ";
653 stream << w2 << " " << llr;
654 }
655 last_w2 = w2;
656 maxv = value;
657 } else {
658 if(value > maxv)
659 maxv = value;
660 }
661 }
662 if(first)
663 stream << "1 0.0";
664 stream << "\n";
665 std::cout << stream.str();
666 }
667
Marc Kupietz4b799e92018-01-02 11:04:56 +0100668 rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
669 rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
670 rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
671
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100672};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100673
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200674string rocksdb::CollocatorDB::getWord(uint32_t w1) {
675 return _vocab[w1].word;
676}
677
Marc Kupietz6aec7682018-01-10 09:47:48 +0100678string rocksdb::CollocatorDB::collocators2json(vector<Collocator> collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100679 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100680 int i = 0;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100681 s << "[";
682 bool first = true;
683 for (Collocator c : collocators) {
Marc Kupietzb999ec52018-06-05 11:20:46 +0200684 if(strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0) continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100685 if (i++ > 200)
686 break;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100687 if(!first)
688 s << ",\n";
689 else
690 first = false;
691 s << "{"
692 "\"word\":\"" << string(_vocab[c.w2].word) << "\"," <<
693 "\"rank\":" << c.w2 << "," <<
Marc Kupietz51f93792018-01-25 08:51:01 +0100694 "\"f\":" << c.raw << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100695 "\"npmi\":" << c.npmi << "," <<
Marc Kupietz41880452019-01-22 15:29:06 +0100696 "\"pmi\":" << c.pmi << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100697 "\"llr\":" << c.llr << "," <<
698 "\"lfmd\":" << c.lfmd << "," <<
Marc Kupietz41880452019-01-22 15:29:06 +0100699 "\"md\":" << c.md << "," <<
700 "\"dice\":" << c.dice << "," <<
701 "\"ld\":" << c.logdice << "," <<
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100702 "\"llfmd\":" << c.left_lfmd << "," <<
703 "\"rlfmd\":" << c.right_lfmd << "," <<
704 "\"lnpmi\":" << c.left_npmi << "," <<
705 "\"rnpmi\":" << c.right_npmi <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100706 "}";
707 }
708 s << "]\n";
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100709 // cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100710 return s.str();
711}
712
Marc Kupietz6aec7682018-01-10 09:47:48 +0100713typedef rocksdb::CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100714
715extern "C" {
Marc Kupietz6aec7682018-01-10 09:47:48 +0100716 COLLOCATORS *open_collocatordb_for_write(char *dbname) {
717 return new rocksdb::CollocatorDB(dbname, false);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100718 }
719
Marc Kupietz6aec7682018-01-10 09:47:48 +0100720 COLLOCATORS *open_collocatordb(char *dbname) {
721 return new rocksdb::CollocatorDB(dbname, true);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100722 }
723
Marc Kupietz6aec7682018-01-10 09:47:48 +0100724 void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100725 db->inc(w1, w2, dist);
726 }
727
728 void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
729 db->dump(w1, w2, dist);
730 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100731
Marc Kupietz37359b12018-01-09 21:11:37 +0100732 void get_collocators(COLLOCATORS *db, uint32_t w1) {
733 db->get_collocators(w1);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100734 }
735
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200736 const char *get_word(COLLOCATORS *db, uint32_t w) {
737 return db->getWord(w).c_str();
738 }
739
Marc Kupietz37359b12018-01-09 21:11:37 +0100740 const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
741 return strdup(db->collocators2json(db->get_collocators(w1)).c_str());
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100742 }
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100743}