blob: 06ba8c3cac7f4af86b28d2e8b81a21c4e453e252 [file] [log] [blame]
Marc Kupietz28cc53e2017-12-23 17:24:55 +01001#include <typeinfo>
// Symbol-visibility helpers. GCC/Clang accept only "default", "hidden",
// "internal" or "protected" as visibility kinds; "default" is the one that
// exports a symbol from a shared object.
#define EXPORT __attribute__((visibility("default")))
#define IMPORT
Marc Kupietz28cc53e2017-12-23 17:24:55 +01004#include <assert.h>
Marc Kupietz37359b12018-01-09 21:11:37 +01005#include <inttypes.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +01006#include <memory>
7#include <iostream>
Marc Kupietzc8ddf452018-01-07 21:33:12 +01008#include <algorithm>
9#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010010#include <stdint.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010011#include <string>
12#include <sstream> // for ostringstream
13#include <math.h>
Marc Kupietzd31254c2018-01-20 21:29:30 +010014#include <rocksdb/cache.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010015#include "rocksdb/comparator.h"
16#include "rocksdb/db.h"
17#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010018#include "rocksdb/table.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010019#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010020#include <rocksdb/slice_transform.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010021#include "rocksdb/utilities/db_ttl.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010022#include "rocksdb/filter_policy.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010023#include "merge_operators.h"
24
// Window size used by read_vocab() when deriving the average window size.
#define WINDOW_SIZE 5.0
// True when the 16-bit value read from the bytes {0x00, 0xff} is below 0x100.
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
// Packs a collocation into one 64-bit key: w1 in bits 0..23, w2 starting at
// bit 24, dist in bits 56..63. Every argument is parenthesised and widened to
// uint64_t so that arbitrary expressions can be passed safely.
#define encodeCollocation(w1, w2, dist) ((((uint64_t)(dist)) << 56) | (((uint64_t)(w2)) << 24) | ((uint64_t)(w1)))
// Accessors for the three fields of a packed key.
#define W1(key) ((uint64_t)((key) & 0xffffff))
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010031
// Vocabulary record holding a frequency count and a C-string word pointer.
struct vocab_entry {
  uint64_t freq;  // frequency count
  char *word;     // word text as a raw C string pointer
};
36
37// typedef struct Collocator {
38// uint64_t w2;
39// uint64_t sum;
40// };
41
Marc Kupietz28cc53e2017-12-23 17:24:55 +010042using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010043using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010044
Marc Kupietz4b799e92018-01-02 11:04:56 +010045namespace rocksdb {
// One collocate of a target word together with its association scores.
// Kept as a plain aggregate: get_collocators() fills it with a braced
// initializer list, so the member order below must not change.
struct Collocator {
  uint64_t w2;        // id of the collocate word
  uint64_t raw;       // co-occurrence count stored for this collocate
  double pmi;         // pointwise mutual information
  double npmi;        // normalized pmi
  double llr;         // log-likelihood ratio
  double lfmd;        // log-frequency biased mutual dependency
  double fpmi;        // frequency-weighted pmi
  double left_lfmd;   // lfmd from the dist == -1 count
  double right_lfmd;  // lfmd from the dist == +1 count
  double left_npmi;   // npmi from the dist == -1 count
  double right_npmi;  // npmi from the dist == +1 count
};
60
// Counter bumped by CountMergeOperator::Merge on every call.
size_t num_merge_operator_calls;

// Sets the merge-call counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = size_t{0};
}

// Counter for partial merges; nothing in this file increments it.
size_t num_partial_merge_calls;

// Sets the partial-merge counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = size_t{0};
}
Marc Kupietz28cc53e2017-12-23 17:24:55 +010066
67
// Writes `value` into buf[0..7] in little-endian byte order, independent of
// the host's endianness.
//
// The byte-wise form yields the same bytes as the previous memcpy /
// IS_BIG_ENDIAN variant, but does not rely on the IS_BIG_ENDIAN macro (which
// reads a string literal through a uint16_t pointer) and needs no <cstring>.
//
// buf   - destination; must have room for at least 8 bytes.
// value - the integer to serialise.
inline void EncodeFixed64(char* buf, uint64_t value) {
  for (size_t i = 0; i < sizeof(value); ++i) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}
82
// Reads a 32-bit little-endian integer from ptr[0..3], independent of the
// host's endianness.
//
// Assembling the value byte by byte gives the same result as the previous
// memcpy / IS_BIG_ENDIAN variant without depending on the IS_BIG_ENDIAN
// macro (which reads a string literal through a uint16_t pointer) or on
// <cstring>.
//
// ptr - source; at least 4 bytes must be readable.
inline uint32_t DecodeFixed32(const char* ptr) {
  uint32_t result = 0;
  for (size_t i = 0; i < sizeof(result); ++i) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
96
// Reads a 64-bit little-endian integer from ptr[0..7], independent of the
// host's endianness.
//
// Assembling the value byte by byte gives the same result as the previous
// memcpy / IS_BIG_ENDIAN variant without depending on the IS_BIG_ENDIAN
// macro (which reads a string literal through a uint16_t pointer) or on
// <cstring>.
//
// ptr - source; at least 8 bytes must be readable.
inline uint64_t DecodeFixed64(const char* ptr) {
  uint64_t result = 0;
  for (size_t i = 0; i < sizeof(result); ++i) {
    // Go through unsigned char so bytes >= 0x80 are not sign-extended.
    result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
109
// Pointwise mutual information:
//   log2( total * f12 / (window_size * f1 * f2) )
// f1, f2 - frequencies of the two words; f12 - their joint count;
// total  - corpus size.
static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  const double numerator = total * ((double) f12);
  const double denominator = window_size * ((double) f1) * ((double) f2);
  return log2(numerator / denominator);
}
113
// Bouma, Gerlof (2009): <a href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
// Normalized (pointwise) mutual information in collocation extraction</a>. In Proceedings of GSCL.
//
// Returns pmi divided by -log2(f12 / window_size / total); a joint count of
// zero yields -1.0.
static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  if (f12 == 0) {
    return -1.0;
  }
  const double pmi = log2( total * ((double) f12) / (window_size * ((double) f1) * ((double) f2) ));
  const double normalizer = -log2(((double) f12 / window_size / total));
  return pmi / normalizer;
}
122
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
// In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
// double md = log2(pow((double)max * window_size / total, 2) / (window_size * ((double)_vocab[w1].freq/total) * ((double)_vocab[last_w2].freq/total)));
//
// Mutual dependency: log2( f12^2 / (total * window_size^2 * f1 * f2) ).
static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  const double joint_squared = (double) f12 * f12;
  const double scale = (double) total * window_size * window_size * f1 * f2;
  return log2(joint_squared / scale);
}
129
// Log-frequency biased mutual dependency: the mutual dependency term plus
// log2(f12 / window_size / total). A joint count of zero yields 0.
static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  if (f12 == 0) {
    return 0;
  }
  const double md = log2((double) f12 * f12 / ((double) total * window_size * window_size * f1 * f2));
  const double log_frequency = log2((double) f12 / window_size / total);
  return md + log_frequency;
}
136
// Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations. PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
// Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
//
// Log-likelihood ratio over the 2x2 contingency table built from the row
// sums (w1 * window_size, n - that), the column sums (w2, n - w2) and the
// joint count w12. Cells whose observed count is not positive contribute 0.
static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
  // Marginals.
  const double row1 = (double) w1 * window_size;
  const double row2 = (double) n - row1;
  const double col1 = w2;
  const double col2 = n - col1;

  // Observed cell counts.
  const double obs11 = w12;
  const double obs12 = row1 - obs11;
  const double obs21 = col1 - w12;
  const double obs22 = row2 - obs21;

  // Expected cell counts under independence.
  const double exp11 = row1 * col1 / n;
  const double exp12 = row1 * col2 / n;
  const double exp21 = row2 * col1 / n;
  const double exp22 = row2 * col2 / n;

  auto cell = [](double observed, double expected) -> double {
    return observed > 0 ? observed * log(observed / expected) : 0;
  };

  return (2 * (cell(obs11, exp11) + cell(obs12, exp12) + cell(obs21, exp21) + cell(obs22, exp22)));
}
Marc Kupietz4b799e92018-01-02 11:04:56 +0100151
152 class CountMergeOperator : public AssociativeMergeOperator {
153 public:
154 CountMergeOperator() {
155 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100156 }
157
Marc Kupietz4b799e92018-01-02 11:04:56 +0100158 virtual bool Merge(const Slice& key,
159 const Slice* existing_value,
160 const Slice& value,
161 std::string* new_value,
162 Logger* logger) const override {
163 assert(new_value->empty());
164 ++num_merge_operator_calls;
165 if (existing_value == nullptr) {
166 new_value->assign(value.data(), value.size());
167 return true;
168 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100169
Marc Kupietz4b799e92018-01-02 11:04:56 +0100170 return mergeOperator_->PartialMerge(
171 key,
172 *existing_value,
173 value,
174 new_value,
175 logger);
176 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100177
Marc Kupietz4b799e92018-01-02 11:04:56 +0100178 virtual const char* Name() const override {
179 return "UInt64AddOperator";
180 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100181
Marc Kupietz4b799e92018-01-02 11:04:56 +0100182 private:
183 std::shared_ptr<MergeOperator> mergeOperator_;
184 };
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100185
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100186
Marc Kupietz4b799e92018-01-02 11:04:56 +0100187 class CollocatorIterator : public Iterator {
188 private:
189 char prefixc[sizeof(uint64_t)];
190 Iterator *base_iterator_;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100191
192
Marc Kupietz4b799e92018-01-02 11:04:56 +0100193 public:
194 CollocatorIterator(Iterator* base_iterator)
195 : base_iterator_(base_iterator)
196 {}
197
Marc Kupietz4b799e92018-01-02 11:04:56 +0100198 void setPrefix(char *prefix) {
199 memcpy(prefixc, prefix, sizeof(uint64_t));
200 }
201
202 virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
203 virtual void SeekToLast() { base_iterator_->SeekToLast(); }
204 virtual void Seek(const rocksdb::Slice& s) { base_iterator_->Seek(s); }
205 virtual void Prev() { base_iterator_->Prev(); }
206 virtual void Next() { base_iterator_->Next(); }
207 virtual Slice key() const;
208 virtual Slice value() const;
209 virtual Status status() const;
210 virtual bool Valid() const;
211 bool isValid();
212 uint64_t intValue();
213 uint64_t intKey();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100214
Marc Kupietz4b799e92018-01-02 11:04:56 +0100215 };
Marc Kupietz18375e12017-12-24 10:11:18 +0100216
Marc Kupietz4b799e92018-01-02 11:04:56 +0100217 // rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100218
Marc Kupietz4b799e92018-01-02 11:04:56 +0100219 bool rocksdb::CollocatorIterator::Valid() const {
Marc Kupietz18375e12017-12-24 10:11:18 +0100220 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100221 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100222
Marc Kupietz4b799e92018-01-02 11:04:56 +0100223 bool rocksdb::CollocatorIterator::isValid() {
224 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietzd31254c2018-01-20 21:29:30 +0100225 // return key().starts_with(std::string(prefixc,3));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100226 }
Marc Kupietz18375e12017-12-24 10:11:18 +0100227
Marc Kupietz4b799e92018-01-02 11:04:56 +0100228 uint64_t rocksdb::CollocatorIterator::intKey() {
229 return DecodeFixed64(base_iterator_->key().data());
230 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100231
Marc Kupietz4b799e92018-01-02 11:04:56 +0100232 uint64_t rocksdb::CollocatorIterator::intValue() {
233 return DecodeFixed64(base_iterator_->value().data());
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100234 }
235
// Vocabulary entry: a word and its corpus frequency. Kept as an aggregate in
// this member order because read_vocab() builds entries as {word, freq}.
struct VocabEntry {
  std::string word;
  uint64_t freq;
};
241
Marc Kupietz6aec7682018-01-10 09:47:48 +0100242 class CollocatorDB {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100243 private:
244 WriteOptions merge_option_; // for merge
245 char _one[sizeof(uint64_t)];
246 Slice _one_slice;
Marc Kupietz37359b12018-01-09 21:11:37 +0100247 vector<VocabEntry> _vocab;
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100248 uint64_t total = 0;
249 uint64_t sentences = 0;
Marc Kupietz8cf7e912019-01-21 17:05:23 +0100250 float avg_window_size = 8.0;
Marc Kupietz37359b12018-01-09 21:11:37 +0100251
Marc Kupietz4b799e92018-01-02 11:04:56 +0100252 protected:
253 std::shared_ptr<DB> db_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100254
Marc Kupietz4b799e92018-01-02 11:04:56 +0100255 WriteOptions put_option_;
256 ReadOptions get_option_;
257 WriteOptions delete_option_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100258
Marc Kupietz4b799e92018-01-02 11:04:56 +0100259 uint64_t default_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100260
Marc Kupietz4b799e92018-01-02 11:04:56 +0100261 std::shared_ptr<DB> OpenDb(const char *dbname);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100262 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
Marc Kupietz37359b12018-01-09 21:11:37 +0100263 void read_vocab(string fname);
264
Marc Kupietz4b799e92018-01-02 11:04:56 +0100265 public:
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200266 string getWord(uint32_t w1);
Marc Kupietz6aec7682018-01-10 09:47:48 +0100267 CollocatorDB(const char *db_name, bool read_only);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100268
Marc Kupietz6aec7682018-01-10 09:47:48 +0100269 // public interface of CollocatorDB.
Marc Kupietz4b799e92018-01-02 11:04:56 +0100270 // All four functions return false
271 // if the underlying level db operation failed.
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100272
Marc Kupietz4b799e92018-01-02 11:04:56 +0100273 // mapped to a levedb Put
274 bool set(const std::string& key, uint64_t value) {
275 // just treat the internal rep of int64 as the string
276 char buf[sizeof(value)];
277 EncodeFixed64(buf, value);
278 Slice slice(buf, sizeof(value));
279 auto s = db_->Put(put_option_, key, slice);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100280
Marc Kupietz4b799e92018-01-02 11:04:56 +0100281 if (s.ok()) {
282 return true;
283 } else {
284 std::cerr << s.ToString() << std::endl;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100285 return false;
286 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100287 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100288
289 DB *getDb() {
290 return db_.get();
291 }
292
293 // mapped to a rocksdb Delete
294 bool remove(const std::string& key) {
295 auto s = db_->Delete(delete_option_, key);
296
297 if (s.ok()) {
298 return true;
299 } else {
300 std::cerr << s.ToString() << std::endl;
301 return false;
302 }
303 }
304
305 // mapped to a rocksdb Get
306 bool get(const std::string& key, uint64_t* value) {
307 std::string str;
308 auto s = db_->Get(get_option_, key, &str);
309
310 if (s.IsNotFound()) {
311 // return default value if not found;
312 *value = default_;
313 return true;
314 } else if (s.ok()) {
315 // deserialization
316 if (str.size() != sizeof(uint64_t)) {
317 std::cerr << "value corruption\n";
318 return false;
319 }
320 *value = DecodeFixed64(&str[0]);
321 return true;
322 } else {
323 std::cerr << s.ToString() << std::endl;
324 return false;
325 }
326 }
327
328
329 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
330 char encoded_key[sizeof(uint64_t)];
331 EncodeFixed64(encoded_key, encodeCollocation(w1,w2,dist));
332 uint64_t value = default_;
333 get(std::string(encoded_key, 8), &value);
334 return value;
335 }
336
337 virtual void inc(const std::string& key) {
338 db_->Merge(merge_option_, key, _one_slice);
339 }
340
341 void inc(const uint64_t key) {
342 char encoded_key[sizeof(uint64_t)];
343 EncodeFixed64(encoded_key, key);
344 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
345 }
346
347 virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100348 void dump(uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietz37359b12018-01-09 21:11:37 +0100349 vector<Collocator> get_collocators(uint32_t w1);
Marc Kupietzbd966192018-10-13 14:14:37 +0200350 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200351 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100352 string collocators2json(vector<Collocator> collocators);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100353
Marc Kupietz4b799e92018-01-02 11:04:56 +0100354 // mapped to a rocksdb Merge operation
355 virtual bool add(const std::string& key, uint64_t value) {
356 char encoded[sizeof(uint64_t)];
357 EncodeFixed64(encoded, value);
358 Slice slice(encoded, sizeof(uint64_t));
359 auto s = db_->Merge(merge_option_, key, slice);
360
361 if (s.ok()) {
362 return true;
363 } else {
364 std::cerr << s.ToString() << std::endl;
365 return false;
366 }
367 }
368
369 CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
370 };
371
Marc Kupietz6aec7682018-01-10 09:47:48 +0100372 rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100373 // merge_option_.sync = true;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100374 if(read_only)
375 db_ = OpenDbForRead(db_name);
376 else
377 db_ = OpenDb(db_name);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100378 assert(db_);
379 uint64_t one = 1;
380 EncodeFixed64(_one, one);
381 _one_slice = Slice(_one, sizeof(uint64_t));
382 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100383
Marc Kupietz6aec7682018-01-10 09:47:48 +0100384 void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100385 inc(encodeCollocation(w1, w2, dist));
386 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100387
Marc Kupietz6aec7682018-01-10 09:47:48 +0100388 void rocksdb::CollocatorDB::read_vocab(string fname) {
Marc Kupietz37359b12018-01-09 21:11:37 +0100389 char strbuf[2048];
390 uint64_t freq;
391 FILE *fin = fopen(fname.c_str(), "rb");
392 if (fin == NULL) {
393 cout << "Vocabulary file " << fname <<" not found\n";
394 exit(1);
395 }
396 uint64_t i = 0;
397 while(!feof(fin)) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100398 fscanf(fin, "%s %lu", strbuf, &freq);
Marc Kupietz37359b12018-01-09 21:11:37 +0100399 _vocab.push_back({strbuf, freq});
400 total += freq;
401 i++;
402 }
403 fclose(fin);
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100404
405 char size_fname[256];
406 strcpy(size_fname, fname.c_str());
407 char *pos = strstr(size_fname, ".vocab");
408 if(pos) {
409 *pos=0;
410 strcat(size_fname, ".size");
411 FILE *fp = fopen(size_fname, "r");
412 if (fp != NULL) {
413 fscanf(fp, "%lu", &sentences);
414 fscanf(fp, "%lu", &total);
415 float sl = (float)total/(float)sentences;
416 float w = WINDOW_SIZE;
417 avg_window_size = ((sl > 2*w? (sl-2*w)*2*w: 0) + (double) w * (3*w -1)) / sl;
418 fprintf(stdout, "Size corrections found: corpus size: %lu tokens in %lu sentences, avg. sentence size: %f, avg. window size: %f\n", total, sentences, sl, avg_window_size);
419 fclose(fp);
420 } else {
421 std::cout << "size file " << size_fname << " not found\n";
422 }
423 } else {
424 std::cout << "cannot determine size file " << size_fname << "\n";
425 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100426 }
427
Marc Kupietz6aec7682018-01-10 09:47:48 +0100428 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
Marc Kupietz6bb27762018-01-09 17:53:01 +0100429 DB* db;
430 Options options;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100431 options.env->SetBackgroundThreads(4);
432 options.create_if_missing = true;
433 options.merge_operator = std::make_shared<CountMergeOperator>();
434 options.max_successive_merges = 0;
435 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
436 options.IncreaseParallelism();
437 options.OptimizeLevelStyleCompaction();
438 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
Marc Kupietz37359b12018-01-09 21:11:37 +0100439 ostringstream dbname, vocabname;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100440 dbname << name << ".rocksdb";
441 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
442 if (!s.ok()) {
443 std::cerr << s.ToString() << std::endl;
444 assert(false);
445 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100446 vocabname << name << ".vocab";
447 read_vocab(vocabname.str());
Marc Kupietz6bb27762018-01-09 17:53:01 +0100448 return std::shared_ptr<DB>(db);
449 }
450
Marc Kupietz6aec7682018-01-10 09:47:48 +0100451 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100452 DB* db;
453 Options options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100454
455
456 options.env->SetBackgroundThreads(4);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100457 options.create_if_missing = true;
458 options.merge_operator = std::make_shared<CountMergeOperator>();
459 options.max_successive_merges = 0;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100460 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
461 options.IncreaseParallelism();
462 options.OptimizeLevelStyleCompaction();
463 // options.max_write_buffer_number = 48;
464 // options.max_background_jobs = 48;
465 // options.allow_concurrent_memtable_write=true;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100466 // options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
467 // options.enable_write_thread_adaptive_yield = 1;
468 // options.allow_concurrent_memtable_write = 1;
469 // options.memtable_factory.reset(new rocksdb::SkipListFactory);
470 // options.write_buffer_size = 1 << 22;
471 // options.allow_mmap_reads = true;
472 // options.allow_mmap_writes = true;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100473 // options.max_background_compactions = 40;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100474 // BlockBasedTableOptions table_options;
475 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
476 // options.bloom_locality = 1;
477 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
478 // table_options.block_cache = cache;
479 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100480 Status s;
481 // DestroyDB(dbname, Options());
482 s = DB::Open(options, dbname, &db);
483 if (!s.ok()) {
484 std::cerr << s.ToString() << std::endl;
485 assert(false);
486 }
487 return std::shared_ptr<DB>(db);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100488 }
489
Marc Kupietz6aec7682018-01-10 09:47:48 +0100490 CollocatorIterator* rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
Marc Kupietz18375e12017-12-24 10:11:18 +0100491 ReadOptions options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100492 options.prefix_same_as_start = true;
Marc Kupietz18375e12017-12-24 10:11:18 +0100493 char prefixc[sizeof(uint64_t)];
494 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
495 Iterator *it = db_->NewIterator(options);
496 CollocatorIterator *cit = new CollocatorIterator(it);
497 cit->Seek(std::string(prefixc,3));// it->Valid() && it->key().starts_with(std::string(prefixc,3)); it->Next()) {
498 cit->setPrefix(prefixc);
499 return cit;
500 }
501
Marc Kupietz6aec7682018-01-10 09:47:48 +0100502 void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100503 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
504 for (; it->isValid(); it->Next()) {
505 uint64_t value = it->intValue();
506 uint64_t key = it->intKey();
507 std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value << std::endl;
508 }
509 std::cout << "ready dumping\n";
510 }
511
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100512 bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
513 bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
Marc Kupietzd31254c2018-01-20 21:29:30 +0100514 bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100515
Marc Kupietzbd966192018-10-13 14:14:37 +0200516 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t max_w2) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100517 std::vector<Collocator> collocators;
518 uint64_t w2, last_w2 = 0xffffffffffffffff;
Marc Kupietzbd966192018-10-13 14:14:37 +0200519 uint64_t maxv = 0, left = 0, right = 0;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100520 const double window_size = 1;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100521 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
522 uint64_t value = it->intValue(),
523 key = it->intKey();
Marc Kupietzbd966192018-10-13 14:14:37 +0200524 if((w2 = W2(key)) > max_w2)
525 continue;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100526 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
527 if (w2 != last_w2) {
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100528 double pmi = ca_pmi(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, window_size);
529 double lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, window_size);
530 double left_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
531 double right_lfmd = ca_lfmd(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
532 double left_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, left, total, 1);
533 double right_npmi = ca_npmi(_vocab[w1].freq, _vocab[last_w2].freq, right, total, 1);
534 collocators.push_back ( {last_w2, maxv, pmi, pmi / (-log2(((double) maxv / window_size / total))), /* normalize to [-1,1] */
535 calculateLLR(_vocab[w1].freq, total, maxv, _vocab[last_w2].freq), lfmd, pmi*maxv/total/window_size,
536 left_lfmd,
537 right_lfmd,
538 left_npmi,
539 right_npmi}
540 );
Marc Kupietzd31254c2018-01-20 21:29:30 +0100541 last_w2 = w2;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100542 maxv = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100543 } else {
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100544 if(value > maxv)
545 maxv = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100546 }
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100547 if(DIST(key) == -1)
548 left = value;
549 else if(DIST(key) == 1)
550 right = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100551 }
552
553 sort(collocators.begin(), collocators.end(), sortByLfmd);
554
Marc Kupietz0779a202018-06-05 11:13:35 +0200555 /*
Marc Kupietzd31254c2018-01-20 21:29:30 +0100556 int i=0;
557 for (Collocator c : collocators) {
558 if(i++>10) break;
559 std::cout << "w1:" << _vocab[w1].word << ", w2:" << _vocab[c.w2].word
560 << "\t f(w1):" << _vocab[w1].freq
561 << "\t f(w2):" << _vocab[c.w2].freq
562 << "\t f(w1, x):" << total_w1
Marc Kupietz51f93792018-01-25 08:51:01 +0100563 << "\t f(w1, w2):" << c.raw
Marc Kupietzd31254c2018-01-20 21:29:30 +0100564 << "\t pmi:" << c.pmi
565 << "\t npmi:" << c.npmi
566 << "\t llr:" << c.llr
Marc Kupietzd31254c2018-01-20 21:29:30 +0100567 << "\t lfmd:" << c.lfmd
568 << "\t fpmi:" << c.fpmi
569 << "\t total:" << total
570 << std::endl;
571 }
Marc Kupietz0779a202018-06-05 11:13:35 +0200572 */
Marc Kupietzd31254c2018-01-20 21:29:30 +0100573 return collocators;
574 }
575
Marc Kupietzbd966192018-10-13 14:14:37 +0200576 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
577 return get_collocators(w1, UINT32_MAX);
578 }
579
Marc Kupietz3400aa52018-06-05 10:28:55 +0200580 void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
581 std::vector<Collocator> collocators;
582 std::stringstream stream;
583 uint64_t w2, last_w2 = 0xffffffffffffffff;
584 uint64_t maxv = 0, total_w1 = 0;
585 bool first = true;
586 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
587 uint64_t value = it->intValue(),
588 key = it->intKey();
589 w2 = W2(key);
590 total_w1 += value;
591 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
592 if (w2 != last_w2) {
593 if(maxv >= min_cooccur) {
Marc Kupietzbbd236e2019-01-21 16:50:19 +0100594 double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200595 if(first)
596 first = false;
597 else
598 stream << " ";
599 stream << w2 << " " << llr;
600 }
601 last_w2 = w2;
602 maxv = value;
603 } else {
604 if(value > maxv)
605 maxv = value;
606 }
607 }
608 if(first)
609 stream << "1 0.0";
610 stream << "\n";
611 std::cout << stream.str();
612 }
613
Marc Kupietz4b799e92018-01-02 11:04:56 +0100614 rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
615 rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
616 rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
617
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100618};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100619
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200620string rocksdb::CollocatorDB::getWord(uint32_t w1) {
621 return _vocab[w1].word;
622}
623
Marc Kupietz6aec7682018-01-10 09:47:48 +0100624string rocksdb::CollocatorDB::collocators2json(vector<Collocator> collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100625 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100626 int i = 0;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100627 s << "[";
628 bool first = true;
629 for (Collocator c : collocators) {
Marc Kupietzb999ec52018-06-05 11:20:46 +0200630 if(strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0) continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100631 if (i++ > 200)
632 break;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100633 if(!first)
634 s << ",\n";
635 else
636 first = false;
637 s << "{"
638 "\"word\":\"" << string(_vocab[c.w2].word) << "\"," <<
639 "\"rank\":" << c.w2 << "," <<
Marc Kupietz51f93792018-01-25 08:51:01 +0100640 "\"f\":" << c.raw << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100641 "\"npmi\":" << c.npmi << "," <<
642 "\"llr\":" << c.llr << "," <<
643 "\"lfmd\":" << c.lfmd << "," <<
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100644 "\"fpmi\":" << c.fpmi << "," <<
645 "\"llfmd\":" << c.left_lfmd << "," <<
646 "\"rlfmd\":" << c.right_lfmd << "," <<
647 "\"lnpmi\":" << c.left_npmi << "," <<
648 "\"rnpmi\":" << c.right_npmi <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100649 "}";
650 }
651 s << "]\n";
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100652 // cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100653 return s.str();
654}
655
Marc Kupietz6aec7682018-01-10 09:47:48 +0100656typedef rocksdb::CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100657
658extern "C" {
Marc Kupietz6aec7682018-01-10 09:47:48 +0100659 COLLOCATORS *open_collocatordb_for_write(char *dbname) {
660 return new rocksdb::CollocatorDB(dbname, false);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100661 }
662
Marc Kupietz6aec7682018-01-10 09:47:48 +0100663 COLLOCATORS *open_collocatordb(char *dbname) {
664 return new rocksdb::CollocatorDB(dbname, true);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100665 }
666
Marc Kupietz6aec7682018-01-10 09:47:48 +0100667 void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100668 db->inc(w1, w2, dist);
669 }
670
671 void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
672 db->dump(w1, w2, dist);
673 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100674
Marc Kupietz37359b12018-01-09 21:11:37 +0100675 void get_collocators(COLLOCATORS *db, uint32_t w1) {
676 db->get_collocators(w1);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100677 }
678
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200679 const char *get_word(COLLOCATORS *db, uint32_t w) {
680 return db->getWord(w).c_str();
681 }
682
Marc Kupietz37359b12018-01-09 21:11:37 +0100683 const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
684 return strdup(db->collocators2json(db->get_collocators(w1)).c_str());
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100685 }
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100686}