blob: 9a2317325c9a5f48e2089441817f0b34cabbfd17 [file] [log] [blame]
// Symbol-visibility markers for the C-callable API.
// GCC/Clang accept only "default", "hidden", "protected" or "internal" as a
// visibility; "default" is the one that exports the symbol.
#define EXPORT __attribute__((visibility("default")))
#define IMPORT
Marc Kupietz28cc53e2017-12-23 17:24:55 +01003#include <assert.h>
4#include <memory>
5#include <iostream>
Marc Kupietzc8ddf452018-01-07 21:33:12 +01006#include <algorithm>
7#include <vector>
Marc Kupietz28cc53e2017-12-23 17:24:55 +01008#include <stdint.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +01009#include <string>
10#include <sstream> // for ostringstream
11#include <math.h>
Marc Kupietzd31254c2018-01-20 21:29:30 +010012#include <rocksdb/cache.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010013#include "rocksdb/db.h"
14#include "rocksdb/env.h"
Marc Kupietzc8ddf452018-01-07 21:33:12 +010015#include "rocksdb/table.h"
Marc Kupietz28cc53e2017-12-23 17:24:55 +010016#include <rocksdb/merge_operator.h>
Marc Kupietzc8ddf452018-01-07 21:33:12 +010017#include <rocksdb/slice_transform.h>
Marc Kupietz28cc53e2017-12-23 17:24:55 +010018#include "merge_operators.h"
19
// Number of token positions considered on each side of the node word.
#define WINDOW_SIZE 5
// Minimum co-occurrence count below which pairs are not scored.
#define FREQUENCY_THRESHOLD 5

// Non-zero on big-endian hosts. Prefer the compiler-provided byte-order macros;
// the fallback reads a two-byte literal through a uint16_t pointer.
#if defined(__BYTE_ORDER__) && defined(__ORDER_BIG_ENDIAN__)
#define IS_BIG_ENDIAN (__BYTE_ORDER__ == __ORDER_BIG_ENDIAN__)
#else
#define IS_BIG_ENDIAN (*(uint16_t *)"\0\xff" < 0x100)
#endif

// Packs a collocation into one 64-bit key: dist in bits 56..63, w2 from bit 24
// upwards, w1 in the low bits. All arguments are parenthesised so that
// expression arguments (e.g. `a | b`) are packed as a whole.
#define encodeCollocation(w1, w2, dist) (((uint64_t)(dist) << 56) | ((uint64_t)(w2) << 24) | (w1))
// Field extractors for keys built by encodeCollocation (24 bits per word id).
#define W1(key) ((uint64_t)((key) & 0xffffff))
#define W2(key) ((uint64_t)(((key) >> 24) & 0xffffff))
#define DIST(key) ((int8_t)((uint64_t)(((key) >> 56) & 0xff)))
Marc Kupietzc8ddf452018-01-07 21:33:12 +010027
// Plain C-style vocabulary record: a frequency plus a raw pointer to the word.
// Field order is kept so positional initialisation keeps working.
typedef struct {
  uint64_t freq;  // occurrence count
  char *word;     // word form; this record does not manage the pointed-to memory
} vocab_entry;
32
33// typedef struct Collocator {
34// uint64_t w2;
35// uint64_t sum;
36// };
37
Marc Kupietz28cc53e2017-12-23 17:24:55 +010038using namespace rocksdb;
Marc Kupietzc8ddf452018-01-07 21:33:12 +010039using namespace std;
Marc Kupietz28cc53e2017-12-23 17:24:55 +010040
Marc Kupietz4b799e92018-01-02 11:04:56 +010041namespace rocksdb {
// One scored collocate of a node word. This is a plain aggregate: the member
// order is part of the contract because callers brace-initialise it
// positionally.
struct Collocator {
  uint32_t w2;       // id of the collocate
  uint64_t f2;       // frequency of the collocate
  uint64_t raw;      // co-occurrence count
  double pmi;        // pointwise mutual information
  double npmi;       // normalised PMI
  double llr;        // log-likelihood ratio
  double lfmd;       // log-frequency biased mutual dependency
  double md;         // mutual dependency
  double left_raw;   // count in the directly adjacent slot on one side
  double right_raw;  // count in the directly adjacent slot on the other side
  double left_pmi;   // PMI for the slot counted in left_raw
  double right_pmi;  // PMI for the slot counted in right_raw
  double dice;       // Dice coefficient
  double logdice;    // logDice score
  double ldaf;       // best logDice over sub-windows; used as the sort key by sortByLogDiceAF
  int window;        // bitmask of window slots that were observed
  int af_window;     // bitmask of the sub-window that produced ldaf
};
62
// Diagnostic counter of full merge-operator invocations
// (incremented by CountMergeOperator::Merge).
size_t num_merge_operator_calls;

// Sets the merge-call counter back to zero.
void resetNumMergeOperatorCalls() {
  num_merge_operator_calls = 0;
}

// Diagnostic counter of partial-merge invocations.
size_t num_partial_merge_calls;

// Sets the partial-merge counter back to zero.
void resetNumPartialMergeCalls() {
  num_partial_merge_calls = 0;
}
Marc Kupietz28cc53e2017-12-23 17:24:55 +010068
69
// Writes `value` into buf[0..7] in little-endian byte order.
//
// Implemented with shifts only, so the result is the same on every host and the
// function needs neither a run-time endianness probe nor memcpy. Compilers
// typically turn this loop into a single store on little-endian targets.
//
// buf must point to at least 8 writable bytes.
inline void EncodeFixed64(char* buf, uint64_t value) {
  for (unsigned i = 0; i < sizeof(value); ++i) {
    buf[i] = static_cast<char>((value >> (8 * i)) & 0xff);
  }
}
84
// Reads a little-endian 32-bit unsigned integer from ptr[0..3].
//
// Each byte goes through unsigned char first so that bytes >= 0x80 are not
// sign-extended. The shift-based form is host-endianness independent, so no
// run-time endianness probe or memcpy is required.
inline uint32_t DecodeFixed32(const char* ptr) {
  uint32_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint32_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
98
// Reads a little-endian 64-bit unsigned integer from ptr[0..7]; the inverse of
// EncodeFixed64.
//
// Self-contained and host-endianness independent: bytes are widened through
// unsigned char (no sign extension) and shifted into place.
// ptr must point to at least 8 readable bytes.
inline uint64_t DecodeFixed64(const char* ptr) {
  uint64_t result = 0;
  for (unsigned i = 0; i < sizeof(result); ++i) {
    result |= static_cast<uint64_t>(static_cast<unsigned char>(ptr[i])) << (8 * i);
  }
  return result;
}
111
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100112 static inline double ca_pmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100113 double
114 r1 = f1 * window_size,
115 c1 = f2,
116 e = r1 * c1 / total,
117 o = f12;
Marc Kupietzf4a649a2021-02-26 09:18:01 +0100118 if(f12 < FREQUENCY_THRESHOLD)
119 return -1.0;
120 else
121 return log2(o/e);
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100122 }
123
Marc Kupietzce0b8b02018-06-05 11:06:39 +0200124 // Bouma, Gerlof (2009): <a href="https://svn.spraakdata.gu.se/repos/gerlof/pub/www/Docs/npmi-pfd.pdf">
125 // Normalized (pointwise) mutual information in collocation extraction</a>. In Proceedings of GSCL.
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100126 static inline double ca_npmi(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
Marc Kupietz1335dd72019-01-22 15:35:21 +0100127 double
128 r1 = f1 * window_size,
129 c1 = f2,
130 e = r1 * c1 / total,
131 o = f12;
132 if(f12 < FREQUENCY_THRESHOLD)
Marc Kupietz8caf9912018-06-05 10:51:18 +0200133 return -1.0;
134 else
Marc Kupietz1335dd72019-01-22 15:35:21 +0100135 return log2(o/e) / (-log2(o/total/window_size));
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100136 }
137
// Mutual dependency: log2(observed^2 / expected).
// Thanopoulos, A., Fakotakis, N., Kokkinakis, G.: Comparative evaluation of collocation extraction metrics.
// In: International Conference on Language Resources and Evaluation (LREC-2002). (2002) 620–625
//
// No frequency threshold is applied here; f12 == 0 gives log2(0).
static inline double ca_md(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  const double row_total = f1 * window_size;
  const double col_total = f2;
  const double expected = row_total * col_total / total;
  const double observed = f12;
  return log2(observed * observed / expected);
}
149
// Log-frequency biased mutual dependency: log2(observed^3 / expected).
// Returns 0 when the pair never co-occurs (f12 == 0).
static inline double ca_lfmd(uint64_t f1, uint64_t f2, uint64_t f12, uint64_t total, double window_size) {
  if (f12 == 0) {
    return 0;
  }
  const double row_total = f1 * window_size;
  const double col_total = f2;
  const double expected = row_total * col_total / total;
  const double observed = f12;
  return log2(observed * observed * observed / expected);
}
161
// Log-likelihood ratio over the 2x2 contingency table of the pair.
// Evert, Stefan (2004): The Statistics of Word Cooccurrences: Word Pairs and Collocations.
// PhD dissertation, IMS, University of Stuttgart. Published in 2005, URN urn:nbn:de:bsz:93-opus-23714.
// Free PDF available from http://purl.org/stefan.evert/PUB/Evert2004phd.pdf
//
//   w1, w2      frequencies of the two words
//   w12         co-occurrence count
//   n           corpus size
//   window_size number of window positions; scales the row total of w1
// Cells with a non-positive observed count contribute nothing to the sum.
static inline double ca_ll(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
  // Marginals.
  const double r1 = (double) w1 * window_size;
  const double r2 = (double) n - r1;
  const double c1 = w2;
  const double c2 = n - c1;

  // Observed cells.
  const double o11 = w12;
  const double o12 = r1 - o11;
  const double o21 = c1 - w12;
  const double o22 = r2 - o21;

  // Expected cells under independence.
  const double e11 = r1 * c1 / n;
  const double e12 = r1 * c2 / n;
  const double e21 = r2 * c1 / n;
  const double e22 = r2 * c2 / n;

  // One cell's contribution: o * ln(o / e), or 0 for an empty/negative cell.
  auto cell = [](double observed, double expected) -> double {
    if (observed > 0) {
      return observed * log(observed / expected);
    }
    return 0;
  };

  return (2 * (cell(o11, e11) + cell(o12, e12) + cell(o21, e21) + cell(o22, e22)));
}
Marc Kupietz4b799e92018-01-02 11:04:56 +0100176
Marc Kupietz41880452019-01-22 15:29:06 +0100177
// Dice coefficient of the pair: 2 * f(w1,w2) / (f(w2) + f(w1) * window_size).
// `n` is accepted for signature parity with the other measures but not used.
static inline double ca_dice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
  const double row_total = (double) w1 * window_size;
  const double col_total = w2;
  const double denominator = col_total + row_total;
  return 2 * w12 / denominator;
}
184
// logDice association score: 14 + log2(Dice).
// Rychlý, Pavel (2008): A lexicographer-friendly association score.
// In Proceedings of Recent Advances in Slavonic Natural Language Processing, RASLAN, 6–9.
// http://www.fi.muni.cz/usr/sojka/download/raslan2008/13.pdf
//
// `n` is accepted for signature parity with the other measures but not used.
static inline double ca_logdice(uint64_t w1, uint64_t w2, uint64_t w12, uint64_t n, uint64_t window_size) {
  const double row_total = (double) w1 * window_size;
  const double col_total = w2;
  const double dice = 2 * w12 / (col_total + row_total);
  return 14 + log2(dice);
}
192
Marc Kupietz4b799e92018-01-02 11:04:56 +0100193 class CountMergeOperator : public AssociativeMergeOperator {
194 public:
195 CountMergeOperator() {
196 mergeOperator_ = MergeOperators::CreateUInt64AddOperator();
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100197 }
198
Marc Kupietz4b799e92018-01-02 11:04:56 +0100199 virtual bool Merge(const Slice& key,
200 const Slice* existing_value,
201 const Slice& value,
202 std::string* new_value,
203 Logger* logger) const override {
204 assert(new_value->empty());
205 ++num_merge_operator_calls;
206 if (existing_value == nullptr) {
207 new_value->assign(value.data(), value.size());
208 return true;
209 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100210
Marc Kupietz4b799e92018-01-02 11:04:56 +0100211 return mergeOperator_->PartialMerge(
212 key,
213 *existing_value,
214 value,
215 new_value,
216 logger);
217 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100218
Marc Kupietz4b799e92018-01-02 11:04:56 +0100219 virtual const char* Name() const override {
220 return "UInt64AddOperator";
221 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100222
Marc Kupietz4b799e92018-01-02 11:04:56 +0100223 private:
224 std::shared_ptr<MergeOperator> mergeOperator_;
225 };
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100226
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100227
Marc Kupietz4b799e92018-01-02 11:04:56 +0100228 class CollocatorIterator : public Iterator {
229 private:
230 char prefixc[sizeof(uint64_t)];
231 Iterator *base_iterator_;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100232
233
Marc Kupietz4b799e92018-01-02 11:04:56 +0100234 public:
235 CollocatorIterator(Iterator* base_iterator)
236 : base_iterator_(base_iterator)
237 {}
238
Marc Kupietz4b799e92018-01-02 11:04:56 +0100239 void setPrefix(char *prefix) {
240 memcpy(prefixc, prefix, sizeof(uint64_t));
241 }
242
243 virtual void SeekToFirst() { base_iterator_->SeekToFirst(); }
244 virtual void SeekToLast() { base_iterator_->SeekToLast(); }
245 virtual void Seek(const rocksdb::Slice& s) { base_iterator_->Seek(s); }
Marc Kupietz60d10512021-03-13 18:05:14 +0100246 virtual void SeekForPrev(const rocksdb::Slice& s) { base_iterator_->SeekForPrev(s); }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100247 virtual void Prev() { base_iterator_->Prev(); }
248 virtual void Next() { base_iterator_->Next(); }
249 virtual Slice key() const;
250 virtual Slice value() const;
251 virtual Status status() const;
252 virtual bool Valid() const;
253 bool isValid();
254 uint64_t intValue();
255 uint64_t intKey();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100256
Marc Kupietz4b799e92018-01-02 11:04:56 +0100257 };
Marc Kupietz18375e12017-12-24 10:11:18 +0100258
Marc Kupietz4b799e92018-01-02 11:04:56 +0100259 // rocksdb::CollocatorIterator::CollocatorIterator(Iterator* base_iterator) {}
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100260
Marc Kupietz4b799e92018-01-02 11:04:56 +0100261 bool rocksdb::CollocatorIterator::Valid() const {
Marc Kupietz18375e12017-12-24 10:11:18 +0100262 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100263 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100264
Marc Kupietz4b799e92018-01-02 11:04:56 +0100265 bool rocksdb::CollocatorIterator::isValid() {
266 return base_iterator_->Valid() && key().starts_with(std::string(prefixc,3));
Marc Kupietzd31254c2018-01-20 21:29:30 +0100267 // return key().starts_with(std::string(prefixc,3));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100268 }
Marc Kupietz18375e12017-12-24 10:11:18 +0100269
Marc Kupietz4b799e92018-01-02 11:04:56 +0100270 uint64_t rocksdb::CollocatorIterator::intKey() {
271 return DecodeFixed64(base_iterator_->key().data());
272 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100273
Marc Kupietz4b799e92018-01-02 11:04:56 +0100274 uint64_t rocksdb::CollocatorIterator::intValue() {
275 return DecodeFixed64(base_iterator_->value().data());
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100276 }
277
// One vocabulary line: a word form and its corpus frequency.
// Kept as an aggregate (word first, then freq) so `{word, freq}` initialisation works.
struct VocabEntry {
  std::string word;  // word form
  uint64_t freq;     // corpus frequency of the word
};
283
Marc Kupietz6aec7682018-01-10 09:47:48 +0100284 class CollocatorDB {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100285 private:
286 WriteOptions merge_option_; // for merge
287 char _one[sizeof(uint64_t)];
288 Slice _one_slice;
Marc Kupietz37359b12018-01-09 21:11:37 +0100289 vector<VocabEntry> _vocab;
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100290 uint64_t total = 0;
291 uint64_t sentences = 0;
Marc Kupietz8cf7e912019-01-21 17:05:23 +0100292 float avg_window_size = 8.0;
Marc Kupietz37359b12018-01-09 21:11:37 +0100293
Marc Kupietz4b799e92018-01-02 11:04:56 +0100294 protected:
295 std::shared_ptr<DB> db_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100296
Marc Kupietz4b799e92018-01-02 11:04:56 +0100297 WriteOptions put_option_;
298 ReadOptions get_option_;
299 WriteOptions delete_option_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100300
Marc Kupietz4b799e92018-01-02 11:04:56 +0100301 uint64_t default_;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100302
Marc Kupietz4b799e92018-01-02 11:04:56 +0100303 std::shared_ptr<DB> OpenDb(const char *dbname);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100304 std::shared_ptr<DB> OpenDbForRead(const char *dbname);
Marc Kupietz37359b12018-01-09 21:11:37 +0100305 void read_vocab(string fname);
306
Marc Kupietz4b799e92018-01-02 11:04:56 +0100307 public:
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200308 string getWord(uint32_t w1);
Marc Kupietz6aec7682018-01-10 09:47:48 +0100309 CollocatorDB(const char *db_name, bool read_only);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100310
Marc Kupietz6aec7682018-01-10 09:47:48 +0100311 // public interface of CollocatorDB.
Marc Kupietz4b799e92018-01-02 11:04:56 +0100312 // All four functions return false
313 // if the underlying level db operation failed.
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100314
Marc Kupietz4b799e92018-01-02 11:04:56 +0100315 // mapped to a levedb Put
316 bool set(const std::string& key, uint64_t value) {
317 // just treat the internal rep of int64 as the string
318 char buf[sizeof(value)];
319 EncodeFixed64(buf, value);
320 Slice slice(buf, sizeof(value));
321 auto s = db_->Put(put_option_, key, slice);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100322
Marc Kupietz4b799e92018-01-02 11:04:56 +0100323 if (s.ok()) {
324 return true;
325 } else {
326 std::cerr << s.ToString() << std::endl;
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100327 return false;
328 }
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100329 }
Marc Kupietz4b799e92018-01-02 11:04:56 +0100330
331 DB *getDb() {
332 return db_.get();
333 }
334
335 // mapped to a rocksdb Delete
336 bool remove(const std::string& key) {
337 auto s = db_->Delete(delete_option_, key);
338
339 if (s.ok()) {
340 return true;
341 } else {
342 std::cerr << s.ToString() << std::endl;
343 return false;
344 }
345 }
346
347 // mapped to a rocksdb Get
348 bool get(const std::string& key, uint64_t* value) {
349 std::string str;
350 auto s = db_->Get(get_option_, key, &str);
351
352 if (s.IsNotFound()) {
353 // return default value if not found;
354 *value = default_;
355 return true;
356 } else if (s.ok()) {
357 // deserialization
358 if (str.size() != sizeof(uint64_t)) {
359 std::cerr << "value corruption\n";
360 return false;
361 }
362 *value = DecodeFixed64(&str[0]);
363 return true;
364 } else {
365 std::cerr << s.ToString() << std::endl;
366 return false;
367 }
368 }
369
370
371 uint64_t get(const uint32_t w1, const uint32_t w2, const int8_t dist) {
372 char encoded_key[sizeof(uint64_t)];
373 EncodeFixed64(encoded_key, encodeCollocation(w1,w2,dist));
374 uint64_t value = default_;
375 get(std::string(encoded_key, 8), &value);
376 return value;
377 }
378
379 virtual void inc(const std::string& key) {
380 db_->Merge(merge_option_, key, _one_slice);
381 }
382
383 void inc(const uint64_t key) {
384 char encoded_key[sizeof(uint64_t)];
385 EncodeFixed64(encoded_key, key);
386 db_->Merge(merge_option_, std::string(encoded_key, 8), _one_slice);
387 }
388
389 virtual void inc(const uint32_t w1, const uint32_t w2, const uint8_t dist);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100390 void dump(uint32_t w1, uint32_t w2, int8_t dist);
Marc Kupietz37359b12018-01-09 21:11:37 +0100391 vector<Collocator> get_collocators(uint32_t w1);
Marc Kupietzbd966192018-10-13 14:14:37 +0200392 vector<Collocator> get_collocators(uint32_t w1, uint32_t max_w2);
Marc Kupietz88d116b2021-03-13 18:05:14 +0100393 vector<Collocator> get_collocation_scores(uint32_t w1, uint32_t w2);
394 vector<Collocator> get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2);
Marc Kupietz8c62c372019-01-31 12:21:01 +0100395 void applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow, const uint64_t sum, const int usedPositions, int true_window_size, rocksdb::Collocator *result);
396
Marc Kupietz3400aa52018-06-05 10:28:55 +0200397 void dumpSparseLlr(uint32_t w1, uint32_t min_cooccur);
Marc Kupietze9627152019-02-04 12:32:12 +0100398 string collocators2json(uint32_t w1, vector<Collocator> collocators);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100399
Marc Kupietz4b799e92018-01-02 11:04:56 +0100400 // mapped to a rocksdb Merge operation
401 virtual bool add(const std::string& key, uint64_t value) {
402 char encoded[sizeof(uint64_t)];
403 EncodeFixed64(encoded, value);
404 Slice slice(encoded, sizeof(uint64_t));
405 auto s = db_->Merge(merge_option_, key, slice);
406
407 if (s.ok()) {
408 return true;
409 } else {
410 std::cerr << s.ToString() << std::endl;
411 return false;
412 }
413 }
414
415 CollocatorIterator* SeekIterator(uint64_t w1, uint64_t w2, int8_t dist);
416 };
417
Marc Kupietz6aec7682018-01-10 09:47:48 +0100418 rocksdb::CollocatorDB::CollocatorDB(const char *db_name, bool read_only = false) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100419 // merge_option_.sync = true;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100420 if(read_only)
Marc Kupietz88d116b2021-03-13 18:05:14 +0100421 db_ = OpenDbForRead(strdup(db_name));
Marc Kupietz6bb27762018-01-09 17:53:01 +0100422 else
423 db_ = OpenDb(db_name);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100424 assert(db_);
425 uint64_t one = 1;
426 EncodeFixed64(_one, one);
427 _one_slice = Slice(_one, sizeof(uint64_t));
428 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100429
Marc Kupietz6aec7682018-01-10 09:47:48 +0100430 void rocksdb::CollocatorDB::inc(const uint32_t w1, const uint32_t w2, const uint8_t dist) {
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100431 inc(encodeCollocation(w1, w2, dist));
432 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100433
Marc Kupietz6aec7682018-01-10 09:47:48 +0100434 void rocksdb::CollocatorDB::read_vocab(string fname) {
Marc Kupietz37359b12018-01-09 21:11:37 +0100435 char strbuf[2048];
436 uint64_t freq;
437 FILE *fin = fopen(fname.c_str(), "rb");
438 if (fin == NULL) {
439 cout << "Vocabulary file " << fname <<" not found\n";
440 exit(1);
441 }
442 uint64_t i = 0;
443 while(!feof(fin)) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100444 fscanf(fin, "%s %lu", strbuf, &freq);
Marc Kupietz37359b12018-01-09 21:11:37 +0100445 _vocab.push_back({strbuf, freq});
446 total += freq;
447 i++;
448 }
449 fclose(fin);
Marc Kupietz4ec51c12019-01-21 11:06:39 +0100450
451 char size_fname[256];
452 strcpy(size_fname, fname.c_str());
453 char *pos = strstr(size_fname, ".vocab");
454 if(pos) {
455 *pos=0;
456 strcat(size_fname, ".size");
457 FILE *fp = fopen(size_fname, "r");
458 if (fp != NULL) {
459 fscanf(fp, "%lu", &sentences);
460 fscanf(fp, "%lu", &total);
461 float sl = (float)total/(float)sentences;
462 float w = WINDOW_SIZE;
463 avg_window_size = ((sl > 2*w? (sl-2*w)*2*w: 0) + (double) w * (3*w -1)) / sl;
464 fprintf(stdout, "Size corrections found: corpus size: %lu tokens in %lu sentences, avg. sentence size: %f, avg. window size: %f\n", total, sentences, sl, avg_window_size);
465 fclose(fp);
466 } else {
467 std::cout << "size file " << size_fname << " not found\n";
468 }
469 } else {
470 std::cout << "cannot determine size file " << size_fname << "\n";
471 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100472 }
473
Marc Kupietz6aec7682018-01-10 09:47:48 +0100474 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDbForRead(const char *name) {
Marc Kupietz6bb27762018-01-09 17:53:01 +0100475 DB* db;
476 Options options;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100477 options.env->SetBackgroundThreads(4);
478 options.create_if_missing = true;
479 options.merge_operator = std::make_shared<CountMergeOperator>();
480 options.max_successive_merges = 0;
481 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
482 options.IncreaseParallelism();
483 options.OptimizeLevelStyleCompaction();
484 options.prefix_extractor.reset(NewFixedPrefixTransform(3));
Marc Kupietz37359b12018-01-09 21:11:37 +0100485 ostringstream dbname, vocabname;
Marc Kupietz6bb27762018-01-09 17:53:01 +0100486 dbname << name << ".rocksdb";
487 auto s = DB::OpenForReadOnly(options, dbname.str(), &db);
488 if (!s.ok()) {
489 std::cerr << s.ToString() << std::endl;
490 assert(false);
491 }
Marc Kupietz37359b12018-01-09 21:11:37 +0100492 vocabname << name << ".vocab";
493 read_vocab(vocabname.str());
Marc Kupietz6bb27762018-01-09 17:53:01 +0100494 return std::shared_ptr<DB>(db);
495 }
496
Marc Kupietz6aec7682018-01-10 09:47:48 +0100497 std::shared_ptr<DB> rocksdb::CollocatorDB::OpenDb(const char *dbname) {
Marc Kupietz4b799e92018-01-02 11:04:56 +0100498 DB* db;
499 Options options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100500
501
502 options.env->SetBackgroundThreads(4);
Marc Kupietz4b799e92018-01-02 11:04:56 +0100503 options.create_if_missing = true;
504 options.merge_operator = std::make_shared<CountMergeOperator>();
505 options.max_successive_merges = 0;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100506 // options.prefix_extractor.reset(NewFixedPrefixTransform(8));
507 options.IncreaseParallelism();
508 options.OptimizeLevelStyleCompaction();
509 // options.max_write_buffer_number = 48;
510 // options.max_background_jobs = 48;
511 // options.allow_concurrent_memtable_write=true;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100512 // options.memtable_factory.reset(rocksdb::NewHashLinkListRepFactory(200000));
513 // options.enable_write_thread_adaptive_yield = 1;
514 // options.allow_concurrent_memtable_write = 1;
515 // options.memtable_factory.reset(new rocksdb::SkipListFactory);
516 // options.write_buffer_size = 1 << 22;
517 // options.allow_mmap_reads = true;
518 // options.allow_mmap_writes = true;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100519 // options.max_background_compactions = 40;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100520 // BlockBasedTableOptions table_options;
521 // table_options.filter_policy.reset(NewBloomFilterPolicy(24, false));
522 // options.bloom_locality = 1;
523 // std::shared_ptr<Cache> cache = NewLRUCache(512 * 1024 * 1024);
524 // table_options.block_cache = cache;
525 // options.table_factory.reset(NewBlockBasedTableFactory(table_options));
Marc Kupietz4b799e92018-01-02 11:04:56 +0100526 Status s;
527 // DestroyDB(dbname, Options());
528 s = DB::Open(options, dbname, &db);
529 if (!s.ok()) {
530 std::cerr << s.ToString() << std::endl;
531 assert(false);
532 }
533 return std::shared_ptr<DB>(db);
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100534 }
535
Marc Kupietz6aec7682018-01-10 09:47:48 +0100536 CollocatorIterator* rocksdb::CollocatorDB::SeekIterator(uint64_t w1, uint64_t w2, int8_t dist) {
Marc Kupietz18375e12017-12-24 10:11:18 +0100537 ReadOptions options;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100538 options.prefix_same_as_start = true;
Marc Kupietz18375e12017-12-24 10:11:18 +0100539 char prefixc[sizeof(uint64_t)];
540 EncodeFixed64(prefixc, encodeCollocation(w1, w2, dist));
541 Iterator *it = db_->NewIterator(options);
542 CollocatorIterator *cit = new CollocatorIterator(it);
Marc Kupietz88d116b2021-03-13 18:05:14 +0100543 if (w2 > 0)
544 cit->Seek(std::string(prefixc, 6));
545 else
546 cit->Seek(std::string(prefixc, 3));
Marc Kupietz18375e12017-12-24 10:11:18 +0100547 cit->setPrefix(prefixc);
548 return cit;
549 }
550
Marc Kupietz6aec7682018-01-10 09:47:48 +0100551 void rocksdb::CollocatorDB::dump(uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100552 auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, w2, dist));
553 for (; it->isValid(); it->Next()) {
554 uint64_t value = it->intValue();
555 uint64_t key = it->intKey();
556 std::cout << "w1:" << W1(key) << ", w2:" << W2(key) << ", dist:" << (int32_t) DIST(key) << " - count:" << value << std::endl;
557 }
558 std::cout << "ready dumping\n";
559 }
560
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100561 bool sortByNpmi(const Collocator &lhs, const Collocator &rhs) { return lhs.npmi > rhs.npmi; }
562 bool sortByLfmd(const Collocator &lhs, const Collocator &rhs) { return lhs.lfmd > rhs.lfmd; }
Marc Kupietzd31254c2018-01-20 21:29:30 +0100563 bool sortByLlr(const Collocator &lhs, const Collocator &rhs) { return lhs.llr > rhs.llr; }
Marc Kupietz7e3dfde2019-01-22 16:27:33 +0100564 bool sortByLogDice(const Collocator &lhs, const Collocator &rhs) { return lhs.logdice > rhs.logdice; }
Marc Kupietz3203e4c2019-02-04 12:42:45 +0100565 bool sortByLogDiceAF(const Collocator &lhs, const Collocator &rhs) { return lhs.ldaf > rhs.ldaf; }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100566
Marc Kupietz8c62c372019-01-31 12:21:01 +0100567
568 void rocksdb::CollocatorDB::applyCAMeasures(const uint32_t w1, const uint32_t w2, uint64_t *sumWindow,
569 const uint64_t sum, const int usedPositions, int true_window_size, rocksdb::Collocator *result) {
570 uint64_t f1 = _vocab[w1].freq, f2 = _vocab[w2].freq;
571 double o = sum,
572 r1 = f1 * true_window_size,
573 c1 = f2,
574 e = r1 * c1 / total,
575 pmi = log2(o/e),
576 md = log2(o*o/e),
577 lfmd = log2(o*o*o/e),
578 llr = ca_ll(f1, f2, sum, total, true_window_size);
579 double ld = ca_logdice(f1, f2, sum, total, true_window_size);
580
581 int bestWindow = usedPositions;
582 double bestAF = ld;
583 double currentAF;
584 // if(f1<75000000)
585 //#pragma omp parallel for reduction(max:bestAF)
Marc Kupietz6d0fa542021-02-26 09:24:35 +0100586 // #pragma omp target teams distribute parallel for reduction(max:bestAF) map(tofrom:bestAF,currentAF,bestWindow,usedPositions)
Marc Kupietz8c62c372019-01-31 12:21:01 +0100587 for (int bitmask=1; bitmask < (1 << (2*WINDOW_SIZE)); bitmask++) {
588 if((bitmask & usedPositions) == 0 || (bitmask & ~usedPositions) > 0) continue;
589 uint64_t currentWindowSum=0;
Marc Kupietz6d0fa542021-02-26 09:24:35 +0100590 // #pragma omp target teams distribute parallel for reduction(+:currentWindowSum) map(tofrom:bitmask,usedPositions)
Marc Kupietz8c62c372019-01-31 12:21:01 +0100591 for (int pos=0; pos < 2*WINDOW_SIZE; pos++) {
592 if (((1<<pos) & bitmask & usedPositions) != 0)
593 currentWindowSum+=sumWindow[pos];
594 }
595 currentAF = ca_logdice(f1, f2, currentWindowSum, total, __builtin_popcount(bitmask));
596 if(currentAF > bestAF) {
597 bestAF = currentAF;
598 bestWindow = bitmask;
599 }
600 }
601
602 *result = {w2, f2, sum,
603 pmi, pmi / (-log2(o/total/true_window_size)),
604 llr, lfmd, md,
Marc Kupietz6d9221d2021-02-26 09:34:40 +0100605 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE], total, 1),
606 ca_pmi(f1, f2, sumWindow[WINDOW_SIZE-1], total, 1),
607 (double)sumWindow[WINDOW_SIZE],
608 (double)sumWindow[WINDOW_SIZE-1],
Marc Kupietz8c62c372019-01-31 12:21:01 +0100609 ca_dice(f1, f2, sum, total, true_window_size),
610 ld,
611 bestAF,
612 usedPositions,
613 bestWindow
614 };
615
616 }
617
Marc Kupietz88d116b2021-03-13 18:05:14 +0100618 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1, uint32_t min_w2, uint32_t max_w2) {
Marc Kupietz75af60f2019-01-22 22:34:29 +0100619 std::vector<Collocator> collocators;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100620 uint64_t w2, last_w2 = 0xffffffffffffffff;
Marc Kupietz8c62c372019-01-31 12:21:01 +0100621 uint64_t maxv = 0, sum = 0;
622 uint64_t *sumWindow = (uint64_t*) malloc(sizeof(uint64_t)*2*WINDOW_SIZE);
623 memset(sumWindow, 0, sizeof(uint64_t)*2*WINDOW_SIZE);
Marc Kupietzade33222019-01-22 22:52:44 +0100624 int true_window_size = 1;
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100625 int usedPositions=0;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100626
Marc Kupietz6d0fa542021-02-26 09:24:35 +0100627 if(w1 > _vocab.size()) {
628 std::cout << w1 << "> vocabulary size " << _vocab.size() << "\n";
629 w1 -= _vocab.size();
630 }
631 #ifdef DEBUG
632 std::cout << "Searching for collocates of " << _vocab[w1].word << "\n";
633 #endif
634 // #pragma omp parallel num_threads(40)
635 // #pragma omp single
Marc Kupietz88d116b2021-03-13 18:05:14 +0100636 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, min_w2, 0)); it->isValid(); it->Next()) {
Marc Kupietzd31254c2018-01-20 21:29:30 +0100637 uint64_t value = it->intValue(),
638 key = it->intKey();
Marc Kupietzbd966192018-10-13 14:14:37 +0200639 if((w2 = W2(key)) > max_w2)
640 continue;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100641 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
642 if (w2 != last_w2) {
Marc Kupietz75af60f2019-01-22 22:34:29 +0100643 if (sum >= FREQUENCY_THRESHOLD) {
Marc Kupietz8c62c372019-01-31 12:21:01 +0100644 collocators.push_back({});
645 rocksdb::Collocator *result = &(collocators[collocators.size()-1]);
Marc Kupietz6d0fa542021-02-26 09:24:35 +0100646 // #pragma omp task firstprivate(last_w2, sumWindow, sum, usedPositions, true_window_size) shared(w1, result) if(sum > 1000000)
Marc Kupietz8c62c372019-01-31 12:21:01 +0100647 {
648 // uint64_t *nsw = (uint64_t *)malloc(sizeof(uint64_t) * 2 *WINDOW_SIZE);
649 // memcpy(nsw, sumWindow, sizeof(uint64_t) * 2 *WINDOW_SIZE);
650 applyCAMeasures(w1, last_w2, sumWindow, sum, usedPositions, true_window_size, result);
651 // free(nsw);
Marc Kupietz75af60f2019-01-22 22:34:29 +0100652 }
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100653 }
Marc Kupietz75af60f2019-01-22 22:34:29 +0100654 memset(sumWindow, 0, 2*WINDOW_SIZE * sizeof(uint64_t));
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100655 usedPositions = 1 << (-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0));
Marc Kupietz75af60f2019-01-22 22:34:29 +0100656 sumWindow[-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0)] = value;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100657 last_w2 = w2;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100658 maxv = value;
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100659 sum = value;
Marc Kupietzade33222019-01-22 22:52:44 +0100660 true_window_size = 1;
Marc Kupietz88d116b2021-03-13 18:05:14 +0100661 if (min_w2 == max_w2 && w2 != min_w2)
662 break;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100663 } else {
Marc Kupietz98cbcdc2019-01-21 17:11:27 +0100664 sum += value;
Marc Kupietz8e0ebea2018-01-24 09:53:26 +0100665 if(value > maxv)
666 maxv = value;
Marc Kupietz39a4fd02019-01-23 10:18:43 +0100667 usedPositions |= 1 << (-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0));
Marc Kupietz75af60f2019-01-22 22:34:29 +0100668 sumWindow[-DIST(key)+WINDOW_SIZE-(DIST(key)<0?1:0)] = value;
Marc Kupietzade33222019-01-22 22:52:44 +0100669 true_window_size++;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100670 }
671 }
672
Marc Kupietz6d0fa542021-02-26 09:24:35 +0100673 // #pragma omp taskwait
674 sort(collocators.begin(), collocators.end(), sortByLogDiceAF);
Marc Kupietz8c62c372019-01-31 12:21:01 +0100675
Marc Kupietz30e4d5f2021-02-26 09:27:49 +0100676 #ifdef DEBUG
Marc Kupietzd31254c2018-01-20 21:29:30 +0100677 int i=0;
678 for (Collocator c : collocators) {
679 if(i++>10) break;
Marc Kupietz8c62c372019-01-31 12:21:01 +0100680 std::cout << "w1:" << _vocab[w1].word << ", w2: *" << _vocab[c.w2].word << "*"
Marc Kupietzd31254c2018-01-20 21:29:30 +0100681 << "\t f(w1):" << _vocab[w1].freq
682 << "\t f(w2):" << _vocab[c.w2].freq
Marc Kupietz51f93792018-01-25 08:51:01 +0100683 << "\t f(w1, w2):" << c.raw
Marc Kupietzd31254c2018-01-20 21:29:30 +0100684 << "\t pmi:" << c.pmi
685 << "\t npmi:" << c.npmi
686 << "\t llr:" << c.llr
Marc Kupietz8c62c372019-01-31 12:21:01 +0100687 << "\t md:" << c.md
Marc Kupietzd31254c2018-01-20 21:29:30 +0100688 << "\t lfmd:" << c.lfmd
Marc Kupietzd31254c2018-01-20 21:29:30 +0100689 << "\t total:" << total
690 << std::endl;
691 }
Marc Kupietz30e4d5f2021-02-26 09:27:49 +0100692 #endif
Marc Kupietz8c62c372019-01-31 12:21:01 +0100693
694 return collocators;
Marc Kupietzd31254c2018-01-20 21:29:30 +0100695 }
696
Marc Kupietz88d116b2021-03-13 18:05:14 +0100697
698 std::vector<Collocator> rocksdb::CollocatorDB::get_collocation_scores(uint32_t w1, uint32_t w2) {
699 return get_collocators(w1, w2, w2);
700 }
701
Marc Kupietz8c62c372019-01-31 12:21:01 +0100702 std::vector<Collocator> rocksdb::CollocatorDB::get_collocators(uint32_t w1) {
Marc Kupietz88d116b2021-03-13 18:05:14 +0100703 return get_collocators(w1, 0, UINT32_MAX);
Marc Kupietzbd966192018-10-13 14:14:37 +0200704 }
705
Marc Kupietz3400aa52018-06-05 10:28:55 +0200706 void rocksdb::CollocatorDB::dumpSparseLlr(uint32_t w1, uint32_t min_cooccur) {
707 std::vector<Collocator> collocators;
708 std::stringstream stream;
709 uint64_t w2, last_w2 = 0xffffffffffffffff;
710 uint64_t maxv = 0, total_w1 = 0;
711 bool first = true;
712 for ( auto it = std::unique_ptr<CollocatorIterator>(SeekIterator(w1, 0, 0)); it->isValid(); it->Next()) {
713 uint64_t value = it->intValue(),
714 key = it->intKey();
715 w2 = W2(key);
716 total_w1 += value;
717 if(last_w2 == 0xffffffffffffffff) last_w2 = w2;
718 if (w2 != last_w2) {
719 if(maxv >= min_cooccur) {
Marc Kupietzbbd236e2019-01-21 16:50:19 +0100720 double llr = ca_ll(_vocab[w1].freq, _vocab[last_w2].freq, maxv, total, 1);
Marc Kupietz3400aa52018-06-05 10:28:55 +0200721 if(first)
722 first = false;
723 else
724 stream << " ";
725 stream << w2 << " " << llr;
726 }
727 last_w2 = w2;
728 maxv = value;
729 } else {
730 if(value > maxv)
731 maxv = value;
732 }
733 }
734 if(first)
735 stream << "1 0.0";
736 stream << "\n";
737 std::cout << stream.str();
738 }
739
Marc Kupietz4b799e92018-01-02 11:04:56 +0100740 rocksdb::Slice rocksdb::CollocatorIterator::key() const { return base_iterator_->key(); }
741 rocksdb::Slice rocksdb::CollocatorIterator::value() const { return base_iterator_->value(); }
742 rocksdb::Status rocksdb::CollocatorIterator::status() const { return base_iterator_->status(); }
743
Marc Kupietz28cc53e2017-12-23 17:24:55 +0100744};
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100745
Marc Kupietz4a5e08a2018-06-05 11:07:11 +0200746string rocksdb::CollocatorDB::getWord(uint32_t w1) {
747 return _vocab[w1].word;
748}
749
Marc Kupietze9627152019-02-04 12:32:12 +0100750string rocksdb::CollocatorDB::collocators2json(uint32_t w1, vector<Collocator> collocators) {
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100751 ostringstream s;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100752 int i = 0;
Marc Kupietze9627152019-02-04 12:32:12 +0100753 s << " { \"f1\": " << _vocab[w1].freq << "," <<
754 "\"w1\":\"" << string(_vocab[w1].word) << "\", " <<
755 "\"N\": " << total << ", " <<
756 "\"collocates\": [";
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100757 bool first = true;
758 for (Collocator c : collocators) {
Marc Kupietzb999ec52018-06-05 11:20:46 +0200759 if(strncmp(_vocab[c.w2].word.c_str(), "quot", 4) == 0) continue;
Marc Kupietz0dd86ef2018-01-11 22:23:17 +0100760 if (i++ > 200)
761 break;
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100762 if(!first)
763 s << ",\n";
764 else
765 first = false;
766 s << "{"
Marc Kupietz7d9558f2019-01-22 16:26:50 +0100767 "\"word\":\"" << (string(_vocab[c.w2].word).compare("<num>") == 0? string("###") : string(_vocab[c.w2].word)) << "\"," <<
Marc Kupietzcc6c4592019-01-23 10:11:23 +0100768 "\"f2\":" << c.f2 << "," <<
Marc Kupietz51f93792018-01-25 08:51:01 +0100769 "\"f\":" << c.raw << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100770 "\"npmi\":" << c.npmi << "," <<
Marc Kupietz41880452019-01-22 15:29:06 +0100771 "\"pmi\":" << c.pmi << "," <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100772 "\"llr\":" << c.llr << "," <<
773 "\"lfmd\":" << c.lfmd << "," <<
Marc Kupietz41880452019-01-22 15:29:06 +0100774 "\"md\":" << c.md << "," <<
775 "\"dice\":" << c.dice << "," <<
776 "\"ld\":" << c.logdice << "," <<
Marc Kupietz6702e042021-03-13 18:05:14 +0100777 "\"lncount\":" << c.left_raw << "," <<
778 "\"rncount\":" << c.right_raw << "," <<
779 "\"lnpmi\":" << c.left_pmi << "," <<
780 "\"rnpmi\":" << c.right_pmi << "," <<
Marc Kupietz3203e4c2019-02-04 12:42:45 +0100781 "\"af\":" << c.ldaf << "," <<
Marc Kupietze9f58932019-01-24 15:12:59 +0100782 "\"win\":" << c.window << "," <<
783 "\"afwin\":" << c.af_window <<
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100784 "}";
785 }
Marc Kupietze9627152019-02-04 12:32:12 +0100786 s << "]}\n";
Marc Kupietz8c62c372019-01-31 12:21:01 +0100787 std::cout << s.str();
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100788 return s.str();
789}
790
Marc Kupietz6aec7682018-01-10 09:47:48 +0100791typedef rocksdb::CollocatorDB COLLOCATORS;
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100792
793extern "C" {
Marc Kupietz6aec7682018-01-10 09:47:48 +0100794 COLLOCATORS *open_collocatordb_for_write(char *dbname) {
795 return new rocksdb::CollocatorDB(dbname, false);
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100796 }
797
Marc Kupietz6aec7682018-01-10 09:47:48 +0100798 COLLOCATORS *open_collocatordb(char *dbname) {
799 return new rocksdb::CollocatorDB(dbname, true);
Marc Kupietz6bb27762018-01-09 17:53:01 +0100800 }
801
Marc Kupietz6aec7682018-01-10 09:47:48 +0100802 void inc_collocator(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100803 db->inc(w1, w2, dist);
804 }
805
806 void dump_collocators(COLLOCATORS *db, uint32_t w1, uint32_t w2, int8_t dist) {
807 db->dump(w1, w2, dist);
808 }
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100809
Marc Kupietz37359b12018-01-09 21:11:37 +0100810 void get_collocators(COLLOCATORS *db, uint32_t w1) {
811 db->get_collocators(w1);
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100812 }
813
Marc Kupietz88d116b2021-03-13 18:05:14 +0100814 void get_collocation_scores(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
815 db->get_collocation_scores(w1, w2);
816 }
817
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200818 const char *get_word(COLLOCATORS *db, uint32_t w) {
Marc Kupietz88d116b2021-03-13 18:05:14 +0100819 return strdup(db->getWord(w).c_str());
Marc Kupietzca3a52e2018-06-05 14:16:23 +0200820 }
821
Marc Kupietz37359b12018-01-09 21:11:37 +0100822 const char *get_collocators_as_json(COLLOCATORS *db, uint32_t w1) {
Marc Kupietze9627152019-02-04 12:32:12 +0100823 return strdup(db->collocators2json(w1, db->get_collocators(w1)).c_str());
Marc Kupietzc8ddf452018-01-07 21:33:12 +0100824 }
Marc Kupietz88d116b2021-03-13 18:05:14 +0100825
826 const char *get_collocation_scores_as_json(COLLOCATORS *db, uint32_t w1, uint32_t w2) {
827 return strdup(db->collocators2json(w1, db->get_collocation_scores(w1, w2)).c_str());
828 }
Marc Kupietz06c9a9f2018-01-02 16:56:43 +0100829}